diff --git a/.github/workflows/ci_sim.yml b/.github/workflows/ci_sim.yml index f6bcf2607..a4ee0e2d0 100644 --- a/.github/workflows/ci_sim.yml +++ b/.github/workflows/ci_sim.yml @@ -210,6 +210,7 @@ jobs: rm -rf "${PYPTO_WORKSPACE}" rm -rf "${PYPTO_RUN_WORKSPACE}" rm -rf "${PTO_ISA_ROOT}" + rm -rf "${GITHUB_WORKSPACE}/.work/camodel" - name: Prepare LLVM source shell: bash @@ -453,6 +454,23 @@ jobs: run_pypto_fa_ptoas_smoke a2a3sim run_pypto_int8_codegen_smoke a2a3sim + - name: Prepare quiet camodel + shell: bash + run: | + set -euo pipefail + readarray -t camodel_candidates < <( + find "${ASCEND_HOME_PATH}" -type d -path '*/simulator/dav_3510/lib' | sort + ) + if [[ "${#camodel_candidates[@]}" -eq 0 ]]; then + echo "ERROR: cannot find dav_3510 camodel lib under ${ASCEND_HOME_PATH}" >&2 + exit 1 + fi + SIM_LIB_DIR="$(python3 scripts/prepare_quiet_camodel.py \ + --source-dir "${camodel_candidates[0]}" \ + --output-dir "${GITHUB_WORKSPACE}/.work/camodel")" + echo "SIM_LIB_DIR=${SIM_LIB_DIR}" >> "${GITHUB_ENV}" + echo "SIM_LIB_DIR=${SIM_LIB_DIR}" + - name: Run VPTO SIM validation if: ${{ true }} shell: bash diff --git a/scripts/prepare_quiet_camodel.py b/scripts/prepare_quiet_camodel.py new file mode 100755 index 000000000..9f05fe2d0 --- /dev/null +++ b/scripts/prepare_quiet_camodel.py @@ -0,0 +1,103 @@ +#!/usr/bin/env python3 +# Copyright (c) 2026 Huawei Technologies Co., Ltd. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. 
"""Prepare a shared quiet camodel directory.

The output directory is a simulator lib directory view: non-config entries are
symlinked from the original camodel lib directory, and config.json is copied and
patched to reduce simulator log/dump I/O.
"""

import argparse
import fcntl
import json
import os
import shutil
import sys

# Bookkeeping files created inside the output directory.
_LOCK_NAME = ".quiet-camodel.lock"
_SOURCE_MARKER_NAME = ".quiet-camodel-source"
_CONFIG_NAME = "config.json"

# Value written to every log-level knob below.
# NOTE(review): presumably the simulator's least verbose level - confirm.
_QUIET_LEVEL = 6
_WRAPPER_LEVEL_KEYS = (
    "adapter_log_file_level",
    "aic_wrap_log_file_level",
    "cosim_log_file_level",
    "cosim_log_flush_level",
    "cosim_log_scr_level",
)


def parse_args():
    """Parse the command line (``--source-dir`` and ``--output-dir``, both required)."""
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument("--source-dir", required=True, help="Original camodel simulator lib directory.")
    parser.add_argument("--output-dir", required=True, help="Quiet camodel output directory.")
    return parser.parse_args()


def patch_config(config_path):
    """Rewrite the JSON file at ``config_path`` in place with quiet log settings.

    Sets ``LOG.flush_level`` and ``LOG.core_enable_mask`` plus the log-level
    keys of the ``WRAPPER`` section, creating either section when missing.
    All other keys are preserved. The file is re-serialised with a 4-space
    indent and a trailing newline.
    """
    with open(config_path, "r", encoding="utf-8") as handle:
        config = json.load(handle)

    log_section = config.setdefault("LOG", {})
    log_section["flush_level"] = _QUIET_LEVEL
    log_section["core_enable_mask"] = ["0x0"]

    wrapper = config.setdefault("WRAPPER", {})
    for key in _WRAPPER_LEVEL_KEYS:
        wrapper[key] = _QUIET_LEVEL

    with open(config_path, "w", encoding="utf-8") as handle:
        json.dump(config, handle, indent=4)
        handle.write("\n")


def prepare_quiet_camodel(source_dir, quiet_dir):
    """Build (or reuse) the quiet view of ``source_dir`` inside ``quiet_dir``.

    The work is serialised with an exclusive ``flock`` on a lock file inside
    ``quiet_dir`` so that concurrent invocations do not race.

    Returns:
        Absolute path of ``quiet_dir``.

    Raises:
        FileNotFoundError: ``source_dir`` is not a directory, or it contains
            no ``config.json``.
        ValueError: ``quiet_dir`` resolves to ``source_dir`` itself.
        RuntimeError: ``quiet_dir`` was already prepared from another source.
    """
    source_dir = os.path.realpath(source_dir)
    if not os.path.isdir(source_dir):
        raise FileNotFoundError(f"camodel source dir is invalid: {source_dir}")
    # Refuse to work in place: otherwise the lock file would be created inside
    # the original installation before the config copy fails.
    if os.path.realpath(quiet_dir) == source_dir:
        raise ValueError(f"camodel output dir must differ from source dir: {source_dir}")

    os.makedirs(quiet_dir, exist_ok=True)
    lock_path = os.path.join(quiet_dir, _LOCK_NAME)
    with open(lock_path, "w", encoding="utf-8") as lock_file:
        # The lock is released when the file is closed at the end of the block.
        fcntl.flock(lock_file, fcntl.LOCK_EX)
        return prepare_quiet_camodel_locked(source_dir, quiet_dir)


def prepare_quiet_camodel_locked(source_dir, quiet_dir):
    """Populate ``quiet_dir``; the caller must already hold the directory lock.

    ``source_dir`` is expected to be the resolved path, because it is compared
    verbatim with the marker file written by a previous run.
    """
    source_marker = os.path.join(quiet_dir, _SOURCE_MARKER_NAME)
    config_path = os.path.join(quiet_dir, _CONFIG_NAME)

    if os.path.isfile(source_marker):
        with open(source_marker, "r", encoding="utf-8") as handle:
            existing_source = handle.read().strip()
        if existing_source != source_dir:
            raise RuntimeError(
                f"output dir already points to {existing_source}, cannot reuse it for {source_dir}"
            )
        # The marker is written last, so marker + config means a finished run.
        if os.path.isfile(config_path):
            return os.path.abspath(quiet_dir)

    for name in os.listdir(source_dir):
        src = os.path.join(source_dir, name)
        dst = os.path.join(quiet_dir, name)
        if name == _CONFIG_NAME:
            # A leftover symlink would make copy2 fail with SameFileError or,
            # worse, let the patch below write through it into the original
            # installation. Replace the link with a private copy.
            if os.path.islink(dst):
                os.unlink(dst)
            shutil.copy2(src, dst)
            # copy2 preserves the source mode; make sure we may rewrite the copy.
            os.chmod(dst, os.stat(dst).st_mode | 0o200)
            continue
        if os.path.lexists(dst):
            continue
        try:
            os.symlink(src, dst)
        except FileExistsError:
            pass

    if not os.path.isfile(config_path):
        raise FileNotFoundError(f"camodel config.json not found under {source_dir}")
    patch_config(config_path)
    # Commit point: only a fully prepared directory carries the marker.
    with open(source_marker, "w", encoding="utf-8") as handle:
        handle.write(source_dir + "\n")
    return os.path.abspath(quiet_dir)


def main():
    """Script entry point: print the prepared directory and return exit code 0."""
    args = parse_args()
    quiet_dir = os.path.abspath(args.output_dir)
    print(prepare_quiet_camodel(args.source_dir, quiet_dir))
    return 0


if __name__ == "__main__":
    sys.exit(main())
# Top-level CMake project for the standalone TileLang ST smoke build.
# It validates the inputs the build needs (PTOAS_BIN, ASCEND_HOME_PATH),
# selects bisheng as C/C++ compiler, defines the shared option lists consumed
# by the testcase subdirectory, and then descends into testcase/.

cmake_minimum_required(VERSION 3.16)
project(tilelang_st)

set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
set(CMAKE_POSITION_INDEPENDENT_CODE ON)

# CMake 3.27+ may ask the linker to emit dependency files via
# `--dependency-file`. bisheng/cce-ld does not support that flag, so disable
# linker-generated link dependencies for this standalone ST build.
if(CMAKE_VERSION VERSION_GREATER_EQUAL 3.27)
    set(CMAKE_LINK_DEPENDS_USE_LINKER FALSE)
endif()

# Executables go to <build>/bin, shared libraries to <build>/lib.
set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin)
set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib)

# --------------------------------------------------------------------------
# PTOAS binary — passed by run_st.py via -DPTOAS_BIN=...
# --------------------------------------------------------------------------
if(NOT DEFINED PTOAS_BIN)
    message(FATAL_ERROR "PTOAS_BIN is not set. Pass -DPTOAS_BIN=/path/to/ptoas to cmake.")
endif()

# --------------------------------------------------------------------------
# ASCEND environment
# --------------------------------------------------------------------------
# The toolkit root is taken from the environment and mirrored into a CMake
# variable used by the include and link paths below.
if(NOT DEFINED ENV{ASCEND_HOME_PATH})
    message(FATAL_ERROR "Cannot find ASCEND_HOME_PATH, please run set_env.sh.")
else()
    set(ASCEND_HOME_PATH $ENV{ASCEND_HOME_PATH})
endif()

# PTO_ISA_ROOT is a cache entry, so -DPTO_ISA_ROOT=... overrides this default.
set(PTO_ISA_ROOT "${CMAKE_CURRENT_LIST_DIR}/../../../../../../../../pto-isa" CACHE PATH "Path to pto-isa repo")
set(PTO_TILELANG_ST_COMMON_DIR
    "${CMAKE_CURRENT_LIST_DIR}/../common")
# NOTE(review): the driver location is hard-coded; confirm that every build
# host installs it under /usr/local/Ascend/driver.
set(ASCEND_DRIVER_PATH /usr/local/Ascend/driver)

# Both C and C++ sources are compiled with bisheng.
set(CMAKE_COMPILER bisheng)
set(CMAKE_C_COMPILER ${CMAKE_COMPILER})
set(CMAKE_CXX_COMPILER ${CMAKE_COMPILER})

# Options applied to every target created below this directory.
add_compile_options(
    -D_FORTIFY_SOURCE=2
    -O2 -std=c++17
    -Wno-macro-redefined -Wno-ignored-attributes -Wno-unknown-attributes
    -fstack-protector-strong
    -fPIC
)
add_link_options(
    -s
    -Wl,-z,relro
    -Wl,-z,now
)

# Option list for CCE kernel sources; appended per target in testcase/.
# The SHELL: prefix keeps each "-mllvm <arg>" pair together.
set(CMAKE_CCE_COMPILE_OPTIONS
    -xcce
    -fPIC
    -Xhost-start -Xhost-end
    "SHELL:-mllvm -cce-aicore-stack-size=0x8000"
    "SHELL:-mllvm -cce-aicore-function-stack-size=0x8000"
    "SHELL:-mllvm -cce-aicore-record-overflow=true"
    "SHELL:-mllvm -cce-aicore-addr-transform"
    "SHELL:-mllvm -cce-aicore-dcci-insert-for-scalar=false"
)

# Option list for host-side C++ sources; appended per target in testcase/.
set(CMAKE_CPP_COMPILE_OPTIONS
    -xc++
    "SHELL:-include stdint.h"
    "SHELL:-include stddef.h"
)

include_directories(
    ${ASCEND_HOME_PATH}/include
    ${ASCEND_DRIVER_PATH}/kernel/inc
)

add_subdirectory(testcase)
# --------------------------------------------------------------------------
# pto_tilelang_st(NAME [DISABLE_INSERT_SYNC] [PTO_LEVEL <level>] [AICORE_ARCH <arch>])
#
# CMake function for TileLang ST test cases. Unlike pto-isa's pto_vec_st()
# which compiles a hand-written kernel.cpp with -xcce, this function:
#   1. Runs ptoas to compile .pto → kernel.fatobj.o
#   2. Links the fatobj object with launch.cpp → shared library
#   3. Builds host executable from main.cpp (no GTest — comparison via compare.py)
#
# Options:
#   DISABLE_INSERT_SYNC  do not pass --enable-insert-sync to ptoas
#   PTO_LEVEL            forwarded as --pto-level=<level> when given
#   AICORE_ARCH          value for --cce-aicore-arch (default dav-c310-vec)
# --------------------------------------------------------------------------
set(PTO_TILELANG_ST_TESTCASE_DIR ${CMAKE_CURRENT_LIST_DIR})

# Simulator library directory: an explicit environment override wins,
# otherwise fall back to the toolkit's per-SoC simulator directory.
if(DEFINED ENV{TILELANG_ST_SIM_LIB_DIR})
    set(PTO_TILELANG_ST_SIM_LIB_DIR $ENV{TILELANG_ST_SIM_LIB_DIR})
else()
    set(PTO_TILELANG_ST_SIM_LIB_DIR ${ASCEND_HOME_PATH}/tools/simulator/${SOC_VERSION}/lib)
endif()

function(pto_tilelang_st NAME)
    set(options DISABLE_INSERT_SYNC)
    set(oneValueArgs PTO_LEVEL AICORE_ARCH)
    cmake_parse_arguments(PTO_TILELANG_ST "${options}" "${oneValueArgs}" "" ${ARGN})

    set(PTOAS_ENABLE_INSERT_SYNC ON)
    if(PTO_TILELANG_ST_DISABLE_INSERT_SYNC)
        set(PTOAS_ENABLE_INSERT_SYNC OFF)
    endif()

    set(PTOAS_PTO_LEVEL "")
    if(DEFINED PTO_TILELANG_ST_PTO_LEVEL)
        set(PTOAS_PTO_LEVEL "${PTO_TILELANG_ST_PTO_LEVEL}")
    endif()

    set(AICORE_ARCH "dav-c310-vec")
    if(DEFINED PTO_TILELANG_ST_AICORE_ARCH)
        set(AICORE_ARCH "${PTO_TILELANG_ST_AICORE_ARCH}")
    endif()

    # Step 1: ptoas .pto → kernel fatobj object
    set(PTO_SRC ${CMAKE_CURRENT_SOURCE_DIR}/${NAME}.pto)
    set(KERNEL_FATOBJ ${CMAKE_CURRENT_BINARY_DIR}/${NAME}_kernel.o)
    set(PTOAS_CAPTURE_SCRIPT
        ${PTO_TILELANG_ST_TESTCASE_DIR}/run_ptoas_to_file.cmake)
    add_custom_command(
        OUTPUT ${KERNEL_FATOBJ}
        COMMAND ${CMAKE_COMMAND}
            -DPTOAS_BIN=${PTOAS_BIN}
            -DPTO_SRC=${PTO_SRC}
            -DKERNEL_FATOBJ=${KERNEL_FATOBJ}
            -DPTOAS_ENABLE_INSERT_SYNC=${PTOAS_ENABLE_INSERT_SYNC}
            -DPTOAS_PTO_LEVEL=${PTOAS_PTO_LEVEL}
            -P ${PTOAS_CAPTURE_SCRIPT}
        DEPENDS ${PTO_SRC} ${PTOAS_CAPTURE_SCRIPT}
        COMMENT "ptoas: ${NAME}.pto -> ${NAME}_kernel.o"
        VERBATIM
    )

    # Step 2: link the fatobj object with launch.cpp.
    add_library(${NAME}_kernel SHARED launch.cpp ${KERNEL_FATOBJ})
    # The fatobj is produced at build time and must be linked as-is, not compiled.
    set_source_files_properties(${KERNEL_FATOBJ}
        PROPERTIES EXTERNAL_OBJECT TRUE GENERATED TRUE)
    target_compile_options(${NAME}_kernel PRIVATE
        ${CMAKE_CCE_COMPILE_OPTIONS} --cce-aicore-arch=${AICORE_ARCH} -std=c++17)
    target_include_directories(${NAME}_kernel PRIVATE
        ${ASCEND_HOME_PATH}/pkg_inc/
        ${ASCEND_HOME_PATH}/pkg_inc/profiling/
        ${ASCEND_HOME_PATH}/pkg_inc/runtime/runtime
    )
    target_link_options(${NAME}_kernel PRIVATE --cce-fatobj-link)

    # Step 3: main.cpp → host executable
    add_executable(${NAME} main.cpp)
    target_compile_options(${NAME} PRIVATE ${CMAKE_CPP_COMPILE_OPTIONS})
    target_include_directories(${NAME} PRIVATE
        ${PTO_TILELANG_ST_COMMON_DIR}
    )

    target_link_directories(${NAME} PUBLIC
        ${ASCEND_HOME_PATH}/lib64
        ${PTO_TILELANG_ST_SIM_LIB_DIR}
    )

    # NOTE(review): the two generator expressions below were reconstructed;
    # the text between '<' and '>' was lost in the reviewed copy. Confirm that
    # the run mode variable is RUN_MODE with the values "sim" and "npu".
    target_link_libraries(${NAME} PRIVATE
        ${NAME}_kernel
        $<BUILD_INTERFACE:$<$<STREQUAL:${RUN_MODE},sim>:runtime_camodel>>
        $<BUILD_INTERFACE:$<$<STREQUAL:${RUN_MODE},npu>:runtime>>
        stdc++ ascendcl m tiling_api platform c_sec dl nnopbase pthread
    )
endfunction()

# Vector-core flavour: pins AICORE_ARCH to dav-c310-vec.
function(pto_tilelang_vec_st NAME)
    pto_tilelang_st(
        ${NAME}
        AICORE_ARCH dav-c310-vec
        ${ARGN}
    )
endfunction()

# Cube-core flavour: pins AICORE_ARCH to dav-c310-cube.
function(pto_tilelang_cube_st NAME)
    pto_tilelang_st(
        ${NAME}
        AICORE_ARCH dav-c310-cube
        ${ARGN}
    )
endfunction()

# --------------------------------------------------------------------------
# Test case registry — add new ops here.
# --------------------------------------------------------------------------
set(ALL_TESTCASES
    tadd
    tsub
    tmul
    tdiv
    tmax
    tmin
    tmov
    tmrgsort
    tshl
    tshr
    tand
    tor
    txor
    tcmp
    tfmod
    trem
    tcvt
    tload
    tlrelu
    trelu
    tsel
    tsels
    tcolmax
    tcolmin
    tcolsum
    tcolprod
    tcolargmax
    tcolargmin
    tcolexpand
    tcolexpandadd
    tcolexpandsub
    tcolexpandmul
    tcolexpanddiv
    tcolexpandmax
    tcolexpandmin
    tcolexpandexpdif
    softmax
    tabs
    texp
    textract
    textract_fp
    textract_v2v
    tlog
    tneg
    tnot
    tpartmax
    tpartmin
    tpartadd
    tpartmul
    tprelu
    trandom
    trecip
    trowargmax
    trowargmin
    trowsum
    trowmax
    trowmin
    trowprod
    trsqrt
    tsort32
    tsqrt
    trowexpand
    trowexpandadd
    trowexpanddiv
    trowexpandexpdif
    trowexpandmax
    trowexpandmin
    trowexpandmul
    trowexpandsub
    texpands
    tfillpad
    tfillpad_inplace
    tfillpad_expand
    tadds
    tands
    tdivs
    tmaxs
    tmins
    tmuls
    tors
    tshls
    tshrs
    tsubs
    txors
    trems
    tfmods
    tcmps
    tmatmul
)

# An unset or empty TEST_CASE selects every registered case. Previously an
# unset TEST_CASE failed the validation below before the foreach could apply
# its "not defined means all" rule.
if(NOT DEFINED TEST_CASE OR TEST_CASE STREQUAL "")
    set(TEST_CASE "all")
endif()

if((TEST_CASE IN_LIST ALL_TESTCASES) OR (TEST_CASE STREQUAL "all"))
    message(STATUS "run: ${TEST_CASE}")
else()
    message(FATAL_ERROR "not found TEST_CASE: ${TEST_CASE}, supported: ${ALL_TESTCASES}")
endif()

foreach(TESTCASE IN LISTS ALL_TESTCASES)
    if((TEST_CASE STREQUAL "all") OR (TEST_CASE STREQUAL TESTCASE))
        add_subdirectory(${TESTCASE})
    endif()
endforeach()
+# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. + +if(NOT DEFINED PTOAS_BIN OR NOT DEFINED PTO_SRC OR NOT DEFINED KERNEL_FATOBJ) + message(FATAL_ERROR "PTOAS_BIN, PTO_SRC, and KERNEL_FATOBJ must be provided") +endif() + +get_filename_component(KERNEL_FATOBJ_DIR "${KERNEL_FATOBJ}" DIRECTORY) +file(MAKE_DIRECTORY "${KERNEL_FATOBJ_DIR}") + +if(NOT DEFINED PTOAS_ENABLE_INSERT_SYNC) + set(PTOAS_ENABLE_INSERT_SYNC ON) +endif() + +set(PTOAS_COMMAND + "${PTOAS_BIN}" + --pto-arch=a5 +) + +if(DEFINED PTOAS_PTO_LEVEL AND NOT PTOAS_PTO_LEVEL STREQUAL "") + list(APPEND PTOAS_COMMAND "--pto-level=${PTOAS_PTO_LEVEL}") +endif() + +list(APPEND PTOAS_COMMAND --pto-backend=vpto) + +if(PTOAS_ENABLE_INSERT_SYNC) + list(APPEND PTOAS_COMMAND --enable-insert-sync) +endif() + +list(APPEND PTOAS_COMMAND + --enable-tile-op-expand + "${PTO_SRC}" + -o + "${KERNEL_FATOBJ}" +) + +execute_process( + COMMAND ${PTOAS_COMMAND} + ERROR_VARIABLE PTOAS_STDERR + RESULT_VARIABLE PTOAS_RESULT +) + +if(NOT PTOAS_RESULT EQUAL 0) + string(STRIP "${PTOAS_STDERR}" PTOAS_STDERR) + if(PTOAS_STDERR) + message(FATAL_ERROR "ptoas failed while generating ${KERNEL_FATOBJ}:\n${PTOAS_STDERR}") + endif() + message(FATAL_ERROR "ptoas failed while generating ${KERNEL_FATOBJ}") +endif() + +if(NOT EXISTS "${KERNEL_FATOBJ}") + message(FATAL_ERROR "ptoas completed without producing ${KERNEL_FATOBJ}") +endif() + +file(SIZE "${KERNEL_FATOBJ}" KERNEL_FATOBJ_SIZE) +if(KERNEL_FATOBJ_SIZE EQUAL 0) + file(REMOVE "${KERNEL_FATOBJ}") + string(STRIP "${PTOAS_STDERR}" PTOAS_STDERR) + if(PTOAS_STDERR) + message(FATAL_ERROR + "ptoas produced empty fatobj for 
${PTO_SRC}:\n${PTOAS_STDERR}") + endif() + message(FATAL_ERROR "ptoas produced empty fatobj for ${PTO_SRC}") +endif() diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/softmax/CMakeLists.txt b/test/tilelang_st/npu/a5/src/st/smoke/testcase/softmax/CMakeLists.txt new file mode 100644 index 000000000..3c5224444 --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/softmax/CMakeLists.txt @@ -0,0 +1,9 @@ +# Copyright (c) 2026 Huawei Technologies Co., Ltd. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. + +pto_tilelang_vec_st(softmax DISABLE_INSERT_SYNC PTO_LEVEL level3) diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/softmax/cases.py b/test/tilelang_st/npu/a5/src/st/smoke/testcase/softmax/cases.py new file mode 100644 index 000000000..f7f62aa00 --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/softmax/cases.py @@ -0,0 +1,43 @@ +#!/usr/bin/python3 +# Copyright (c) 2026 Huawei Technologies Co., Ltd. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. 
# See LICENSE in the root of the software repository for the full text of the License.

import numpy as np


def _make_case(name, rows, seq, seed, cols=128):
    """Return one softmax case description.

    ``shape`` is the full ``(rows, cols)`` buffer and ``valid_shape`` the
    ``(rows, seq)`` region; both are also exposed as scalar fields.
    """
    return {
        "name": name,
        "dtype": np.float32,
        "shape": (rows, cols),
        "valid_shape": (rows, seq),
        "eps": 1e-4,
        "rows": rows,
        "cols": cols,
        "seq": seq,
        "seed": seed,
    }


# Every case known to this testcase.
CASES = [
    _make_case("f32_rows8_seq32", rows=8, seq=32, seed=7),
    _make_case("f32_rows24_seq73", rows=24, seq=73, seed=19),
]

# Names selected for the smoke run; each must exist in CASES.
_SMOKE_CASE_NAMES = ['f32_rows8_seq32', 'f32_rows24_seq73']
_SMOKE_CASE_NAME_SET = set(_SMOKE_CASE_NAMES)

_known_names = {entry["name"] for entry in CASES}
_missing = []
for _wanted in _SMOKE_CASE_NAMES:
    if _wanted not in _known_names:
        _missing.append(_wanted)
if _missing:
    raise RuntimeError("unknown smoke case(s): " + ", ".join(_missing))

# Keep only the smoke selection, in the original CASES order.
CASES = list(filter(lambda entry: entry["name"] in _SMOKE_CASE_NAME_SET, CASES))

import os
import sys

import numpy as np
from cases import CASES
from st_common import result_cmp, style_fail, style_pass, validate_cases


def load_array(path, dtype, shape):
    """Read a raw binary file into an array of ``dtype`` shaped as ``shape``.

    Raises:
        FileNotFoundError: ``path`` does not exist.
        ValueError: the file does not hold exactly the number of elements
            that ``shape`` requires (for example a truncated output file).
    """
    if not os.path.exists(path):
        raise FileNotFoundError(path)
    data = np.fromfile(path, dtype=dtype)
    expected = int(np.prod(shape))
    if data.size != expected:
        raise ValueError(
            f"{path}: expected {expected} elements of {np.dtype(dtype).name}, found {data.size}"
        )
    return data.reshape(shape)


def compare_case(case):
    """Compare the outputs v4..v7 of one case with their golden files.

    v4, v5 and v6 are per-row vectors; v7 is a ``(rows, cols)`` matrix of
    which only the first ``seq`` columns are compared. All files are loaded
    before anything is compared, so a missing or malformed file fails the
    case up front. Returns ``True`` only when every comparison passes.
    """
    case_dir = case["name"]
    rows = int(case["rows"])
    cols = int(case["cols"])
    seq = int(case["seq"])
    dtype = case["dtype"]
    eps = case["eps"]

    tensor_shapes = (
        ("v4", (rows,)),
        ("v5", (rows,)),
        ("v6", (rows,)),
        ("v7", (rows, cols)),
    )

    pairs = []
    try:
        for tensor, shape in tensor_shapes:
            golden = load_array(os.path.join(case_dir, f"golden_{tensor}.bin"), dtype, shape)
            output = load_array(os.path.join(case_dir, f"{tensor}.bin"), dtype, shape)
            pairs.append((tensor, golden, output))
    except FileNotFoundError as exc:
        print(style_fail(f"[ERROR] {case['name']}: missing file {exc}"))
        return False
    except ValueError as exc:
        # A wrong-sized file is a failed case, not a crash of the whole run.
        print(style_fail(f"[ERROR] {case['name']}: malformed file {exc}"))
        return False

    ok = True
    for tensor, golden, output in pairs:
        if tensor == "v7":
            # Only the first seq columns are compared for the matrix output.
            golden, output = golden[:, :seq], output[:, :seq]
        # Evaluate every comparison, even after a failure.
        ok = result_cmp(golden, output, eps) and ok
    return ok


def main():
    """Compare every case, or only the case named by ``sys.argv[1]``.

    Exits with status 2 when the filter matches no case or any comparison
    fails; otherwise reports success and returns normally.
    """
    validate_cases(CASES)
    case_filter = sys.argv[1] if len(sys.argv) > 1 else None

    all_passed = True
    matched_case = case_filter is None
    for case in CASES:
        if case_filter is not None and case["name"] != case_filter:
            continue

        matched_case = True
        ok = compare_case(case)
        if ok:
            print(style_pass(f"[INFO] {case['name']}: compare passed"))
        else:
            print(style_fail(f"[ERROR] {case['name']}: compare failed"))
            all_passed = False

    if not matched_case:
        print(style_fail(f"[ERROR] unknown case filter: {case_filter}"))
        sys.exit(2)
    if not all_passed:
        sys.exit(2)
    print(style_pass("[INFO] all cases passed"))


if __name__ == "__main__":
    main()

# Generate input and golden data for every softmax case.
#
# For each case the script draws a previous running maximum and sum per row
# and a (rows, cols) score matrix, computes the updated maximum, sum, rescale
# factor and normalised output with NumPy over the first `seq` columns, and
# hands everything to save_case_data under the names v1..v9 / golden_v4..v7.

import numpy as np

from cases import CASES
from st_common import save_case_data, validate_cases


validate_cases(CASES)

for case in CASES:
    rows = int(case["rows"])
    cols = int(case["cols"])
    seq = int(case["seq"])
    seed = int(case["seed"])

    # One generator per case; the order of the three draws below fixes the data.
    rng = np.random.default_rng(seed)
    oldmax = rng.uniform(-3.0, 1.5, size=(rows,)).astype(np.float32)
    oldsum = rng.uniform(0.5, 4.0, size=(rows,)).astype(np.float32)
    qk = rng.normal(loc=0.0, scale=1.5, size=(rows, cols)).astype(np.float32)

    # Only the first `seq` columns take part in the reference computation.
    qk_active = qk[:, :seq]
    qk_rowmax = np.max(qk_active, axis=1)
    # New running maximum: element-wise max of the row maximum and the old one.
    newmax = np.maximum(qk_rowmax, oldmax)
    tmp_active = np.exp(qk_active - newmax[:, None], dtype=np.float32)
    cursum = np.sum(tmp_active, axis=1, dtype=np.float32)
    # Factor that rescales the old sum from the old maximum to the new one.
    raw_expmax = np.exp(oldmax - newmax, dtype=np.float32)
    newsum = raw_expmax * oldsum + cursum
    # Share of the new sum that comes from the rescaled old sum.
    expmax = (raw_expmax * oldsum) / newsum
    # Columns at and beyond `seq` stay zero in the golden output.
    out = np.zeros((rows, cols), dtype=np.float32)
    out[:, :seq] = tmp_active / newsum[:, None]

    # Zero-filled initial contents for the output buffers v4..v7.
    zeros_state = np.zeros((rows,), dtype=np.float32)
    zeros_out = np.zeros((rows, cols), dtype=np.float32)

    save_case_data(
        case["name"],
        {
            "v1": oldmax,
            "v2": oldsum,
            "v3": qk.reshape(-1),
            "v4": zeros_state,
            "v5": zeros_state,
            "v6": zeros_state,
            "v7": zeros_out.reshape(-1),
            "v8": np.array([seq], dtype=np.int32),
            "v9": np.array([rows], dtype=np.int32),
            "golden_v4": newmax,
            "golden_v5": newsum,
            "golden_v6": expmax,
            "golden_v7": out.reshape(-1),
        },
    )
    print(
        f"[INFO] gen_data: {case['name']} rows={rows} cols={cols} "
        f"seq={seq} dtype={case['dtype'].__name__}"
    )
+// This program is free software, you can redistribute it and/or modify it under the terms and conditions of +// CANN Open Software License Agreement Version 2.0 (the "License"). +// Please refer to the License for details. You may not use this file except in compliance with the License. +// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +// See LICENSE in the root of the software repository for the full text of the License. + +#include + +#ifndef AICORE +#define AICORE [aicore] +#endif + +extern "C" __global__ AICORE void online_softmax_update_kernel_2d(__gm__ float *v1, __gm__ float *v2, __gm__ float *v3, __gm__ float *v4, __gm__ float *v5, __gm__ float *v6, __gm__ float *v7, int32_t v8, int32_t v9); + +void LaunchSOFTMAX_f32_rows24_seq73(float *v1, float *v2, float *v3, + float *v4, float *v5, float *v6, + float *v7, int32_t v8, int32_t v9, + void *stream) { + const int32_t blockRows = 8; + const int32_t blocks = (v9 + blockRows - 1) / blockRows; + online_softmax_update_kernel_2d<<>>( + (__gm__ float *)v1, (__gm__ float *)v2, (__gm__ float *)v3, + (__gm__ float *)v4, (__gm__ float *)v5, (__gm__ float *)v6, + (__gm__ float *)v7, v8, v9); +} + + +void LaunchSOFTMAX_f32_rows8_seq32(float *v1, float *v2, float *v3, + float *v4, float *v5, float *v6, + float *v7, int32_t v8, int32_t v9, + void *stream) { + const int32_t blockRows = 8; + const int32_t blocks = (v9 + blockRows - 1) / blockRows; + online_softmax_update_kernel_2d<<>>( + (__gm__ float *)v1, (__gm__ float *)v2, (__gm__ float *)v3, + (__gm__ float *)v4, (__gm__ float *)v5, (__gm__ float *)v6, + (__gm__ float *)v7, v8, v9); +} diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/softmax/main.cpp b/test/tilelang_st/npu/a5/src/st/smoke/testcase/softmax/main.cpp new file mode 100644 index 000000000..72f1ebd5e --- /dev/null +++ 
b/test/tilelang_st/npu/a5/src/st/smoke/testcase/softmax/main.cpp @@ -0,0 +1,202 @@ +// Copyright (c) 2026 Huawei Technologies Co., Ltd. +// This program is free software, you can redistribute it and/or modify it under the terms and conditions of +// CANN Open Software License Agreement Version 2.0 (the "License"). +// Please refer to the License for details. You may not use this file except in compliance with the License. +// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +// See LICENSE in the root of the software repository for the full text of the License. + +#include "test_common.h" +#include "acl/acl.h" +#include +#include +#include +#include +#include + +using namespace PtoTestCommon; + +#ifndef TMRGSORT_HPP +namespace pto { +struct MrgSortExecutedNumList { + uint16_t mrgSortList0; + uint16_t mrgSortList1; + uint16_t mrgSortList2; + uint16_t mrgSortList3; +}; +} // namespace pto +#endif + +#define ACL_CHECK(expr) \ + do { \ + const aclError _ret = (expr); \ + if (_ret != ACL_SUCCESS) { \ + std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", #expr, (int)_ret, __FILE__, __LINE__); \ + const char *_recent = aclGetRecentErrMsg(); \ + if (_recent != nullptr && _recent[0] != '\0') \ + std::fprintf(stderr, "[ERROR] RecentErrMsg: %s\n", _recent); \ + rc = 1; \ + goto cleanup; \ + } \ + } while (0) + +void LaunchSOFTMAX_f32_rows24_seq73(float *v1, float *v2, float *v3, + float *v4, float *v5, float *v6, + float *v7, int32_t v8, int32_t v9, + void *stream); +void LaunchSOFTMAX_f32_rows8_seq32(float *v1, float *v2, float *v3, + float *v4, float *v5, float *v6, + float *v7, int32_t v8, int32_t v9, + void *stream); + +using LaunchFn = void (*)(float *, float *, float *, float *, float *, float *, + float *, int32_t, int32_t, void *); + +struct TestCase { + const char *name; + LaunchFn launch; + size_t rows; 
+ size_t cols; +}; + +static const TestCase kCases[] = { +{"f32_rows8_seq32", LaunchSOFTMAX_f32_rows8_seq32, 8, 128}, +{"f32_rows24_seq73", LaunchSOFTMAX_f32_rows24_seq73, 24, 128}, +}; +static constexpr size_t kNumCases = sizeof(kCases) / sizeof(kCases[0]); + +static int RunCase(const TestCase &tc, aclrtStream stream) { + const size_t scalarBytes = sizeof(int32_t); + const size_t stateElems = tc.rows; + const size_t outElems = tc.rows * tc.cols; + const size_t stateBytes = stateElems * sizeof(float); + const size_t outBytes = outElems * sizeof(float); + std::string caseDir = std::string("./") + tc.name; + + float *v1Host = nullptr, *v2Host = nullptr, *v3Host = nullptr; + float *v4Host = nullptr, *v5Host = nullptr, *v6Host = nullptr, *v7Host = nullptr; + float *v1Device = nullptr, *v2Device = nullptr, *v3Device = nullptr; + float *v4Device = nullptr, *v5Device = nullptr, *v6Device = nullptr, *v7Device = nullptr; + int32_t seqHost = 0; + int32_t rowsHost = 0; + size_t fileSize = 0; + int rc = 0; + + std::printf("[INFO] === case: %s (rows=%zu, cols=%zu) ===\n", + tc.name, tc.rows, tc.cols); + + if (!ReadFile(caseDir + "/v8.bin", fileSize, &seqHost, scalarBytes) || + !ReadFile(caseDir + "/v9.bin", fileSize, &rowsHost, scalarBytes)) { + std::fprintf(stderr, "[ERROR] failed to read scalar inputs for %s\n", tc.name); + return 1; + } + + ACL_CHECK(aclrtMallocHost((void **)(&v1Host), stateBytes)); + ACL_CHECK(aclrtMallocHost((void **)(&v2Host), stateBytes)); + ACL_CHECK(aclrtMallocHost((void **)(&v3Host), outBytes)); + ACL_CHECK(aclrtMallocHost((void **)(&v4Host), stateBytes)); + ACL_CHECK(aclrtMallocHost((void **)(&v5Host), stateBytes)); + ACL_CHECK(aclrtMallocHost((void **)(&v6Host), stateBytes)); + ACL_CHECK(aclrtMallocHost((void **)(&v7Host), outBytes)); + + ACL_CHECK(aclrtMalloc((void **)&v1Device, stateBytes, ACL_MEM_MALLOC_HUGE_FIRST)); + ACL_CHECK(aclrtMalloc((void **)&v2Device, stateBytes, ACL_MEM_MALLOC_HUGE_FIRST)); + ACL_CHECK(aclrtMalloc((void **)&v3Device, 
outBytes, ACL_MEM_MALLOC_HUGE_FIRST)); + ACL_CHECK(aclrtMalloc((void **)&v4Device, stateBytes, ACL_MEM_MALLOC_HUGE_FIRST)); + ACL_CHECK(aclrtMalloc((void **)&v5Device, stateBytes, ACL_MEM_MALLOC_HUGE_FIRST)); + ACL_CHECK(aclrtMalloc((void **)&v6Device, stateBytes, ACL_MEM_MALLOC_HUGE_FIRST)); + ACL_CHECK(aclrtMalloc((void **)&v7Device, outBytes, ACL_MEM_MALLOC_HUGE_FIRST)); + + if (!ReadFile(caseDir + "/v1.bin", fileSize, v1Host, stateBytes) || + !ReadFile(caseDir + "/v2.bin", fileSize, v2Host, stateBytes) || + !ReadFile(caseDir + "/v3.bin", fileSize, v3Host, outBytes) || + !ReadFile(caseDir + "/v4.bin", fileSize, v4Host, stateBytes) || + !ReadFile(caseDir + "/v5.bin", fileSize, v5Host, stateBytes) || + !ReadFile(caseDir + "/v6.bin", fileSize, v6Host, stateBytes) || + !ReadFile(caseDir + "/v7.bin", fileSize, v7Host, outBytes)) { + std::fprintf(stderr, "[ERROR] failed to read tensor inputs for %s\n", tc.name); + rc = 1; + goto cleanup; + } + + ACL_CHECK(aclrtMemcpy(v1Device, stateBytes, v1Host, stateBytes, ACL_MEMCPY_HOST_TO_DEVICE)); + ACL_CHECK(aclrtMemcpy(v2Device, stateBytes, v2Host, stateBytes, ACL_MEMCPY_HOST_TO_DEVICE)); + ACL_CHECK(aclrtMemcpy(v3Device, outBytes, v3Host, outBytes, ACL_MEMCPY_HOST_TO_DEVICE)); + ACL_CHECK(aclrtMemcpy(v4Device, stateBytes, v4Host, stateBytes, ACL_MEMCPY_HOST_TO_DEVICE)); + ACL_CHECK(aclrtMemcpy(v5Device, stateBytes, v5Host, stateBytes, ACL_MEMCPY_HOST_TO_DEVICE)); + ACL_CHECK(aclrtMemcpy(v6Device, stateBytes, v6Host, stateBytes, ACL_MEMCPY_HOST_TO_DEVICE)); + ACL_CHECK(aclrtMemcpy(v7Device, outBytes, v7Host, outBytes, ACL_MEMCPY_HOST_TO_DEVICE)); + + tc.launch(v1Device, v2Device, v3Device, v4Device, v5Device, v6Device, + v7Device, seqHost, rowsHost, stream); + + ACL_CHECK(aclrtSynchronizeStream(stream)); + ACL_CHECK(aclrtMemcpy(v4Host, stateBytes, v4Device, stateBytes, ACL_MEMCPY_DEVICE_TO_HOST)); + ACL_CHECK(aclrtMemcpy(v5Host, stateBytes, v5Device, stateBytes, ACL_MEMCPY_DEVICE_TO_HOST)); + ACL_CHECK(aclrtMemcpy(v6Host, 
stateBytes, v6Device, stateBytes, ACL_MEMCPY_DEVICE_TO_HOST)); + ACL_CHECK(aclrtMemcpy(v7Host, outBytes, v7Device, outBytes, ACL_MEMCPY_DEVICE_TO_HOST)); + + if (!WriteFile(caseDir + "/v4.bin", v4Host, stateBytes) || + !WriteFile(caseDir + "/v5.bin", v5Host, stateBytes) || + !WriteFile(caseDir + "/v6.bin", v6Host, stateBytes) || + !WriteFile(caseDir + "/v7.bin", v7Host, outBytes)) { + std::fprintf(stderr, "[ERROR] failed to write outputs for %s\n", tc.name); + rc = 1; + } + +cleanup: + if (v1Device != nullptr) aclrtFree(v1Device); + if (v2Device != nullptr) aclrtFree(v2Device); + if (v3Device != nullptr) aclrtFree(v3Device); + if (v4Device != nullptr) aclrtFree(v4Device); + if (v5Device != nullptr) aclrtFree(v5Device); + if (v6Device != nullptr) aclrtFree(v6Device); + if (v7Device != nullptr) aclrtFree(v7Device); + if (v1Host != nullptr) aclrtFreeHost(v1Host); + if (v2Host != nullptr) aclrtFreeHost(v2Host); + if (v3Host != nullptr) aclrtFreeHost(v3Host); + if (v4Host != nullptr) aclrtFreeHost(v4Host); + if (v5Host != nullptr) aclrtFreeHost(v5Host); + if (v6Host != nullptr) aclrtFreeHost(v6Host); + if (v7Host != nullptr) aclrtFreeHost(v7Host); + + if (rc == 0) + std::printf("[INFO] case %s done\n", tc.name); + return rc; +} + +int main(int argc, char *argv[]) { + const char *caseFilter = (argc > 1) ? 
argv[1] : nullptr; + bool matchedCase = (caseFilter == nullptr); + int rc = 0; + int deviceId = 0; + aclrtStream stream = nullptr; + + aclInit(nullptr); + if (const char *envDevice = std::getenv("ACL_DEVICE_ID")) + deviceId = std::atoi(envDevice); + aclrtSetDevice(deviceId); + aclrtCreateStream(&stream); + + for (size_t i = 0; i < kNumCases; ++i) { + if (caseFilter != nullptr && std::strcmp(kCases[i].name, caseFilter) != 0) + continue; + matchedCase = true; + if (RunCase(kCases[i], stream) != 0) { + std::fprintf(stderr, "[ERROR] case %s failed\n", kCases[i].name); + rc = 1; + break; + } + } + + if (!matchedCase) { + std::fprintf(stderr, "[ERROR] unknown case filter: %s\n", caseFilter); + rc = 1; + } + + if (stream != nullptr) + aclrtDestroyStream(stream); + aclrtResetDevice(deviceId); + aclFinalize(); + return rc; +} diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/softmax/softmax.pto b/test/tilelang_st/npu/a5/src/st/smoke/testcase/softmax/softmax.pto new file mode 100644 index 000000000..c6acdbe40 --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/softmax/softmax.pto @@ -0,0 +1,238 @@ +// TileLang ST kernel for online softmax update with mixed pto.tload/pto.tstore +// and raw VPTO vecscope compute. +// This testcase keeps manual sync in the source, so ST compilation disables +// --enable-insert-sync and enables --pto-level=level3 for alloc_tile addr=. 
+ +module attributes {pto.target_arch = "a5", pto.kernel_kind = #pto.kernel_kind} { + func.func @online_softmax_update_kernel_2d(%arg0: !pto.ptr, + %arg1: !pto.ptr, + %arg2: !pto.ptr, + %arg3: !pto.ptr, + %arg4: !pto.ptr, + %arg5: !pto.ptr, + %arg6: !pto.ptr, + %arg7: i32, + %arg8: i32) attributes {pto.kernel} { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c8 = arith.constant 8 : index + %c64 = arith.constant 64 : index + %c128 = arith.constant 128 : index + + %c0_i64 = arith.constant 0 : i64 + %c1_i64 = arith.constant 1 : i64 + %c8_i64 = arith.constant 8 : i64 + %c16_i64 = arith.constant 16 : i64 + %c32_i64 = arith.constant 32 : i64 + %c64_i64 = arith.constant 64 : i64 + %c128_i64 = arith.constant 128 : i64 + %c256_i64 = arith.constant 256 : i64 + %c512_i64 = arith.constant 512 : i64 + %c8448_i64 = arith.constant 8448 : i64 + %c16640_i64 = arith.constant 16640 : i64 + %c16768_i64 = arith.constant 16768 : i64 + %c16896_i64 = arith.constant 16896 : i64 + + %c1_i32 = arith.constant 1 : i32 + %c8_i32 = arith.constant 8 : i32 + %c64_i32 = arith.constant 64 : i32 + %c0_i32 = arith.constant 0 : i32 + + %block = pto.get_block_idx + %block_idx = arith.index_cast %block : i64 to index + %row_base = arith.muli %block_idx, %c8 : index + %block_rows_i32 = arith.index_cast %c8 : index to i32 + %row_base_i32 = arith.index_cast %row_base : index to i32 + %remaining_rows = arith.subi %arg8, %row_base_i32 : i32 + %has_rows = arith.cmpi sgt, %remaining_rows, %c0_i32 : i32 + %too_many_rows = arith.cmpi sgt, %remaining_rows, %c8_i32 : i32 + %row_count_i32 = arith.select %too_many_rows, %c8_i32, %remaining_rows : i32 + %row_count = arith.index_cast %row_count_i32 : i32 to index + %seq = arith.index_cast %arg7 : i32 to index + %rows = arith.index_cast %arg8 : i32 to index + %rows_x_128 = arith.muli %rows, %c128 : index + + scf.if %has_rows { + %oldmax_view = pto.make_tensor_view %arg0, + shape = [%c1, %c1, %c1, %rows, %c1], + strides = [%rows, %rows, %rows, %c1, 
%rows] + : !pto.tensor_view + %oldsum_view = pto.make_tensor_view %arg1, + shape = [%c1, %c1, %c1, %rows, %c1], + strides = [%rows, %rows, %rows, %c1, %rows] + : !pto.tensor_view + %qk_view = pto.make_tensor_view %arg2, + shape = [%c1, %c1, %c1, %rows, %c128], + strides = [%rows_x_128, %rows_x_128, %rows_x_128, %c128, %c1] + : !pto.tensor_view + %newmax_view = pto.make_tensor_view %arg3, + shape = [%c1, %c1, %c1, %rows, %c1], + strides = [%rows, %rows, %rows, %c1, %rows] + : !pto.tensor_view + %newsum_view = pto.make_tensor_view %arg4, + shape = [%c1, %c1, %c1, %rows, %c1], + strides = [%rows, %rows, %rows, %c1, %rows] + : !pto.tensor_view + %expmax_view = pto.make_tensor_view %arg5, + shape = [%c1, %c1, %c1, %rows, %c1], + strides = [%rows, %rows, %rows, %c1, %rows] + : !pto.tensor_view + %out_view = pto.make_tensor_view %arg6, + shape = [%c1, %c1, %c1, %rows, %c128], + strides = [%rows_x_128, %rows_x_128, %rows_x_128, %c128, %c1] + : !pto.tensor_view + + %oldmax_part = pto.partition_view %oldmax_view, + offsets = [%c0, %c0, %c0, %row_base, %c0], + sizes = [%c1, %c1, %c1, %row_count, %c1] + : !pto.tensor_view -> !pto.partition_tensor_view + %oldsum_part = pto.partition_view %oldsum_view, + offsets = [%c0, %c0, %c0, %row_base, %c0], + sizes = [%c1, %c1, %c1, %row_count, %c1] + : !pto.tensor_view -> !pto.partition_tensor_view + %qk_part = pto.partition_view %qk_view, + offsets = [%c0, %c0, %c0, %row_base, %c0], + sizes = [%c1, %c1, %c1, %row_count, %seq] + : !pto.tensor_view -> !pto.partition_tensor_view + %newmax_part = pto.partition_view %newmax_view, + offsets = [%c0, %c0, %c0, %row_base, %c0], + sizes = [%c1, %c1, %c1, %row_count, %c1] + : !pto.tensor_view -> !pto.partition_tensor_view + %newsum_part = pto.partition_view %newsum_view, + offsets = [%c0, %c0, %c0, %row_base, %c0], + sizes = [%c1, %c1, %c1, %row_count, %c1] + : !pto.tensor_view -> !pto.partition_tensor_view + %expmax_part = pto.partition_view %expmax_view, + offsets = [%c0, %c0, %c0, %row_base, 
%c0], + sizes = [%c1, %c1, %c1, %row_count, %c1] + : !pto.tensor_view -> !pto.partition_tensor_view + %out_part = pto.partition_view %out_view, + offsets = [%c0, %c0, %c0, %row_base, %c0], + sizes = [%c1, %c1, %c1, %row_count, %seq] + : !pto.tensor_view -> !pto.partition_tensor_view + + // Tile domain: alloc_tile creates UB tile handles; tload/tstore operate + // on tile_buf values before/after the vector scope compute region. + %oldmax_tile = pto.alloc_tile addr = %c0_i64 valid_row = %row_count + : !pto.tile_buf + %oldsum_tile = pto.alloc_tile addr = %c128_i64 valid_row = %row_count + : !pto.tile_buf + %qk_tile = pto.alloc_tile addr = %c256_i64 valid_row = %row_count valid_col = %seq + : !pto.tile_buf + %out_tile = pto.alloc_tile addr = %c8448_i64 valid_row = %row_count valid_col = %seq + : !pto.tile_buf + %newmax_tile = pto.alloc_tile addr = %c16640_i64 valid_row = %row_count + : !pto.tile_buf + %newsum_tile = pto.alloc_tile addr = %c16768_i64 valid_row = %row_count + : !pto.tile_buf + %expmax_tile = pto.alloc_tile addr = %c16896_i64 valid_row = %row_count + : !pto.tile_buf + + pto.tload ins(%oldmax_part : !pto.partition_tensor_view) + outs(%oldmax_tile : !pto.tile_buf) + pto.tload ins(%oldsum_part : !pto.partition_tensor_view) + outs(%oldsum_tile : !pto.tile_buf) + pto.tload ins(%qk_part : !pto.partition_tensor_view) + outs(%qk_tile : !pto.tile_buf) + + pto.set_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] + pto.wait_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] + + pto.vecscope { + // Boundary into vecscope instructions: tile_buf_addr materializes UB + // pointers from tile handles so vecscope can use vlds/vsts. 
+ %ub_oldmax = pto.tile_buf_addr %oldmax_tile + : !pto.tile_buf + -> !pto.ptr + %ub_oldsum = pto.tile_buf_addr %oldsum_tile + : !pto.tile_buf + -> !pto.ptr + %ub_qk = pto.tile_buf_addr %qk_tile + : !pto.tile_buf + -> !pto.ptr + %ub_out = pto.tile_buf_addr %out_tile + : !pto.tile_buf + -> !pto.ptr + %ub_newmax = pto.tile_buf_addr %newmax_tile + : !pto.tile_buf + -> !pto.ptr + %ub_newsum = pto.tile_buf_addr %newsum_tile + : !pto.tile_buf + -> !pto.ptr + %ub_expmax = pto.tile_buf_addr %expmax_tile + : !pto.tile_buf + -> !pto.ptr + %active = pto.pset_b32 "PAT_ALL" : !pto.mask + %one_mask, %one_remaining = pto.plt_b32 %c1_i32 : i32 -> !pto.mask, i32 + scf.for %row = %c0 to %row_count step %c1 { + %row_qk = arith.muli %row, %c128 : index + %oldmax_bc = pto.vlds %ub_oldmax[%row] {dist = "BRC_B32"} : !pto.ptr -> !pto.vreg<64xf32> + %oldsum_bc = pto.vlds %ub_oldsum[%row] {dist = "BRC_B32"} : !pto.ptr -> !pto.vreg<64xf32> + + %final_max, %final_sum = scf.for %chunk = %c0 to %c128 step %c64 + iter_args(%running_max = %oldmax_bc, %running_sum = %oldsum_bc) + -> (!pto.vreg<64xf32>, !pto.vreg<64xf32>) { + %chunk_i32 = arith.index_cast %chunk : index to i32 + %remaining_cols = arith.subi %arg7, %chunk_i32 : i32 + %has_chunk = arith.cmpi sgt, %remaining_cols, %c0_i32 : i32 + %next_max, %next_sum = scf.if %has_chunk -> (!pto.vreg<64xf32>, !pto.vreg<64xf32>) { + %chunk_mask, %chunk_rest = pto.plt_b32 %remaining_cols : i32 -> !pto.mask, i32 + %chunk_base = arith.addi %row_qk, %chunk : index + %vec = pto.vlds %ub_qk[%chunk_base] : !pto.ptr -> !pto.vreg<64xf32> + %chunk_max = pto.vcmax %vec, %chunk_mask : !pto.vreg<64xf32>, !pto.mask -> !pto.vreg<64xf32> + %chunk_max_bc = pto.vdup %chunk_max, %active {position = "LOWEST"} : !pto.vreg<64xf32>, !pto.mask -> !pto.vreg<64xf32> + %merged_max = pto.vmax %running_max, %chunk_max_bc, %active : !pto.vreg<64xf32>, !pto.vreg<64xf32>, !pto.mask -> !pto.vreg<64xf32> + %scaled_running = pto.vexpdif %running_max, %merged_max, %active, "ODD" : 
!pto.vreg<64xf32>, !pto.vreg<64xf32>, !pto.mask -> !pto.vreg<64xf32> + %running_sum_scaled = pto.vmul %scaled_running, %running_sum, %active : !pto.vreg<64xf32>, !pto.vreg<64xf32>, !pto.mask -> !pto.vreg<64xf32> + %chunk_exp = pto.vexpdif %vec, %merged_max, %chunk_mask, "ODD" : !pto.vreg<64xf32>, !pto.vreg<64xf32>, !pto.mask -> !pto.vreg<64xf32> + %chunk_sum = pto.vcadd %chunk_exp, %chunk_mask : !pto.vreg<64xf32>, !pto.mask -> !pto.vreg<64xf32> + %chunk_sum_bc = pto.vdup %chunk_sum, %active {position = "LOWEST"} : !pto.vreg<64xf32>, !pto.mask -> !pto.vreg<64xf32> + %merged_sum = pto.vadd %running_sum_scaled, %chunk_sum_bc, %active : !pto.vreg<64xf32>, !pto.vreg<64xf32>, !pto.mask -> !pto.vreg<64xf32> + scf.yield %merged_max, %merged_sum : !pto.vreg<64xf32>, !pto.vreg<64xf32> + } else { + scf.yield %running_max, %running_sum : !pto.vreg<64xf32>, !pto.vreg<64xf32> + } + scf.yield %next_max, %next_sum : !pto.vreg<64xf32>, !pto.vreg<64xf32> + } + + %raw_expmax = pto.vexpdif %oldmax_bc, %final_max, %active, "ODD" : !pto.vreg<64xf32>, !pto.vreg<64xf32>, !pto.mask -> !pto.vreg<64xf32> + %scaled_oldsum = pto.vmul %raw_expmax, %oldsum_bc, %active : !pto.vreg<64xf32>, !pto.vreg<64xf32>, !pto.mask -> !pto.vreg<64xf32> + %expmax = pto.vdiv %scaled_oldsum, %final_sum, %active : !pto.vreg<64xf32>, !pto.vreg<64xf32>, !pto.mask -> !pto.vreg<64xf32> + pto.vsts %final_max, %ub_newmax[%row], %one_mask {dist = "1PT_B32"} : !pto.vreg<64xf32>, !pto.ptr, !pto.mask + pto.vsts %final_sum, %ub_newsum[%row], %one_mask {dist = "1PT_B32"} : !pto.vreg<64xf32>, !pto.ptr, !pto.mask + pto.vsts %expmax, %ub_expmax[%row], %one_mask {dist = "1PT_B32"} : !pto.vreg<64xf32>, !pto.ptr, !pto.mask + + scf.for %chunk = %c0 to %c128 step %c64 { + %chunk_i32 = arith.index_cast %chunk : index to i32 + %remaining_cols = arith.subi %arg7, %chunk_i32 : i32 + %has_chunk = arith.cmpi sgt, %remaining_cols, %c0_i32 : i32 + scf.if %has_chunk { + %chunk_mask, %chunk_rest = pto.plt_b32 %remaining_cols : i32 -> 
!pto.mask, i32 + %chunk_base = arith.addi %row_qk, %chunk : index + %vec = pto.vlds %ub_qk[%chunk_base] : !pto.ptr -> !pto.vreg<64xf32> + %exp = pto.vexpdif %vec, %final_max, %chunk_mask, "ODD" : !pto.vreg<64xf32>, !pto.vreg<64xf32>, !pto.mask -> !pto.vreg<64xf32> + %out = pto.vdiv %exp, %final_sum, %chunk_mask : !pto.vreg<64xf32>, !pto.vreg<64xf32>, !pto.mask -> !pto.vreg<64xf32> + pto.vsts %out, %ub_out[%chunk_base], %chunk_mask : !pto.vreg<64xf32>, !pto.ptr, !pto.mask + } + } + } + } + + pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] + pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] + + // Back in the tile domain: tstore writes the tile_buf results to GM + // partitions after the VPTO vecscope finishes. + pto.tstore ins(%newmax_tile : !pto.tile_buf) + outs(%newmax_part : !pto.partition_tensor_view) + pto.tstore ins(%newsum_tile : !pto.tile_buf) + outs(%newsum_part : !pto.partition_tensor_view) + pto.tstore ins(%expmax_tile : !pto.tile_buf) + outs(%expmax_part : !pto.partition_tensor_view) + pto.tstore ins(%out_tile : !pto.tile_buf) + outs(%out_part : !pto.partition_tensor_view) + } + pto.barrier #pto.pipe + return + } +} diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/st_common.py b/test/tilelang_st/npu/a5/src/st/smoke/testcase/st_common.py new file mode 100644 index 000000000..d0401b202 --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/st_common.py @@ -0,0 +1,143 @@ +#!/usr/bin/python3 +# Copyright (c) 2026 Huawei Technologies Co., Ltd. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. 
+# See LICENSE in the root of the software repository for the full text of the License.
+
+# coding=utf-8
+
+"""Shared utilities for TileLang ST test cases.
+
+Provides:
+  - Data helpers: setup_case_rng(), save_case_data()
+  - Compare: result_cmp()
+  - Styling: supports_color(), style_pass(), style_fail()
+"""
+
+import os
+import sys
+import zlib
+import numpy as np
+
+
+# ---------------------------------------------------------------------------
+# Case helpers
+# ---------------------------------------------------------------------------
+
+REQUIRED_CASE_KEYS = {"name", "dtype", "shape", "valid_shape", "eps"}
+
+
+def _to_shape_tuple(shape):
+    """Return *shape* as a tuple of positive ints; raise ValueError otherwise."""
+    if not isinstance(shape, (tuple, list)):
+        raise ValueError(f"shape must be tuple/list, got {type(shape).__name__}: {shape!r}")
+    if not shape:
+        raise ValueError("shape must not be empty")
+    dims = tuple(int(dim) for dim in shape)
+    if any(dim <= 0 for dim in dims):
+        raise ValueError(f"shape dimensions must be > 0, got {dims}")
+    return dims
+
+
+def _validate_shape_pair(shape, valid_shape, label):
+    """Normalize both shapes and check that valid_shape fits inside shape.
+
+    Returns the normalized (shape, valid_shape) tuples. Raises ValueError on a
+    rank mismatch or when any valid dimension exceeds the allocated one;
+    *label* only prefixes the error message.
+    """
+    shape = _to_shape_tuple(shape)
+    valid_shape = _to_shape_tuple(valid_shape)
+    if len(shape) != len(valid_shape):
+        raise ValueError(f"{label}: shape rank mismatch: {shape} vs {valid_shape}")
+    if any(valid_dim > dim for dim, valid_dim in zip(shape, valid_shape)):
+        raise ValueError(f"{label}: valid shape {valid_shape} exceeds shape {shape}")
+    return shape, valid_shape
+
+
+def validate_cases(cases):
+    """Check every case for required keys and consistent shapes.
+
+    Raises ValueError when a key from REQUIRED_CASE_KEYS is missing, when
+    shape/valid_shape are inconsistent, or when only one of the optional
+    dst_shape/dst_valid_shape keys is present.
+    """
+    for i, case in enumerate(cases):
+        missing = REQUIRED_CASE_KEYS - case.keys()
+        if missing:
+            raise ValueError(f"cases[{i}] ({case.get('name', '?')}) missing keys: {missing}")
+        _validate_shape_pair(case["shape"], case["valid_shape"], "shape")
+        has_dst_shape = "dst_shape" in case
+        has_dst_valid_shape = "dst_valid_shape" in case
+        if has_dst_shape != has_dst_valid_shape:
+            raise ValueError(
+                f"cases[{i}] ({case.get('name', '?')}) must define both dst_shape and dst_valid_shape"
+            )
+        if has_dst_shape:
+            _validate_shape_pair(case["dst_shape"], case["dst_valid_shape"], "dst")
+
+
+# ---------------------------------------------------------------------------
+# Data generation helpers
+# ---------------------------------------------------------------------------
+
+def setup_case_rng(case):
+    """Set a per-case deterministic random seed.
+
+    The seed is the CRC32 of the case name, so adding/reordering cases does
+    not change the random data of existing cases. The builtin hash() is not
+    used because str hashes are salted per process (PYTHONHASHSEED) and would
+    produce different data on every run.
+    """
+    np.random.seed(zlib.crc32(case["name"].encode("utf-8")) & 0xFFFFFFFF)
+
+
+def save_case_data(case_name, data_dict):
+    """Create case directory and write {name}.bin for each entry in data_dict.
+
+    Args:
+        case_name: subdirectory name (e.g. "f32_16x64").
+        data_dict: mapping from file stem to numpy array,
+                   e.g. {"input1": arr1, "input2": arr2, "golden": golden}.
+    """
+    os.makedirs(case_name, exist_ok=True)
+    for name, arr in data_dict.items():
+        arr.tofile(os.path.join(case_name, f"{name}.bin"))
+
+
+# ---------------------------------------------------------------------------
+# Terminal styling
+# ---------------------------------------------------------------------------
+
+ANSI_RESET = "\033[0m"
+ANSI_BOLD_GREEN = "\033[1;32m"
+ANSI_BOLD_RED = "\033[1;31m"
+
+
+def supports_color():
+    """Return True when stdout is a TTY and TERM is set to something other than "dumb"."""
+    return sys.stdout.isatty() and os.environ.get("TERM") not in (None, "", "dumb")
+
+
+def style_pass(text):
+    """Wrap *text* in bold green when color is supported, else return it unchanged."""
+    if not supports_color():
+        return text
+    return f"{ANSI_BOLD_GREEN}{text}{ANSI_RESET}"
+
+
+def style_fail(text):
+    """Wrap *text* in bold red when color is supported, else return it unchanged."""
+    if not supports_color():
+        return text
+    return f"{ANSI_BOLD_RED}{text}{ANSI_RESET}"
+
+
+# ---------------------------------------------------------------------------
+# Comparison
+# ---------------------------------------------------------------------------
+
+def result_cmp(golden, output, eps):
+    """Compare already prepared golden/output arrays.
+
+    The caller is responsible for loading, reshaping and slicing data.
+    Both arrays are compared in float64 with eps used as atol and rtol;
+    NaNs at the same position count as equal. Prints a diagnostic and
+    returns False on a shape or value mismatch, True otherwise.
+    """
+    g = np.asarray(golden).astype(np.float64, copy=False)
+    o = np.asarray(output).astype(np.float64, copy=False)
+
+    if g.shape != o.shape:
+        print(style_fail(f"[ERROR] Shape mismatch: golden {g.shape} vs output {o.shape}"))
+        return False
+    if not np.allclose(g, o, atol=eps, rtol=eps, equal_nan=True):
+        abs_diff = np.abs(g - o)
+        idx = int(np.argmax(abs_diff))
+        print(style_fail(f"[ERROR] Mismatch: max diff={float(abs_diff.flat[idx])} "
+                         f"at flat idx={idx} "
+                         f"(golden={g.flat[idx]}, output={o.flat[idx]})"))
+        return False
+    return True
diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tabs/CMakeLists.txt b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tabs/CMakeLists.txt
new file mode 100644
index 000000000..b776efb52
--- /dev/null
+++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tabs/CMakeLists.txt
@@ -0,0 +1,9 @@
+# Copyright (c) 2026 Huawei Technologies Co., Ltd.
+# This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+# CANN Open Software License Agreement Version 2.0 (the "License").
+# Please refer to the License for details. You may not use this file except in compliance with the License.
+# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+# See LICENSE in the root of the software repository for the full text of the License.
+
+pto_tilelang_vec_st(tabs)
diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tabs/cases.py b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tabs/cases.py
new file mode 100644
index 000000000..85016ec6a
--- /dev/null
+++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tabs/cases.py
@@ -0,0 +1,62 @@
+#!/usr/bin/python3
+# Copyright (c) 2026 Huawei Technologies Co., Ltd.
+# This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+# CANN Open Software License Agreement Version 2.0 (the "License").
+# Please refer to the License for details. You may not use this file except in compliance with the License.
+# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+# See LICENSE in the root of the software repository for the full text of the License.
+
+# coding=utf-8
+
+"""Single source of truth for tabs ST test cases.
+
+Each case defines:
+  - name: case identifier, used as subdirectory name and by main.cpp kCases[].
+  - dtype: numpy dtype (e.g. np.float32).
+  - shape: (rows, cols) — allocated tile dimensions.
+  - valid_shape: (valid_rows, valid_cols) — effective computation region.
+  - eps: tolerance for numpy.allclose (atol and rtol).
+
+gen_data.py and compare.py both import this list to avoid redundant definitions.
+
+Note: at import time CASES is narrowed to the names in _SMOKE_CASE_NAMES, so
+importers only ever see the smoke subset of the table below.
+"""
+
+import numpy as np
+
+# Full case table; filtered down to the smoke subset at the bottom of the file.
+CASES = [
+    {
+        "name": "f32_16x64",
+        "dtype": np.float32,
+        "shape": (16, 64),
+        "valid_shape": (16, 64),
+        "eps": 1e-6,
+    },
+    {
+        "name": "f32_32x32",
+        "dtype": np.float32,
+        "shape": (32, 32),
+        "valid_shape": (32, 32),
+        "eps": 1e-6,
+    },
+    {
+        "name": "f16_16x64",
+        "dtype": np.float16,
+        "shape": (16, 64),
+        "valid_shape": (16, 64),
+        "eps": 1e-3,
+    },
+    {
+        "name": "f16_32x32",
+        "dtype": np.float16,
+        "shape": (32, 32),
+        "valid_shape": (32, 32),
+        "eps": 1e-3,
+    },
+]
+
+# Smoke selection: fail fast on a name that is not in the table above, then
+# rebind CASES to the selected entries (table order is preserved).
+_SMOKE_CASE_NAMES = ['f32_16x64', 'f16_16x64']
+_SMOKE_CASE_NAME_SET = set(_SMOKE_CASE_NAMES)
+_missing = [name for name in _SMOKE_CASE_NAMES if name not in {case["name"] for case in CASES}]
+if _missing:
+    raise RuntimeError("unknown smoke case(s): " + ", ".join(_missing))
+CASES = [case for case in CASES if case["name"] in _SMOKE_CASE_NAME_SET]
diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tabs/compare.py b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tabs/compare.py
new file mode 100644
index 000000000..428604929
--- /dev/null
+++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tabs/compare.py
@@ -0,0 +1,49 @@
+#!/usr/bin/python3
+# Copyright (c) 2026 Huawei Technologies Co., Ltd.
+# This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+# CANN Open Software License Agreement Version 2.0 (the "License").
+# Please refer to the License for details. You may not use this file except in compliance with the License.
+# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+# See LICENSE in the root of the software repository for the full text of the License.
+
+# coding=utf-8
+
+import os
+import sys
+import numpy as np
+
+from cases import CASES
+from st_common import result_cmp, style_fail, style_pass, validate_cases
+
+
+def main():
+    """Compare output.bin against golden.bin for every (optionally filtered) case.
+
+    An optional first command-line argument restricts the run to the case with
+    that name. Exits with status 2 when any comparison fails or when the
+    filter matches no case, so a mistyped name cannot report a vacuous pass.
+    """
+    validate_cases(CASES)
+    case_filter = sys.argv[1] if len(sys.argv) > 1 else None
+
+    all_passed = True
+    matched = False
+    for case in CASES:
+        if case_filter is not None and case["name"] != case_filter:
+            continue
+        matched = True
+
+        case_dir = case["name"]
+        shape = case["shape"]
+        vr, vc = case["valid_shape"]
+
+        golden = np.fromfile(os.path.join(case_dir, "golden.bin"), dtype=case["dtype"]).reshape(shape)
+        output = np.fromfile(os.path.join(case_dir, "output.bin"), dtype=case["dtype"]).reshape(shape)
+
+        # Only the valid region is compared; padding outside it is ignored.
+        ok = result_cmp(golden[:vr, :vc], output[:vr, :vc], case["eps"])
+        if ok:
+            print(style_pass(f"[INFO] {case['name']}: compare passed"))
+        else:
+            print(style_fail(f"[ERROR] {case['name']}: compare failed"))
+            all_passed = False
+
+    if case_filter is not None and not matched:
+        print(style_fail(f"[ERROR] unknown case filter: {case_filter}"))
+        sys.exit(2)
+    if not all_passed:
+        sys.exit(2)
+    print(style_pass("[INFO] all cases passed"))
+
+
+if __name__ == "__main__":
+    main()
diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tabs/gen_data.py b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tabs/gen_data.py
new file mode 100644
index 000000000..22bf5d95d
--- /dev/null
+++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tabs/gen_data.py
@@ -0,0 +1,32 @@
+#!/usr/bin/python3
+# Copyright (c) 2026 Huawei Technologies Co., Ltd.
+# This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+# CANN Open Software License Agreement Version 2.0 (the "License").
+# Please refer to the License for details. You may not use this file except in compliance with the License.
+# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+# See LICENSE in the root of the software repository for the full text of the License.
+
+# coding=utf-8
+
+import numpy as np
+from cases import CASES
+from st_common import validate_cases, setup_case_rng, save_case_data
+
+validate_cases(CASES)
+
+# Generate input.bin and golden.bin for every case in its own subdirectory.
+for case in CASES:
+    setup_case_rng(case)
+
+    dtype = case["dtype"]
+    shape = case["shape"]
+    valid_shape = case["valid_shape"]
+
+    # Named src rather than input so the builtin input() is not shadowed.
+    src = np.random.randn(*shape).astype(dtype)
+
+    # Golden is |src| inside the valid region and zero elsewhere.
+    golden = np.zeros(shape, dtype=dtype)
+    vr, vc = valid_shape
+    golden[:vr, :vc] = np.abs(src[:vr, :vc]).astype(dtype, copy=False)
+
+    save_case_data(case["name"], {"input": src, "golden": golden})
+    # np.dtype(...).name also works when dtype is a dtype instance or string.
+    print(f"[INFO] gen_data: {case['name']} shape={shape} valid_shape={valid_shape} dtype={np.dtype(dtype).name}")
diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tabs/launch.cpp b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tabs/launch.cpp
new file mode 100644
index 000000000..73e4bd300
--- /dev/null
+++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tabs/launch.cpp
@@ -0,0 +1,28 @@
+// Copyright (c) 2026 Huawei Technologies Co., Ltd.
+// This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+// CANN Open Software License Agreement Version 2.0 (the "License").
+// Please refer to the License for details. You may not use this file except in compliance with the License.
+// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+// See LICENSE in the root of the software repository for the full text of the License.
+
+#include <cstdint>
+
+#ifndef AICORE
+#define AICORE [aicore]
+#endif
+
+// Device kernels, one per smoke case; f16 data is passed as raw uint16_t.
+
+extern "C" __global__ AICORE void TABS_f32_16x64(__gm__ float *a, __gm__ float *b);
+extern "C" __global__ AICORE void TABS_f16_16x64(__gm__ uint16_t *a, __gm__ uint16_t *b);
+
+// Host launch wrapper for the f32 16x64 kernel (block count 1 on `stream`).
+void LaunchTABS_f32_16x64(void *a, void *b, void *stream) {
+    TABS_f32_16x64<<<1, nullptr, stream>>>((__gm__ float *)a, (__gm__ float *)b);
+}
+
+// Host launch wrapper for the f16 16x64 kernel (block count 1 on `stream`).
+void LaunchTABS_f16_16x64(void *a, void *b, void *stream) {
+    TABS_f16_16x64<<<1, nullptr, stream>>>((__gm__ uint16_t *)a, (__gm__ uint16_t *)b);
+}
diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tabs/main.cpp b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tabs/main.cpp
new file mode 100644
index 000000000..17f9db23a
--- /dev/null
+++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tabs/main.cpp
@@ -0,0 +1,135 @@
+// Copyright (c) 2026 Huawei Technologies Co., Ltd.
+// This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+// CANN Open Software License Agreement Version 2.0 (the "License").
+// Please refer to the License for details. You may not use this file except in compliance with the License.
+// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+// See LICENSE in the root of the software repository for the full text of the License.
+
+// Host driver for TileLang tabs ST — case-table driven.
+// Each case launches a different kernel variant, reads/writes from per-case subdirectory.
+// Numerical comparison is done externally by compare.py.
+
+#include "acl/acl.h"
+#include "test_common.h"
+#include <cstddef>
+#include <cstdint>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include <string>
+
+using namespace PtoTestCommon;
+
+// Kernel launch wrappers (defined in launch.cpp)
+void LaunchTABS_f32_16x64(void *a, void *b, void *stream);
+void LaunchTABS_f32_32x32(void *a, void *b, void *stream);
+void LaunchTABS_f16_16x64(void *a, void *b, void *stream);
+void LaunchTABS_f16_32x32(void *a, void *b, void *stream);
+
+using LaunchFn = void (*)(void *, void *, void *);
+
+struct TestCase {
+    const char *name;
+    LaunchFn launch;
+    size_t rows;      // allocated tile rows
+    size_t cols;      // allocated tile cols
+    size_t validRows; // effective computation rows (<= rows)
+    size_t validCols; // effective computation cols (<= cols)
+    size_t elemSize;  // bytes per element
+};
+
+// Smoke subset; names must match the per-case data directories.
+static const TestCase kCases[] = {
+{"f32_16x64", LaunchTABS_f32_16x64, 16, 64, 16, 64, sizeof(float)},
+{"f16_16x64", LaunchTABS_f16_16x64, 16, 64, 16, 64, sizeof(uint16_t)},
+};
+static constexpr size_t kNumCases = sizeof(kCases) / sizeof(kCases[0]);
+
+// Runs one case: reads ./<name>/input.bin, launches the kernel, and writes
+// ./<name>/output.bin. Returns 0 on success and 1 on any allocation, I/O or
+// runtime failure. Buffers are released on every path.
+static int RunCase(const TestCase &tc, int deviceId, aclrtStream stream) {
+    (void)deviceId; // unused; kept so the signature stays unchanged
+    int rc = 0;
+    const size_t elemCount = tc.rows * tc.cols;
+    const size_t fileSize = elemCount * tc.elemSize;
+
+    std::printf("[INFO] === case: %s (shape=%zux%zu, valid=%zux%zu) ===\n",
+                tc.name, tc.rows, tc.cols, tc.validRows, tc.validCols);
+
+    // Per-case data directory
+    std::string caseDir = std::string("./") + tc.name;
+    size_t srcFileSize = fileSize;
+
+    void *srcHost = nullptr, *dstHost = nullptr;
+    void *srcDevice = nullptr, *dstDevice = nullptr;
+
+    // Stop at the first failed allocation so no null buffer is ever used.
+    if (aclrtMallocHost((void **)(&srcHost), fileSize) != ACL_SUCCESS ||
+        aclrtMallocHost((void **)(&dstHost), fileSize) != ACL_SUCCESS ||
+        aclrtMalloc((void **)&srcDevice, fileSize, ACL_MEM_MALLOC_HUGE_FIRST) != ACL_SUCCESS ||
+        aclrtMalloc((void **)&dstDevice, fileSize, ACL_MEM_MALLOC_HUGE_FIRST) != ACL_SUCCESS) {
+        std::fprintf(stderr, "[ERROR] failed to allocate buffers for %s\n", tc.name);
+        rc = 1;
+    }
+
+    if (rc == 0 && !ReadFile((caseDir + "/input.bin").c_str(), srcFileSize, srcHost, fileSize)) {
+        std::fprintf(stderr, "[ERROR] failed to read %s/input.bin\n", caseDir.c_str());
+        rc = 1;
+    }
+
+    if (rc == 0 &&
+        aclrtMemcpy(srcDevice, fileSize, srcHost, fileSize, ACL_MEMCPY_HOST_TO_DEVICE) != ACL_SUCCESS) {
+        std::fprintf(stderr, "[ERROR] failed to copy input to device for %s\n", tc.name);
+        rc = 1;
+    }
+
+    if (rc == 0) {
+        tc.launch(srcDevice, dstDevice, stream);
+
+        if (aclrtSynchronizeStream(stream) != ACL_SUCCESS ||
+            aclrtMemcpy(dstHost, fileSize, dstDevice, fileSize, ACL_MEMCPY_DEVICE_TO_HOST) != ACL_SUCCESS) {
+            std::fprintf(stderr, "[ERROR] kernel run or copy-back failed for %s\n", tc.name);
+            rc = 1;
+        }
+    }
+
+    if (rc == 0 && !WriteFile((caseDir + "/output.bin").c_str(), dstHost, fileSize)) {
+        std::fprintf(stderr, "[ERROR] failed to write %s/output.bin\n", caseDir.c_str());
+        rc = 1;
+    }
+
+    if (srcDevice != nullptr)
+        aclrtFree(srcDevice);
+    if (dstDevice != nullptr)
+        aclrtFree(dstDevice);
+    if (srcHost != nullptr)
+        aclrtFreeHost(srcHost);
+    if (dstHost != nullptr)
+        aclrtFreeHost(dstHost);
+
+    if (rc == 0)
+        std::printf("[INFO] case %s done\n", tc.name);
+    return rc;
+}
+
+// Entry point. Usage: ./tabs [case_name]; the device id comes from
+// ACL_DEVICE_ID (default 0). Returns 1 when setup fails, a case fails, or the
+// filter matches no case.
+int main(int argc, char *argv[]) {
+    // Optional case filter: ./tabs [case_name]
+    const char *caseFilter = (argc > 1) ? argv[1] : nullptr;
+    bool matchedCase = (caseFilter == nullptr);
+
+    int rc = 0;
+    int deviceId = 0;
+    aclrtStream stream = nullptr;
+
+    if (aclInit(nullptr) != ACL_SUCCESS) {
+        std::fprintf(stderr, "[ERROR] aclInit failed\n");
+        return 1;
+    }
+    if (const char *envDevice = std::getenv("ACL_DEVICE_ID")) {
+        deviceId = std::atoi(envDevice);
+    }
+    if (aclrtSetDevice(deviceId) != ACL_SUCCESS) {
+        std::fprintf(stderr, "[ERROR] aclrtSetDevice(%d) failed\n", deviceId);
+        aclFinalize();
+        return 1;
+    }
+    if (aclrtCreateStream(&stream) != ACL_SUCCESS) {
+        std::fprintf(stderr, "[ERROR] aclrtCreateStream failed\n");
+        aclrtResetDevice(deviceId);
+        aclFinalize();
+        return 1;
+    }
+
+    for (size_t i = 0; i < kNumCases; ++i) {
+        if (caseFilter != nullptr && std::strcmp(kCases[i].name, caseFilter) != 0) {
+            continue;
+        }
+        matchedCase = true;
+        int ret = RunCase(kCases[i], deviceId, stream);
+        if (ret != 0) {
+            std::fprintf(stderr, "[ERROR] case %s failed\n", kCases[i].name);
+            rc = 1;
+            break;
+        }
+    }
+
+    // A filter that selects nothing must not look like a successful run.
+    if (!matchedCase) {
+        std::fprintf(stderr, "[ERROR] unknown case filter: %s\n", caseFilter);
+        rc = 1;
+    }
+
+    if (stream != nullptr)
+        aclrtDestroyStream(stream);
+    aclrtResetDevice(deviceId);
+    aclFinalize();
+
+    return rc;
+}
diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tabs/tabs.pto b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tabs/tabs.pto
new file mode 100644
index 000000000..111c1eea4
--- /dev/null
+++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tabs/tabs.pto
@@ -0,0 +1,101 @@
+// Copyright (c) 2026 Huawei Technologies Co., Ltd.
+// This program is free software, you can redistribute it and/or modify it under the terms and conditions of +// CANN Open Software License Agreement Version 2.0 (the "License"). +// Please refer to the License for details. You may not use this file except in compliance with the License. +// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +// See LICENSE in the root of the software repository for the full text of the License. + +// TileLang ST kernels for pto.tabs: tload(a) + tabs(a)->b + tstore(b). +// Multiple cases with different shapes in a single module. +// Compiled by ptoas --enable-insert-sync --enable-tile-op-expand --pto-backend=vpto +// to produce a fatobj object. + +module attributes {pto.target_arch = "a5", pto.kernel_kind = #pto.kernel_kind} { + // Case 0: f32 16x64 (1024 elements) + func.func @TABS_f32_16x64(%a_ptr: !pto.ptr, %b_ptr: !pto.ptr) attributes {pto.kernel} { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c16 = arith.constant 16 : index + %c64 = arith.constant 64 : index + %c1024 = arith.constant 1024 : index + + %a_view = pto.make_tensor_view %a_ptr, + shape = [%c1, %c1, %c1, %c16, %c64], + strides = [%c1024, %c1024, %c1024, %c64, %c1] + : !pto.tensor_view<1x1x1x16x64xf32> + %b_view = pto.make_tensor_view %b_ptr, + shape = [%c1, %c1, %c1, %c16, %c64], + strides = [%c1024, %c1024, %c1024, %c64, %c1] + : !pto.tensor_view<1x1x1x16x64xf32> + + %a_part = pto.partition_view %a_view, + offsets = [%c0, %c0, %c0, %c0, %c0], + sizes = [%c1, %c1, %c1, %c16, %c64] + : !pto.tensor_view<1x1x1x16x64xf32> -> !pto.partition_tensor_view<1x1x1x16x64xf32> + %b_part = pto.partition_view %b_view, + offsets = [%c0, %c0, %c0, %c0, %c0], + sizes = [%c1, %c1, %c1, %c16, %c64] + : !pto.tensor_view<1x1x1x16x64xf32> -> !pto.partition_tensor_view<1x1x1x16x64xf32> + + %a = pto.alloc_tile 
+ : !pto.tile_buf + %b = pto.alloc_tile + : !pto.tile_buf + + pto.tload ins(%a_part : !pto.partition_tensor_view<1x1x1x16x64xf32>) + outs(%a : !pto.tile_buf) + + pto.tabs ins(%a : !pto.tile_buf) + outs(%b : !pto.tile_buf) + + pto.tstore ins(%b : !pto.tile_buf) + outs(%b_part : !pto.partition_tensor_view<1x1x1x16x64xf32>) + return + } + + // Case 1: f32 32x32 (1024 elements) + + func.func @TABS_f16_16x64(%a_ptr: !pto.ptr, %b_ptr: !pto.ptr) attributes {pto.kernel} { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c16 = arith.constant 16 : index + %c64 = arith.constant 64 : index + %c1024 = arith.constant 1024 : index + + %a_view = pto.make_tensor_view %a_ptr, + shape = [%c1, %c1, %c1, %c16, %c64], + strides = [%c1024, %c1024, %c1024, %c64, %c1] + : !pto.tensor_view<1x1x1x16x64xf16> + %b_view = pto.make_tensor_view %b_ptr, + shape = [%c1, %c1, %c1, %c16, %c64], + strides = [%c1024, %c1024, %c1024, %c64, %c1] + : !pto.tensor_view<1x1x1x16x64xf16> + + %a_part = pto.partition_view %a_view, + offsets = [%c0, %c0, %c0, %c0, %c0], + sizes = [%c1, %c1, %c1, %c16, %c64] + : !pto.tensor_view<1x1x1x16x64xf16> -> !pto.partition_tensor_view<1x1x1x16x64xf16> + %b_part = pto.partition_view %b_view, + offsets = [%c0, %c0, %c0, %c0, %c0], + sizes = [%c1, %c1, %c1, %c16, %c64] + : !pto.tensor_view<1x1x1x16x64xf16> -> !pto.partition_tensor_view<1x1x1x16x64xf16> + + %a = pto.alloc_tile + : !pto.tile_buf + %b = pto.alloc_tile + : !pto.tile_buf + + pto.tload ins(%a_part : !pto.partition_tensor_view<1x1x1x16x64xf16>) + outs(%a : !pto.tile_buf) + + pto.tabs ins(%a : !pto.tile_buf) + outs(%b : !pto.tile_buf) + + pto.tstore ins(%b : !pto.tile_buf) + outs(%b_part : !pto.partition_tensor_view<1x1x1x16x64xf16>) + return + } + + // Case 3: f16 32x32 (1024 elements) +} diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tadd/CMakeLists.txt b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tadd/CMakeLists.txt new file mode 100644 index 000000000..84928bcdb --- /dev/null 
# coding=utf-8

"""Single source of truth for tadd ST test cases.

Each case defines:
  - name: case identifier, used as subdirectory name and by main.cpp kCases[].
  - dtype: numpy dtype (e.g. np.float32).
  - shape: (rows, cols) — allocated tile dimensions.
  - valid_shape: (valid_rows, valid_cols) — effective computation region.
  - eps: tolerance for numpy.allclose (atol and rtol).

gen_data.py and compare.py both import this list to avoid redundant definitions.

After the table is defined, ``CASES`` is narrowed to the names listed in
``_SMOKE_CASE_NAMES``; importing this module raises ``RuntimeError`` if any of
those names is not defined in the table.
"""

import numpy as np

CASES = [
    {
        "name": "f32_16x64",
        "dtype": np.float32,
        "shape": (16, 64),
        "valid_shape": (16, 64),
        "eps": 1e-6,
    },
    {
        "name": "f32_32x32",
        "dtype": np.float32,
        "shape": (32, 32),
        "valid_shape": (32, 32),
        "eps": 1e-6,
    },
]

_SMOKE_CASE_NAMES = ['f32_16x64', 'f32_32x32']
_SMOKE_CASE_NAME_SET = set(_SMOKE_CASE_NAMES)
# Collect the defined names once, rather than rebuilding the set per smoke name.
_DEFINED_CASE_NAMES = {case["name"] for case in CASES}
_missing = [name for name in _SMOKE_CASE_NAMES if name not in _DEFINED_CASE_NAMES]
if _missing:
    raise RuntimeError("unknown smoke case(s): " + ", ".join(_missing))
# Keep the table order; only membership in the smoke set decides what stays.
CASES = [case for case in CASES if case["name"] in _SMOKE_CASE_NAME_SET]
# coding=utf-8

import os
import sys
import numpy as np

from cases import CASES
from st_common import result_cmp, style_fail, style_pass, validate_cases


def main():
    """Compare output.bin with golden.bin for every selected case.

    An optional first command-line argument restricts the run to the case with
    that exact name. Only the ``valid_shape`` region of each array is compared,
    using the case's ``eps``. Exits with status 2 if any compared case fails.
    """
    validate_cases(CASES)
    wanted = sys.argv[1] if len(sys.argv) > 1 else None

    failed = False
    for case in CASES:
        if wanted is not None and case["name"] != wanted:
            continue

        directory = case["name"]
        shape = case["shape"]
        valid_rows, valid_cols = case["valid_shape"]

        # golden.bin is loaded before output.bin; both are raw dumps of the case dtype.
        golden_path = os.path.join(directory, "golden.bin")
        golden = np.fromfile(golden_path, dtype=case["dtype"]).reshape(shape)
        output_path = os.path.join(directory, "output.bin")
        output = np.fromfile(output_path, dtype=case["dtype"]).reshape(shape)

        matched = result_cmp(
            golden[:valid_rows, :valid_cols],
            output[:valid_rows, :valid_cols],
            case["eps"],
        )
        if not matched:
            print(style_fail(f"[ERROR] {case['name']}: compare failed"))
            failed = True
            continue
        print(style_pass(f"[INFO] {case['name']}: compare passed"))

    if failed:
        sys.exit(2)
    print(style_pass("[INFO] all cases passed"))


if __name__ == "__main__":
    main()
# coding=utf-8

import numpy as np
from cases import CASES
from st_common import validate_cases, setup_case_rng, save_case_data

# Script: for every case, write two random inputs and their elementwise sum (golden).
validate_cases(CASES)

for case in CASES:
    # Presumably seeds the RNG per case so the data is reproducible — confirm in st_common.
    setup_case_rng(case)

    dtype = case["dtype"]
    shape = case["shape"]
    valid_shape = case["valid_shape"]
    rows, cols = valid_shape

    # Draw order matters for reproducibility: input1 first, then input2.
    input1 = np.random.randint(1, 10, size=shape).astype(dtype)
    input2 = np.random.randint(1, 10, size=shape).astype(dtype)

    # Everything outside the valid region stays zero in the golden output.
    golden = np.zeros(shape, dtype=dtype)
    valid_sum = input1[:rows, :cols] + input2[:rows, :cols]
    golden[:rows, :cols] = valid_sum.astype(dtype, copy=False)

    payload = {"input1": input1, "input2": input2, "golden": golden}
    save_case_data(case["name"], payload)
    print(f"[INFO] gen_data: {case['name']} shape={shape} valid_shape={valid_shape} dtype={dtype.__name__}")
+ +#include + +#ifndef AICORE +#define AICORE [aicore] +#endif + +// Case 0: f32 16x64 + +extern "C" __global__ AICORE void TADD_f32_16x64(__gm__ float *a, __gm__ float *b, __gm__ float *c); +extern "C" __global__ AICORE void TADD_f32_32x32(__gm__ float *a, __gm__ float *b, __gm__ float *c); + +void LaunchTADD_f32_32x32(float *a, float *b, float *c, void *stream) { + TADD_f32_32x32<<<1, nullptr, stream>>>((__gm__ float *)a, (__gm__ float *)b, (__gm__ float *)c); +} + + +void LaunchTADD_f32_16x64(float *a, float *b, float *c, void *stream) { + TADD_f32_16x64<<<1, nullptr, stream>>>((__gm__ float *)a, (__gm__ float *)b, (__gm__ float *)c); +} diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tadd/main.cpp b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tadd/main.cpp new file mode 100644 index 000000000..207924715 --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tadd/main.cpp @@ -0,0 +1,145 @@ +// Copyright (c) 2026 Huawei Technologies Co., Ltd. +// This program is free software, you can redistribute it and/or modify it under the terms and conditions of +// CANN Open Software License Agreement Version 2.0 (the "License"). +// Please refer to the License for details. You may not use this file except in compliance with the License. +// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +// See LICENSE in the root of the software repository for the full text of the License. + +// Host driver for TileLang tadd ST — case-table driven. +// Each case launches a different kernel variant, reads/writes from per-case subdirectory. +// Numerical comparison is done externally by compare.py. 
+ +#include "acl/acl.h" +#include "test_common.h" +#include +#include +#include +#include +#include +#include + +using namespace PtoTestCommon; + +// Kernel launch wrappers (defined in launch.cpp) +void LaunchTADD_f32_16x64(float *a, float *b, float *c, void *stream); +void LaunchTADD_f32_32x32(float *a, float *b, float *c, void *stream); + +using LaunchFn = void (*)(float *, float *, float *, void *); + +struct TestCase { + const char *name; + LaunchFn launch; + size_t rows; // allocated tile rows + size_t cols; // allocated tile cols + size_t validRows; // effective computation rows (<= rows) + size_t validCols; // effective computation cols (<= cols) + size_t elemSize; // bytes per element +}; + +static const TestCase kCases[] = { +{"f32_16x64", LaunchTADD_f32_16x64, 16, 64, 16, 64, sizeof(float)}, +{"f32_32x32", LaunchTADD_f32_32x32, 32, 32, 32, 32, sizeof(float)}, +}; +static constexpr size_t kNumCases = sizeof(kCases) / sizeof(kCases[0]); + +static int RunCase(const TestCase &tc, int deviceId, aclrtStream stream) { + int rc = 0; + const size_t elemCount = tc.rows * tc.cols; + const size_t fileSize = elemCount * tc.elemSize; + + std::printf("[INFO] === case: %s (shape=%zux%zu, valid=%zux%zu) ===\n", + tc.name, tc.rows, tc.cols, tc.validRows, tc.validCols); + + // Per-case data directory + std::string caseDir = std::string("./") + tc.name; + size_t src0FileSize = fileSize; + size_t src1FileSize = fileSize; + + float *src0Host = nullptr, *src1Host = nullptr, *dstHost = nullptr; + float *src0Device = nullptr, *src1Device = nullptr, *dstDevice = nullptr; + + aclrtMallocHost((void **)(&src0Host), fileSize); + aclrtMallocHost((void **)(&src1Host), fileSize); + aclrtMallocHost((void **)(&dstHost), fileSize); + + aclrtMalloc((void **)&src0Device, fileSize, ACL_MEM_MALLOC_HUGE_FIRST); + aclrtMalloc((void **)&src1Device, fileSize, ACL_MEM_MALLOC_HUGE_FIRST); + aclrtMalloc((void **)&dstDevice, fileSize, ACL_MEM_MALLOC_HUGE_FIRST); + + if (!ReadFile((caseDir + 
"/input1.bin").c_str(), src0FileSize, src0Host, fileSize)) { + std::fprintf(stderr, "[ERROR] failed to read %s/input1.bin\n", caseDir.c_str()); + rc = 1; + } + if (rc == 0 && !ReadFile((caseDir + "/input2.bin").c_str(), src1FileSize, src1Host, fileSize)) { + std::fprintf(stderr, "[ERROR] failed to read %s/input2.bin\n", caseDir.c_str()); + rc = 1; + } + + if (rc == 0) { + aclrtMemcpy(src0Device, fileSize, src0Host, fileSize, ACL_MEMCPY_HOST_TO_DEVICE); + aclrtMemcpy(src1Device, fileSize, src1Host, fileSize, ACL_MEMCPY_HOST_TO_DEVICE); + + tc.launch(src0Device, src1Device, dstDevice, stream); + + aclrtSynchronizeStream(stream); + aclrtMemcpy(dstHost, fileSize, dstDevice, fileSize, ACL_MEMCPY_DEVICE_TO_HOST); + } + + if (rc == 0 && !WriteFile((caseDir + "/output.bin").c_str(), dstHost, fileSize)) { + std::fprintf(stderr, "[ERROR] failed to write %s/output.bin\n", caseDir.c_str()); + rc = 1; + } + + if (src0Device != nullptr) + aclrtFree(src0Device); + if (src1Device != nullptr) + aclrtFree(src1Device); + if (dstDevice != nullptr) + aclrtFree(dstDevice); + if (src0Host != nullptr) + aclrtFreeHost(src0Host); + if (src1Host != nullptr) + aclrtFreeHost(src1Host); + if (dstHost != nullptr) + aclrtFreeHost(dstHost); + + if (rc == 0) + std::printf("[INFO] case %s done\n", tc.name); + return rc; +} + +int main(int argc, char *argv[]) { + // Optional case filter: ./tadd [case_name] + const char *caseFilter = (argc > 1) ? 
argv[1] : nullptr; + + int rc = 0; + int deviceId = 0; + aclrtStream stream = nullptr; + + aclInit(nullptr); + if (const char *envDevice = std::getenv("ACL_DEVICE_ID")) { + deviceId = std::atoi(envDevice); + } + aclrtSetDevice(deviceId); + aclrtCreateStream(&stream); + + for (size_t i = 0; i < kNumCases; ++i) { + if (caseFilter != nullptr && std::strcmp(kCases[i].name, caseFilter) != 0) { + continue; + } + int ret = RunCase(kCases[i], deviceId, stream); + if (ret != 0) { + std::fprintf(stderr, "[ERROR] case %s failed\n", kCases[i].name); + rc = 1; + break; + } + } + + if (stream != nullptr) + aclrtDestroyStream(stream); + aclrtResetDevice(deviceId); + aclFinalize(); + + return rc; +} diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tadd/tadd.pto b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tadd/tadd.pto new file mode 100644 index 000000000..d433d34c9 --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tadd/tadd.pto @@ -0,0 +1,124 @@ +// Copyright (c) 2026 Huawei Technologies Co., Ltd. +// This program is free software, you can redistribute it and/or modify it under the terms and conditions of +// CANN Open Software License Agreement Version 2.0 (the "License"). +// Please refer to the License for details. You may not use this file except in compliance with the License. +// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +// See LICENSE in the root of the software repository for the full text of the License. + +// TileLang ST kernels for pto.tadd: tload(a) + tload(b) + tadd(a,b)->c + tstore(c). +// Multiple cases with different shapes in a single module. +// Compiled by ptoas --enable-insert-sync --enable-tile-op-expand --pto-backend=vpto +// to produce a fatobj object. 
+ +module attributes {pto.target_arch = "a5", pto.kernel_kind = #pto.kernel_kind} { + // Case 0: f32 16x64 (1024 elements) + func.func @TADD_f32_16x64(%a_ptr: !pto.ptr, %b_ptr: !pto.ptr, %c_ptr: !pto.ptr) attributes {pto.kernel} { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c16 = arith.constant 16 : index + %c64 = arith.constant 64 : index + %c1024 = arith.constant 1024 : index + + %a_view = pto.make_tensor_view %a_ptr, + shape = [%c1, %c1, %c1, %c16, %c64], + strides = [%c1024, %c1024, %c1024, %c64, %c1] + : !pto.tensor_view<1x1x1x16x64xf32> + %b_view = pto.make_tensor_view %b_ptr, + shape = [%c1, %c1, %c1, %c16, %c64], + strides = [%c1024, %c1024, %c1024, %c64, %c1] + : !pto.tensor_view<1x1x1x16x64xf32> + %c_view = pto.make_tensor_view %c_ptr, + shape = [%c1, %c1, %c1, %c16, %c64], + strides = [%c1024, %c1024, %c1024, %c64, %c1] + : !pto.tensor_view<1x1x1x16x64xf32> + + %a_part = pto.partition_view %a_view, + offsets = [%c0, %c0, %c0, %c0, %c0], + sizes = [%c1, %c1, %c1, %c16, %c64] + : !pto.tensor_view<1x1x1x16x64xf32> -> !pto.partition_tensor_view<1x1x1x16x64xf32> + %b_part = pto.partition_view %b_view, + offsets = [%c0, %c0, %c0, %c0, %c0], + sizes = [%c1, %c1, %c1, %c16, %c64] + : !pto.tensor_view<1x1x1x16x64xf32> -> !pto.partition_tensor_view<1x1x1x16x64xf32> + %c_part = pto.partition_view %c_view, + offsets = [%c0, %c0, %c0, %c0, %c0], + sizes = [%c1, %c1, %c1, %c16, %c64] + : !pto.tensor_view<1x1x1x16x64xf32> -> !pto.partition_tensor_view<1x1x1x16x64xf32> + + %a = pto.alloc_tile + : !pto.tile_buf + %b = pto.alloc_tile + : !pto.tile_buf + %c = pto.alloc_tile + : !pto.tile_buf + + pto.tload ins(%a_part : !pto.partition_tensor_view<1x1x1x16x64xf32>) + outs(%a : !pto.tile_buf) + pto.tload ins(%b_part : !pto.partition_tensor_view<1x1x1x16x64xf32>) + outs(%b : !pto.tile_buf) + + pto.tadd ins(%a, %b : !pto.tile_buf, + !pto.tile_buf) + outs(%c : !pto.tile_buf) + + pto.tstore ins(%c : !pto.tile_buf) + outs(%c_part : 
!pto.partition_tensor_view<1x1x1x16x64xf32>) + return + } + + // Case 1: f32 32x32 (1024 elements) + + func.func @TADD_f32_32x32(%a_ptr: !pto.ptr, %b_ptr: !pto.ptr, %c_ptr: !pto.ptr) attributes {pto.kernel} { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c32 = arith.constant 32 : index + %c1024 = arith.constant 1024 : index + + %a_view = pto.make_tensor_view %a_ptr, + shape = [%c1, %c1, %c1, %c32, %c32], + strides = [%c1024, %c1024, %c1024, %c32, %c1] + : !pto.tensor_view<1x1x1x32x32xf32> + %b_view = pto.make_tensor_view %b_ptr, + shape = [%c1, %c1, %c1, %c32, %c32], + strides = [%c1024, %c1024, %c1024, %c32, %c1] + : !pto.tensor_view<1x1x1x32x32xf32> + %c_view = pto.make_tensor_view %c_ptr, + shape = [%c1, %c1, %c1, %c32, %c32], + strides = [%c1024, %c1024, %c1024, %c32, %c1] + : !pto.tensor_view<1x1x1x32x32xf32> + + %a_part = pto.partition_view %a_view, + offsets = [%c0, %c0, %c0, %c0, %c0], + sizes = [%c1, %c1, %c1, %c32, %c32] + : !pto.tensor_view<1x1x1x32x32xf32> -> !pto.partition_tensor_view<1x1x1x32x32xf32> + %b_part = pto.partition_view %b_view, + offsets = [%c0, %c0, %c0, %c0, %c0], + sizes = [%c1, %c1, %c1, %c32, %c32] + : !pto.tensor_view<1x1x1x32x32xf32> -> !pto.partition_tensor_view<1x1x1x32x32xf32> + %c_part = pto.partition_view %c_view, + offsets = [%c0, %c0, %c0, %c0, %c0], + sizes = [%c1, %c1, %c1, %c32, %c32] + : !pto.tensor_view<1x1x1x32x32xf32> -> !pto.partition_tensor_view<1x1x1x32x32xf32> + + %a = pto.alloc_tile + : !pto.tile_buf + %b = pto.alloc_tile + : !pto.tile_buf + %c = pto.alloc_tile + : !pto.tile_buf + + pto.tload ins(%a_part : !pto.partition_tensor_view<1x1x1x32x32xf32>) + outs(%a : !pto.tile_buf) + pto.tload ins(%b_part : !pto.partition_tensor_view<1x1x1x32x32xf32>) + outs(%b : !pto.tile_buf) + + pto.tadd ins(%a, %b : !pto.tile_buf, + !pto.tile_buf) + outs(%c : !pto.tile_buf) + + pto.tstore ins(%c : !pto.tile_buf) + outs(%c_part : !pto.partition_tensor_view<1x1x1x32x32xf32>) + return + } +} diff --git 
# coding=utf-8

"""Single source of truth for tadds ST test cases.

Shapes and dtypes match testcase/tadds (C++ GTest suite):
  case1: float,   32x64,  valid 32x64
  case2: float16, 63x64,  valid 63x64
  case3: int32,   31x128, valid 31x128
  case4: int16,   15x192, valid 15x192
  case5: float,   7x448,  valid 7x448
  case6: float,   256x16, valid 256x16

gen_data.py and compare.py both import this list to avoid redundant definitions.

After the table is defined, ``CASES`` is narrowed to the names listed in
``_SMOKE_CASE_NAMES``; importing this module raises ``RuntimeError`` if any of
those names is not defined in the table.
"""

import numpy as np

CASES = [
    {
        "name": "f32_32x64",
        "dtype": np.float32,
        "shape": (32, 64),
        "valid_shape": (32, 64),
        "eps": 1e-6,
    },
    {
        "name": "f16_63x64",
        "dtype": np.float16,
        "shape": (63, 64),
        "valid_shape": (63, 64),
        "eps": 1e-3,
    },
    {
        "name": "i32_31x128",
        "dtype": np.int32,
        "shape": (31, 128),
        "valid_shape": (31, 128),
        "eps": 0,
    },
    {
        "name": "i16_15x192",
        "dtype": np.int16,
        "shape": (15, 192),
        "valid_shape": (15, 192),
        "eps": 0,
    },
    {
        "name": "f32_7x448",
        "dtype": np.float32,
        "shape": (7, 448),
        "valid_shape": (7, 448),
        "eps": 1e-6,
    },
    {
        "name": "f32_256x16",
        "dtype": np.float32,
        "shape": (256, 16),
        "valid_shape": (256, 16),
        "eps": 1e-6,
    },
]

_SMOKE_CASE_NAMES = ['f32_32x64', 'i16_15x192']
_SMOKE_CASE_NAME_SET = set(_SMOKE_CASE_NAMES)
# Collect the defined names once, rather than rebuilding the set per smoke name.
_DEFINED_CASE_NAMES = {case["name"] for case in CASES}
_missing = [name for name in _SMOKE_CASE_NAMES if name not in _DEFINED_CASE_NAMES]
if _missing:
    raise RuntimeError("unknown smoke case(s): " + ", ".join(_missing))
# Keep the table order; only membership in the smoke set decides what stays.
CASES = [case for case in CASES if case["name"] in _SMOKE_CASE_NAME_SET]
# coding=utf-8

import os
import sys
import numpy as np

from cases import CASES
from st_common import result_cmp, style_fail, style_pass, validate_cases


def main():
    """Check each selected case's output.bin against its golden.bin.

    If a case name is passed as the first command-line argument, only that case
    is compared. The comparison covers the ``valid_shape`` region only and uses
    the case's ``eps``. The process exits with status 2 when any case fails.
    """
    validate_cases(CASES)
    only = sys.argv[1] if len(sys.argv) > 1 else None

    any_failure = False
    for case in CASES:
        if not (only is None or case["name"] == only):
            continue

        folder = case["name"]
        shape = case["shape"]
        n_rows, n_cols = case["valid_shape"]

        # Load order is golden.bin first, then output.bin.
        expected = np.fromfile(os.path.join(folder, "golden.bin"), dtype=case["dtype"]).reshape(shape)
        actual = np.fromfile(os.path.join(folder, "output.bin"), dtype=case["dtype"]).reshape(shape)

        if result_cmp(expected[:n_rows, :n_cols], actual[:n_rows, :n_cols], case["eps"]):
            print(style_pass(f"[INFO] {case['name']}: compare passed"))
        else:
            print(style_fail(f"[ERROR] {case['name']}: compare failed"))
            any_failure = True

    if any_failure:
        sys.exit(2)
    print(style_pass("[INFO] all cases passed"))


if __name__ == "__main__":
    main()
# coding=utf-8

import numpy as np
from cases import CASES
from st_common import validate_cases, setup_case_rng, save_case_data

# Scalar value added to every element (matches the scalar passed in launch.cpp)
SCALAR = 3.0

# Script: for every case, write one random input and input + SCALAR (golden).
validate_cases(CASES)

for case in CASES:
    # Presumably seeds the RNG per case so the data is reproducible — confirm in st_common.
    setup_case_rng(case)

    dtype = case["dtype"]
    shape = case["shape"]
    valid_shape = case["valid_shape"]
    rows, cols = valid_shape

    input1 = np.random.randint(1, 10, size=shape).astype(dtype)

    # Convert the scalar to the case dtype first so the addition stays in that dtype.
    scalar_val = dtype(SCALAR)
    shifted = input1[:rows, :cols] + scalar_val

    # Everything outside the valid region stays zero in the golden output.
    golden = np.zeros(shape, dtype=dtype)
    golden[:rows, :cols] = shifted.astype(dtype, copy=False)

    payload = {"input1": input1, "golden": golden}
    save_case_data(case["name"], payload)
    print(f"[INFO] gen_data: {case['name']} shape={shape} valid_shape={valid_shape} dtype={dtype.__name__} scalar={SCALAR}")
+// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +// See LICENSE in the root of the software repository for the full text of the License. + +#include + +#ifndef AICORE +#define AICORE [aicore] +#endif + +// Scalar value added to every element (must match gen_data.py SCALAR) +static constexpr float TADDS_SCALAR_F32 = 3.0f; + +// Case 0: f32 32x64 + +extern "C" __global__ AICORE void TADDS_f32_32x64(__gm__ float *src, __gm__ float *dst, float scalar); +extern "C" __global__ AICORE void TADDS_i16_15x192(__gm__ int16_t *src, __gm__ int16_t *dst, int16_t scalar); + +void LaunchTADDS_i16_15x192(int16_t *src, int16_t *dst, void *stream) { + TADDS_i16_15x192<<<1, nullptr, stream>>>((__gm__ int16_t *)src, (__gm__ int16_t *)dst, (int16_t)3); +} + + + +void LaunchTADDS_f32_32x64(float *src, float *dst, void *stream) { + TADDS_f32_32x64<<<1, nullptr, stream>>>((__gm__ float *)src, (__gm__ float *)dst, TADDS_SCALAR_F32); +} diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tadds/main.cpp b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tadds/main.cpp new file mode 100644 index 000000000..fcfae5548 --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tadds/main.cpp @@ -0,0 +1,133 @@ +// Copyright (c) 2026 Huawei Technologies Co., Ltd. +// This program is free software, you can redistribute it and/or modify it under the terms and conditions of +// CANN Open Software License Agreement Version 2.0 (the "License"). +// Please refer to the License for details. You may not use this file except in compliance with the License. +// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. 
+// See LICENSE in the root of the software repository for the full text of the License. + +// Host driver for TileLang tadds ST — case-table driven. +// tadds: dst = src + scalar (single input + scalar, unlike tadd which has two inputs). +// Numerical comparison is done externally by compare.py. + +#include "acl/acl.h" +#include "test_common.h" +#include +#include +#include +#include +#include +#include + +using namespace PtoTestCommon; + +// Kernel launch wrappers (defined in launch.cpp) +void LaunchTADDS_f32_32x64(float *src, float *dst, void *stream); +void LaunchTADDS_f16_63x64(uint16_t *src, uint16_t *dst, void *stream); +void LaunchTADDS_i16_15x192(int16_t *src, int16_t *dst, void *stream); +void LaunchTADDS_f32_256x16(float *src, float *dst, void *stream); + +struct TestCase { + const char *name; + void (*launch)(void *, void *, void *); // src, dst, stream + size_t rows; // allocated tile rows + size_t cols; // allocated tile cols + size_t validRows; // effective computation rows (<= rows) + size_t validCols; // effective computation cols (<= cols) + size_t elemSize; // bytes per element +}; + +static const TestCase kCases[] = { +{"f32_32x64", (void (*)(void*,void*,void*))LaunchTADDS_f32_32x64, 32, 64, 32, 64, sizeof(float)}, +{"i16_15x192", (void (*)(void*,void*,void*))LaunchTADDS_i16_15x192, 15, 192, 15, 192, sizeof(int16_t)}, +}; +static constexpr size_t kNumCases = sizeof(kCases) / sizeof(kCases[0]); + +static int RunCase(const TestCase &tc, int deviceId, aclrtStream stream) { + int rc = 0; + const size_t elemCount = tc.rows * tc.cols; + const size_t fileSize = elemCount * tc.elemSize; + + std::printf("[INFO] === case: %s (shape=%zux%zu, valid=%zux%zu) ===\n", + tc.name, tc.rows, tc.cols, tc.validRows, tc.validCols); + + // Per-case data directory + std::string caseDir = std::string("./") + tc.name; + size_t srcFileSize = fileSize; + + void *srcHost = nullptr, *dstHost = nullptr; + void *srcDevice = nullptr, *dstDevice = nullptr; + + 
aclrtMallocHost(&srcHost, fileSize); + aclrtMallocHost(&dstHost, fileSize); + + aclrtMalloc(&srcDevice, fileSize, ACL_MEM_MALLOC_HUGE_FIRST); + aclrtMalloc(&dstDevice, fileSize, ACL_MEM_MALLOC_HUGE_FIRST); + + if (!ReadFile((caseDir + "/input1.bin").c_str(), srcFileSize, srcHost, fileSize)) { + std::fprintf(stderr, "[ERROR] failed to read %s/input1.bin\n", caseDir.c_str()); + rc = 1; + } + + if (rc == 0) { + aclrtMemcpy(srcDevice, fileSize, srcHost, fileSize, ACL_MEMCPY_HOST_TO_DEVICE); + + tc.launch(srcDevice, dstDevice, stream); + + aclrtSynchronizeStream(stream); + aclrtMemcpy(dstHost, fileSize, dstDevice, fileSize, ACL_MEMCPY_DEVICE_TO_HOST); + } + + if (rc == 0 && !WriteFile((caseDir + "/output.bin").c_str(), dstHost, fileSize)) { + std::fprintf(stderr, "[ERROR] failed to write %s/output.bin\n", caseDir.c_str()); + rc = 1; + } + + if (srcDevice != nullptr) + aclrtFree(srcDevice); + if (dstDevice != nullptr) + aclrtFree(dstDevice); + if (srcHost != nullptr) + aclrtFreeHost(srcHost); + if (dstHost != nullptr) + aclrtFreeHost(dstHost); + + if (rc == 0) + std::printf("[INFO] case %s done\n", tc.name); + return rc; +} + +int main(int argc, char *argv[]) { + // Optional case filter: ./tadds [case_name] + const char *caseFilter = (argc > 1) ? 
argv[1] : nullptr; + + int rc = 0; + int deviceId = 0; + aclrtStream stream = nullptr; + + aclInit(nullptr); + if (const char *envDevice = std::getenv("ACL_DEVICE_ID")) { + deviceId = std::atoi(envDevice); + } + aclrtSetDevice(deviceId); + aclrtCreateStream(&stream); + + for (size_t i = 0; i < kNumCases; ++i) { + if (caseFilter != nullptr && std::strcmp(kCases[i].name, caseFilter) != 0) { + continue; + } + int ret = RunCase(kCases[i], deviceId, stream); + if (ret != 0) { + std::fprintf(stderr, "[ERROR] case %s failed\n", kCases[i].name); + rc = 1; + break; + } + } + + if (stream != nullptr) + aclrtDestroyStream(stream); + aclrtResetDevice(deviceId); + aclFinalize(); + + return rc; +} diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tadds/tadds.pto b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tadds/tadds.pto new file mode 100644 index 000000000..f397670e9 --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tadds/tadds.pto @@ -0,0 +1,98 @@ +// Copyright (c) 2026 Huawei Technologies Co., Ltd. +// This program is free software, you can redistribute it and/or modify it under the terms and conditions of +// CANN Open Software License Agreement Version 2.0 (the "License"). +// Please refer to the License for details. You may not use this file except in compliance with the License. +// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +// See LICENSE in the root of the software repository for the full text of the License. + +// TileLang ST kernels for pto.tadds: tload(src) + tadds(src, scalar)->dst + tstore(dst). +// Multiple cases with different shapes/dtypes in a single module. +// Compiled by ptoas --enable-insert-sync --enable-tile-op-expand --pto-backend=vpto +// to produce a fatobj object.
+ +module attributes {pto.target_arch = "a5", pto.kernel_kind = #pto.kernel_kind} { + + // Case 0: f32 32x64 (2048 elements) + func.func @TADDS_f32_32x64(%src_ptr: !pto.ptr, %dst_ptr: !pto.ptr, %scalar: f32) attributes {pto.kernel} { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c32 = arith.constant 32 : index + %c64 = arith.constant 64 : index + %c2048 = arith.constant 2048 : index + + %src_view = pto.make_tensor_view %src_ptr, + shape = [%c1, %c1, %c1, %c32, %c64], + strides = [%c2048, %c2048, %c2048, %c64, %c1] + : !pto.tensor_view<1x1x1x32x64xf32> + %dst_view = pto.make_tensor_view %dst_ptr, + shape = [%c1, %c1, %c1, %c32, %c64], + strides = [%c2048, %c2048, %c2048, %c64, %c1] + : !pto.tensor_view<1x1x1x32x64xf32> + + %src_part = pto.partition_view %src_view, + offsets = [%c0, %c0, %c0, %c0, %c0], + sizes = [%c1, %c1, %c1, %c32, %c64] + : !pto.tensor_view<1x1x1x32x64xf32> -> !pto.partition_tensor_view<1x1x1x32x64xf32> + %dst_part = pto.partition_view %dst_view, + offsets = [%c0, %c0, %c0, %c0, %c0], + sizes = [%c1, %c1, %c1, %c32, %c64] + : !pto.tensor_view<1x1x1x32x64xf32> -> !pto.partition_tensor_view<1x1x1x32x64xf32> + + %src = pto.alloc_tile + : !pto.tile_buf + %dst = pto.alloc_tile + : !pto.tile_buf + + pto.tload ins(%src_part : !pto.partition_tensor_view<1x1x1x32x64xf32>) + outs(%src : !pto.tile_buf) + pto.tadds ins(%src, %scalar : !pto.tile_buf, f32) + outs(%dst : !pto.tile_buf) + pto.tstore ins(%dst : !pto.tile_buf) + outs(%dst_part : !pto.partition_tensor_view<1x1x1x32x64xf32>) + return + } + + // Case 1: i16 15x192 (2880 elements) + + func.func @TADDS_i16_15x192(%src_ptr: !pto.ptr, %dst_ptr: !pto.ptr, %scalar: i16) attributes {pto.kernel} { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c15 = arith.constant 15 : index + %c192 = arith.constant 192 : index + %c2880 = arith.constant 2880 : index + + %src_view = pto.make_tensor_view %src_ptr, + shape = [%c1, %c1, %c1, %c15, %c192], + strides = [%c2880, %c2880,
%c2880, %c192, %c1] + : !pto.tensor_view<1x1x1x15x192xi16> + %dst_view = pto.make_tensor_view %dst_ptr, + shape = [%c1, %c1, %c1, %c15, %c192], + strides = [%c2880, %c2880, %c2880, %c192, %c1] + : !pto.tensor_view<1x1x1x15x192xi16> + + %src_part = pto.partition_view %src_view, + offsets = [%c0, %c0, %c0, %c0, %c0], + sizes = [%c1, %c1, %c1, %c15, %c192] + : !pto.tensor_view<1x1x1x15x192xi16> -> !pto.partition_tensor_view<1x1x1x15x192xi16> + %dst_part = pto.partition_view %dst_view, + offsets = [%c0, %c0, %c0, %c0, %c0], + sizes = [%c1, %c1, %c1, %c15, %c192] + : !pto.tensor_view<1x1x1x15x192xi16> -> !pto.partition_tensor_view<1x1x1x15x192xi16> + + %src = pto.alloc_tile + : !pto.tile_buf + %dst = pto.alloc_tile + : !pto.tile_buf + + pto.tload ins(%src_part : !pto.partition_tensor_view<1x1x1x15x192xi16>) + outs(%src : !pto.tile_buf) + pto.tadds ins(%src, %scalar : !pto.tile_buf, i16) + outs(%dst : !pto.tile_buf) + pto.tstore ins(%dst : !pto.tile_buf) + outs(%dst_part : !pto.partition_tensor_view<1x1x1x15x192xi16>) + return + } + + // Case 4: f32 7x448 (3136 elements) +} diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tand/CMakeLists.txt b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tand/CMakeLists.txt new file mode 100644 index 000000000..3e73e2e9c --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tand/CMakeLists.txt @@ -0,0 +1,9 @@ +# Copyright (c) 2026 Huawei Technologies Co., Ltd. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. 
+# See LICENSE in the root of the software repository for the full text of the License. + +pto_tilelang_vec_st(tand) diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tand/cases.py b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tand/cases.py new file mode 100644 index 000000000..9c8d699c9 --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tand/cases.py @@ -0,0 +1,47 @@ +# Copyright (c) 2026 Huawei Technologies Co., Ltd. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. + +# coding=utf-8 + +"""Single source of truth for tand ST test cases. + +Each case defines: + - name: case identifier, used as subdirectory name and by main.cpp kCases[]. + - dtype: numpy dtype (e.g. np.int32). + - shape: (rows, cols) — allocated tile dimensions. + - valid_shape: (valid_rows, valid_cols) — effective computation region. + - eps: tolerance for numpy.allclose (atol and rtol). + +gen_data.py and compare.py both import this list to avoid redundant definitions. 
+""" + +import numpy as np + +CASES = [ + { + "name": "i32_16x64", + "dtype": np.int32, + "shape": (16, 64), + "valid_shape": (16, 64), + "eps": 0, + }, + { + "name": "i32_32x32", + "dtype": np.int32, + "shape": (32, 32), + "valid_shape": (32, 32), + "eps": 0, + }, +] + +_SMOKE_CASE_NAMES = ['i32_16x64', 'i32_32x32'] +_SMOKE_CASE_NAME_SET = set(_SMOKE_CASE_NAMES) +_missing = [name for name in _SMOKE_CASE_NAMES if name not in {case["name"] for case in CASES}] +if _missing: + raise RuntimeError("unknown smoke case(s): " + ", ".join(_missing)) +CASES = [case for case in CASES if case["name"] in _SMOKE_CASE_NAME_SET] diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tand/compare.py b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tand/compare.py new file mode 100644 index 000000000..6a4d5d1aa --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tand/compare.py @@ -0,0 +1,48 @@ +# Copyright (c) 2026 Huawei Technologies Co., Ltd. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. 
+
+# coding=utf-8
+
+import os
+import sys
+import numpy as np
+
+from cases import CASES
+from st_common import result_cmp, style_fail, style_pass, validate_cases
+
+
+def main():
+    """Compare output.bin with golden.bin per selected case; exit 2 on any failure or unknown filter."""
+    validate_cases(CASES)
+    case_filter = sys.argv[1] if len(sys.argv) > 1 else None
+    selected = [case for case in CASES if case_filter is None or case["name"] == case_filter]
+    if case_filter is not None and not selected:  # a mistyped filter must not report "all cases passed"
+        print(style_fail(f"[ERROR] unknown case filter: {case_filter}"))
+        sys.exit(2)
+
+    all_passed = True
+    for case in selected:
+        case_dir = case["name"]
+        shape = case["shape"]
+        vr, vc = case["valid_shape"]  # only this valid region of the tile is compared
+        golden = np.fromfile(os.path.join(case_dir, "golden.bin"), dtype=case["dtype"]).reshape(shape)
+        output = np.fromfile(os.path.join(case_dir, "output.bin"), dtype=case["dtype"]).reshape(shape)
+        ok = result_cmp(golden[:vr, :vc], output[:vr, :vc], case["eps"])
+        if ok:
+            print(style_pass(f"[INFO] {case['name']}: compare passed"))
+        else:
+            print(style_fail(f"[ERROR] {case['name']}: compare failed"))
+            all_passed = False
+
+    if not all_passed:
+        sys.exit(2)
+    print(style_pass("[INFO] all cases passed"))
+
+
+if __name__ == "__main__":
+    main()
diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tand/gen_data.py b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tand/gen_data.py
new file mode 100644
index 000000000..71b866734
--- /dev/null
+++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tand/gen_data.py
@@ -0,0 +1,32 @@
+# Copyright (c) 2026 Huawei Technologies Co., Ltd.
+# This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+# CANN Open Software License Agreement Version 2.0 (the "License").
+# Please refer to the License for details. You may not use this file except in compliance with the License.
+# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+# See LICENSE in the root of the software repository for the full text of the License.
+ +# coding=utf-8 + +import numpy as np +from cases import CASES +from st_common import validate_cases, setup_case_rng, save_case_data + +validate_cases(CASES) + +for case in CASES: + setup_case_rng(case) + + dtype = case["dtype"] + shape = case["shape"] + valid_shape = case["valid_shape"] + + input1 = np.random.randint(0, 100, size=shape).astype(dtype) + input2 = np.random.randint(0, 100, size=shape).astype(dtype) + + golden = np.zeros(shape, dtype=dtype) + vr, vc = valid_shape + golden[:vr, :vc] = (input1[:vr, :vc] & input2[:vr, :vc]).astype(dtype, copy=False) + + save_case_data(case["name"], {"input1": input1, "input2": input2, "golden": golden}) + print(f"[INFO] gen_data: {case['name']} shape={shape} valid_shape={valid_shape} dtype={dtype.__name__}") diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tand/launch.cpp b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tand/launch.cpp new file mode 100644 index 000000000..315b26193 --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tand/launch.cpp @@ -0,0 +1,26 @@ +// Copyright (c) 2026 Huawei Technologies Co., Ltd. +// This program is free software, you can redistribute it and/or modify it under the terms and conditions of +// CANN Open Software License Agreement Version 2.0 (the "License"). +// Please refer to the License for details. You may not use this file except in compliance with the License. +// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +// See LICENSE in the root of the software repository for the full text of the License. 
+
+#include <stdint.h>
+
+#ifndef AICORE
+#define AICORE [aicore]
+#endif
+
+// Kernel entry points, one per smoke case (i32 16x64 and i32 32x32).
+// Each Launch* wrapper below starts its kernel on the caller's stream.
+extern "C" __global__ AICORE void TAND_i32_16x64(__gm__ int32_t *a, __gm__ int32_t *b, __gm__ int32_t *c);
+extern "C" __global__ AICORE void TAND_i32_32x32(__gm__ int32_t *a, __gm__ int32_t *b, __gm__ int32_t *c);
+
+void LaunchTAND_i32_32x32(int32_t *a, int32_t *b, int32_t *c, void *stream) {
+    TAND_i32_32x32<<<1, nullptr, stream>>>((__gm__ int32_t *)a, (__gm__ int32_t *)b, (__gm__ int32_t *)c);
+}
+
+void LaunchTAND_i32_16x64(int32_t *a, int32_t *b, int32_t *c, void *stream) {
+    TAND_i32_16x64<<<1, nullptr, stream>>>((__gm__ int32_t *)a, (__gm__ int32_t *)b, (__gm__ int32_t *)c);
+}
diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tand/main.cpp b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tand/main.cpp
new file mode 100644
index 000000000..d6f6e8552
--- /dev/null
+++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tand/main.cpp
@@ -0,0 +1,145 @@
+// Copyright (c) 2026 Huawei Technologies Co., Ltd.
+// This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+// CANN Open Software License Agreement Version 2.0 (the "License").
+// Please refer to the License for details. You may not use this file except in compliance with the License.
+// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+// See LICENSE in the root of the software repository for the full text of the License.
+
+// Host driver for TileLang tand ST — case-table driven.
+// Each case launches a different kernel variant, reads/writes from per-case subdirectory.
+// Numerical comparison is done externally by compare.py.
+ +#include "acl/acl.h" +#include "test_common.h" +#include +#include +#include +#include +#include +#include + +using namespace PtoTestCommon; + +// Kernel launch wrappers (defined in launch.cpp) +void LaunchTAND_i32_16x64(int32_t *a, int32_t *b, int32_t *c, void *stream); +void LaunchTAND_i32_32x32(int32_t *a, int32_t *b, int32_t *c, void *stream); + +using LaunchFn = void (*)(int32_t *, int32_t *, int32_t *, void *); + +struct TestCase { + const char *name; + LaunchFn launch; + size_t rows; // allocated tile rows + size_t cols; // allocated tile cols + size_t validRows; // effective computation rows (<= rows) + size_t validCols; // effective computation cols (<= cols) + size_t elemSize; // bytes per element +}; + +static const TestCase kCases[] = { +{"i32_16x64", LaunchTAND_i32_16x64, 16, 64, 16, 64, sizeof(int32_t)}, +{"i32_32x32", LaunchTAND_i32_32x32, 32, 32, 32, 32, sizeof(int32_t)}, +}; +static constexpr size_t kNumCases = sizeof(kCases) / sizeof(kCases[0]); + +static int RunCase(const TestCase &tc, int deviceId, aclrtStream stream) { + int rc = 0; + const size_t elemCount = tc.rows * tc.cols; + const size_t fileSize = elemCount * tc.elemSize; + + std::printf("[INFO] === case: %s (shape=%zux%zu, valid=%zux%zu) ===\n", + tc.name, tc.rows, tc.cols, tc.validRows, tc.validCols); + + // Per-case data directory + std::string caseDir = std::string("./") + tc.name; + size_t src0FileSize = fileSize; + size_t src1FileSize = fileSize; + + int32_t *src0Host = nullptr, *src1Host = nullptr, *dstHost = nullptr; + int32_t *src0Device = nullptr, *src1Device = nullptr, *dstDevice = nullptr; + + aclrtMallocHost((void **)(&src0Host), fileSize); + aclrtMallocHost((void **)(&src1Host), fileSize); + aclrtMallocHost((void **)(&dstHost), fileSize); + + aclrtMalloc((void **)&src0Device, fileSize, ACL_MEM_MALLOC_HUGE_FIRST); + aclrtMalloc((void **)&src1Device, fileSize, ACL_MEM_MALLOC_HUGE_FIRST); + aclrtMalloc((void **)&dstDevice, fileSize, ACL_MEM_MALLOC_HUGE_FIRST); + + if 
(!ReadFile((caseDir + "/input1.bin").c_str(), src0FileSize, src0Host, fileSize)) { + std::fprintf(stderr, "[ERROR] failed to read %s/input1.bin\n", caseDir.c_str()); + rc = 1; + } + if (rc == 0 && !ReadFile((caseDir + "/input2.bin").c_str(), src1FileSize, src1Host, fileSize)) { + std::fprintf(stderr, "[ERROR] failed to read %s/input2.bin\n", caseDir.c_str()); + rc = 1; + } + + if (rc == 0) { + aclrtMemcpy(src0Device, fileSize, src0Host, fileSize, ACL_MEMCPY_HOST_TO_DEVICE); + aclrtMemcpy(src1Device, fileSize, src1Host, fileSize, ACL_MEMCPY_HOST_TO_DEVICE); + + tc.launch(src0Device, src1Device, dstDevice, stream); + + aclrtSynchronizeStream(stream); + aclrtMemcpy(dstHost, fileSize, dstDevice, fileSize, ACL_MEMCPY_DEVICE_TO_HOST); + } + + if (rc == 0 && !WriteFile((caseDir + "/output.bin").c_str(), dstHost, fileSize)) { + std::fprintf(stderr, "[ERROR] failed to write %s/output.bin\n", caseDir.c_str()); + rc = 1; + } + + if (src0Device != nullptr) + aclrtFree(src0Device); + if (src1Device != nullptr) + aclrtFree(src1Device); + if (dstDevice != nullptr) + aclrtFree(dstDevice); + if (src0Host != nullptr) + aclrtFreeHost(src0Host); + if (src1Host != nullptr) + aclrtFreeHost(src1Host); + if (dstHost != nullptr) + aclrtFreeHost(dstHost); + + if (rc == 0) + std::printf("[INFO] case %s done\n", tc.name); + return rc; +} + +int main(int argc, char *argv[]) { + // Optional case filter: ./tand [case_name] + const char *caseFilter = (argc > 1) ? 
argv[1] : nullptr; + + int rc = 0; + int deviceId = 0; + aclrtStream stream = nullptr; + + aclInit(nullptr); + if (const char *envDevice = std::getenv("ACL_DEVICE_ID")) { + deviceId = std::atoi(envDevice); + } + aclrtSetDevice(deviceId); + aclrtCreateStream(&stream); + + for (size_t i = 0; i < kNumCases; ++i) { + if (caseFilter != nullptr && std::strcmp(kCases[i].name, caseFilter) != 0) { + continue; + } + int ret = RunCase(kCases[i], deviceId, stream); + if (ret != 0) { + std::fprintf(stderr, "[ERROR] case %s failed\n", kCases[i].name); + rc = 1; + break; + } + } + + if (stream != nullptr) + aclrtDestroyStream(stream); + aclrtResetDevice(deviceId); + aclFinalize(); + + return rc; +} diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tand/tand.pto b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tand/tand.pto new file mode 100644 index 000000000..e26f01d94 --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tand/tand.pto @@ -0,0 +1,124 @@ +// Copyright (c) 2026 Huawei Technologies Co., Ltd. +// This program is free software, you can redistribute it and/or modify it under the terms and conditions of +// CANN Open Software License Agreement Version 2.0 (the "License"). +// Please refer to the License for details. You may not use this file except in compliance with the License. +// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +// See LICENSE in the root of the software repository for the full text of the License. + +// TileLang ST kernels for pto.tand: tload(a) + tload(b) + tand(a,b)->c + tstore(c). +// Multiple cases with different shapes in a single module. +// Compiled by ptoas --enable-insert-sync --enable-tile-op-expand --pto-backend=vpto +// to produce a fatobj object. 
+ +module attributes {pto.target_arch = "a5", pto.kernel_kind = #pto.kernel_kind} { + // Case 0: i32 16x64 (1024 elements) + func.func @TAND_i32_16x64(%a_ptr: !pto.ptr, %b_ptr: !pto.ptr, %c_ptr: !pto.ptr) attributes {pto.kernel} { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c16 = arith.constant 16 : index + %c64 = arith.constant 64 : index + %c1024 = arith.constant 1024 : index + + %a_view = pto.make_tensor_view %a_ptr, + shape = [%c1, %c1, %c1, %c16, %c64], + strides = [%c1024, %c1024, %c1024, %c64, %c1] + : !pto.tensor_view<1x1x1x16x64xi32> + %b_view = pto.make_tensor_view %b_ptr, + shape = [%c1, %c1, %c1, %c16, %c64], + strides = [%c1024, %c1024, %c1024, %c64, %c1] + : !pto.tensor_view<1x1x1x16x64xi32> + %c_view = pto.make_tensor_view %c_ptr, + shape = [%c1, %c1, %c1, %c16, %c64], + strides = [%c1024, %c1024, %c1024, %c64, %c1] + : !pto.tensor_view<1x1x1x16x64xi32> + + %a_part = pto.partition_view %a_view, + offsets = [%c0, %c0, %c0, %c0, %c0], + sizes = [%c1, %c1, %c1, %c16, %c64] + : !pto.tensor_view<1x1x1x16x64xi32> -> !pto.partition_tensor_view<1x1x1x16x64xi32> + %b_part = pto.partition_view %b_view, + offsets = [%c0, %c0, %c0, %c0, %c0], + sizes = [%c1, %c1, %c1, %c16, %c64] + : !pto.tensor_view<1x1x1x16x64xi32> -> !pto.partition_tensor_view<1x1x1x16x64xi32> + %c_part = pto.partition_view %c_view, + offsets = [%c0, %c0, %c0, %c0, %c0], + sizes = [%c1, %c1, %c1, %c16, %c64] + : !pto.tensor_view<1x1x1x16x64xi32> -> !pto.partition_tensor_view<1x1x1x16x64xi32> + + %a = pto.alloc_tile + : !pto.tile_buf + %b = pto.alloc_tile + : !pto.tile_buf + %c = pto.alloc_tile + : !pto.tile_buf + + pto.tload ins(%a_part : !pto.partition_tensor_view<1x1x1x16x64xi32>) + outs(%a : !pto.tile_buf) + pto.tload ins(%b_part : !pto.partition_tensor_view<1x1x1x16x64xi32>) + outs(%b : !pto.tile_buf) + + pto.tand ins(%a, %b : !pto.tile_buf, + !pto.tile_buf) + outs(%c : !pto.tile_buf) + + pto.tstore ins(%c : !pto.tile_buf) + outs(%c_part : 
!pto.partition_tensor_view<1x1x1x16x64xi32>) + return + } + + // Case 1: i32 32x32 (1024 elements) + + func.func @TAND_i32_32x32(%a_ptr: !pto.ptr, %b_ptr: !pto.ptr, %c_ptr: !pto.ptr) attributes {pto.kernel} { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c32 = arith.constant 32 : index + %c1024 = arith.constant 1024 : index + + %a_view = pto.make_tensor_view %a_ptr, + shape = [%c1, %c1, %c1, %c32, %c32], + strides = [%c1024, %c1024, %c1024, %c32, %c1] + : !pto.tensor_view<1x1x1x32x32xi32> + %b_view = pto.make_tensor_view %b_ptr, + shape = [%c1, %c1, %c1, %c32, %c32], + strides = [%c1024, %c1024, %c1024, %c32, %c1] + : !pto.tensor_view<1x1x1x32x32xi32> + %c_view = pto.make_tensor_view %c_ptr, + shape = [%c1, %c1, %c1, %c32, %c32], + strides = [%c1024, %c1024, %c1024, %c32, %c1] + : !pto.tensor_view<1x1x1x32x32xi32> + + %a_part = pto.partition_view %a_view, + offsets = [%c0, %c0, %c0, %c0, %c0], + sizes = [%c1, %c1, %c1, %c32, %c32] + : !pto.tensor_view<1x1x1x32x32xi32> -> !pto.partition_tensor_view<1x1x1x32x32xi32> + %b_part = pto.partition_view %b_view, + offsets = [%c0, %c0, %c0, %c0, %c0], + sizes = [%c1, %c1, %c1, %c32, %c32] + : !pto.tensor_view<1x1x1x32x32xi32> -> !pto.partition_tensor_view<1x1x1x32x32xi32> + %c_part = pto.partition_view %c_view, + offsets = [%c0, %c0, %c0, %c0, %c0], + sizes = [%c1, %c1, %c1, %c32, %c32] + : !pto.tensor_view<1x1x1x32x32xi32> -> !pto.partition_tensor_view<1x1x1x32x32xi32> + + %a = pto.alloc_tile + : !pto.tile_buf + %b = pto.alloc_tile + : !pto.tile_buf + %c = pto.alloc_tile + : !pto.tile_buf + + pto.tload ins(%a_part : !pto.partition_tensor_view<1x1x1x32x32xi32>) + outs(%a : !pto.tile_buf) + pto.tload ins(%b_part : !pto.partition_tensor_view<1x1x1x32x32xi32>) + outs(%b : !pto.tile_buf) + + pto.tand ins(%a, %b : !pto.tile_buf, + !pto.tile_buf) + outs(%c : !pto.tile_buf) + + pto.tstore ins(%c : !pto.tile_buf) + outs(%c_part : !pto.partition_tensor_view<1x1x1x32x32xi32>) + return + } +} diff --git 
a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tands/CMakeLists.txt b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tands/CMakeLists.txt new file mode 100644 index 000000000..0ff088f8d --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tands/CMakeLists.txt @@ -0,0 +1,9 @@ +# Copyright (c) 2026 Huawei Technologies Co., Ltd. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. + +pto_tilelang_vec_st(tands) diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tands/cases.py b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tands/cases.py new file mode 100644 index 000000000..30222ef97 --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tands/cases.py @@ -0,0 +1,49 @@ +# Copyright (c) 2026 Huawei Technologies Co., Ltd. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. 
+ +# coding=utf-8 + +import numpy as np + +CASES = [ + { + "name": "i32_32x64", + "dtype": np.int32, + "shape": (32, 64), + "valid_shape": (32, 64), + "eps": 0, + }, + { + "name": "i16_63x64", + "dtype": np.int16, + "shape": (63, 64), + "valid_shape": (63, 64), + "eps": 0, + }, + { + "name": "i32_31x128", + "dtype": np.int32, + "shape": (31, 128), + "valid_shape": (31, 128), + "eps": 0, + }, + { + "name": "i16_15x192", + "dtype": np.int16, + "shape": (15, 192), + "valid_shape": (15, 192), + "eps": 0, + }, +] + +_SMOKE_CASE_NAMES = ['i32_32x64', 'i16_15x192'] +_SMOKE_CASE_NAME_SET = set(_SMOKE_CASE_NAMES) +_missing = [name for name in _SMOKE_CASE_NAMES if name not in {case["name"] for case in CASES}] +if _missing: + raise RuntimeError("unknown smoke case(s): " + ", ".join(_missing)) +CASES = [case for case in CASES if case["name"] in _SMOKE_CASE_NAME_SET] diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tands/compare.py b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tands/compare.py new file mode 100644 index 000000000..50186777e --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tands/compare.py @@ -0,0 +1,46 @@ +# Copyright (c) 2026 Huawei Technologies Co., Ltd. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. 
+
+# coding=utf-8
+
+import os
+import sys
+import numpy as np
+
+from cases import CASES
+from st_common import result_cmp, style_fail, style_pass, validate_cases
+
+def main():
+    """Compare output.bin with golden.bin per selected case; exit 2 on any failure or unknown filter."""
+    validate_cases(CASES)
+    case_filter = sys.argv[1] if len(sys.argv) > 1 else None
+    selected = [case for case in CASES if case_filter is None or case["name"] == case_filter]
+    if case_filter is not None and not selected:  # a mistyped filter must not report "all cases passed"
+        print(style_fail(f"[ERROR] unknown case filter: {case_filter}"))
+        sys.exit(2)
+
+    all_passed = True
+    for case in selected:
+        case_dir = case["name"]
+        shape = case["shape"]
+        vr, vc = case["valid_shape"]  # only this valid region of the tile is compared
+        golden = np.fromfile(os.path.join(case_dir, "golden.bin"), dtype=case["dtype"]).reshape(shape)
+        output = np.fromfile(os.path.join(case_dir, "output.bin"), dtype=case["dtype"]).reshape(shape)
+        ok = result_cmp(golden[:vr, :vc], output[:vr, :vc], case["eps"])
+        if ok:
+            print(style_pass(f"[INFO] {case['name']}: compare passed"))
+        else:
+            print(style_fail(f"[ERROR] {case['name']}: compare failed"))
+            all_passed = False
+
+    if not all_passed:
+        sys.exit(2)
+    print(style_pass("[INFO] all cases passed"))
+
+if __name__ == "__main__":
+    main()
diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tands/gen_data.py b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tands/gen_data.py
new file mode 100644
index 000000000..9f187cb03
--- /dev/null
+++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tands/gen_data.py
@@ -0,0 +1,35 @@
+# Copyright (c) 2026 Huawei Technologies Co., Ltd.
+# This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+# CANN Open Software License Agreement Version 2.0 (the "License").
+# Please refer to the License for details. You may not use this file except in compliance with the License.
+# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+# See LICENSE in the root of the software repository for the full text of the License.
+
+# coding=utf-8
+
+import numpy as np
+from cases import CASES
+from st_common import validate_cases, setup_case_rng, save_case_data
+
+# Scalar value for bitwise AND (must match launch.cpp)
+SCALAR = 3
+
+validate_cases(CASES)
+
+for case in CASES:
+    setup_case_rng(case)
+    # NOTE(review): presumably seeds the RNG per case so the draw below is reproducible; confirm in st_common.
+    dtype = case["dtype"]
+    shape = case["shape"]
+    valid_shape = case["valid_shape"]
+    # Input values are integers drawn from [1, 10) and then cast to the case dtype.
+    input1 = np.random.randint(1, 10, size=shape).astype(dtype)
+    # Golden stays zero outside the valid window; inside it is input1 & SCALAR.
+    golden = np.zeros(shape, dtype=dtype)
+    vr, vc = valid_shape
+    scalar_val = dtype(SCALAR)
+    golden[:vr, :vc] = (input1[:vr, :vc] & scalar_val).astype(dtype, copy=False)
+
+    save_case_data(case["name"], {"input1": input1, "golden": golden})
+    print(f"[INFO] gen_data: {case['name']} shape={shape} valid_shape={valid_shape} dtype={dtype.__name__} scalar={SCALAR}")
diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tands/launch.cpp b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tands/launch.cpp
new file mode 100644
index 000000000..231e143de
--- /dev/null
+++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tands/launch.cpp
@@ -0,0 +1,31 @@
+// Copyright (c) 2026 Huawei Technologies Co., Ltd.
+// This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+// CANN Open Software License Agreement Version 2.0 (the "License").
+// Please refer to the License for details. You may not use this file except in compliance with the License.
+// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+// See LICENSE in the root of the software repository for the full text of the License.
+
+#include
+
+#ifndef AICORE
+#define AICORE [aicore]
+#endif
+
+// Scalar value for bitwise AND (must match gen_data.py SCALAR)
+static constexpr int32_t TANDS_SCALAR_I32 = 3;
+static constexpr int16_t TANDS_SCALAR_I16 = 3;
+
+// Device kernels kept in the smoke subset: i32 32x64 and i16 15x192.
+
+extern "C" __global__ AICORE void TANDS_i32_32x64(__gm__ int32_t *src, __gm__ int32_t *dst, int32_t scalar);
+extern "C" __global__ AICORE void TANDS_i16_15x192(__gm__ int16_t *src, __gm__ int16_t *dst, int16_t scalar);
+// Host-side wrappers: each launches its kernel on the given stream and passes the fixed scalar constant.
+void LaunchTANDS_i16_15x192(int16_t *src, int16_t *dst, void *stream) {
+    TANDS_i16_15x192<<<1, nullptr, stream>>>((__gm__ int16_t *)src, (__gm__ int16_t *)dst, TANDS_SCALAR_I16);
+}
+
+
+void LaunchTANDS_i32_32x64(int32_t *src, int32_t *dst, void *stream) {
+    TANDS_i32_32x64<<<1, nullptr, stream>>>((__gm__ int32_t *)src, (__gm__ int32_t *)dst, TANDS_SCALAR_I32);
+}
diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tands/main.cpp b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tands/main.cpp
new file mode 100644
index 000000000..1daa70337
--- /dev/null
+++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tands/main.cpp
@@ -0,0 +1,132 @@
+// Copyright (c) 2026 Huawei Technologies Co., Ltd.
+// This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+// CANN Open Software License Agreement Version 2.0 (the "License").
+// Please refer to the License for details. You may not use this file except in compliance with the License.
+// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+// See LICENSE in the root of the software repository for the full text of the License.
+
+// Host driver for TileLang tands ST — case-table driven.
+// tands: dst = src & scalar (single input + scalar, bitwise AND).
+// Numerical comparison is done externally by compare.py.
+
+#include "acl/acl.h"
+#include "test_common.h"
+#include
+#include
+#include
+#include
+#include
+#include
+
+using namespace PtoTestCommon;
+
+// Kernel launch wrappers (defined in launch.cpp). NOTE(review): i16_63x64 is declared but is neither defined there nor used in kCases.
+void LaunchTANDS_i32_32x64(int32_t *src, int32_t *dst, void *stream);
+void LaunchTANDS_i16_63x64(int16_t *src, int16_t *dst, void *stream);
+void LaunchTANDS_i16_15x192(int16_t *src, int16_t *dst, void *stream);
+
+struct TestCase {
+    const char *name;
+    void (*launch)(void *, void *, void *); // src, dst, stream
+    size_t rows;      // allocated tile rows
+    size_t cols;      // allocated tile cols
+    size_t validRows; // effective computation rows (<= rows)
+    size_t validCols; // effective computation cols (<= cols)
+    size_t elemSize;  // bytes per element
+};
+// Case table; names double as the per-case data directory. The typed wrappers are cast to the generic launch signature.
+static const TestCase kCases[] = {
+{"i32_32x64", (void (*)(void*,void*,void*))LaunchTANDS_i32_32x64, 32, 64, 32, 64, sizeof(int32_t)},
+{"i16_15x192", (void (*)(void*,void*,void*))LaunchTANDS_i16_15x192, 15, 192, 15, 192, sizeof(int16_t)},
+};
+static constexpr size_t kNumCases = sizeof(kCases) / sizeof(kCases[0]);
+// Runs one case: ./<name>/input1.bin -> device -> kernel -> ./<name>/output.bin. Returns 0 on success, 1 on file I/O failure.
+static int RunCase(const TestCase &tc, int deviceId, aclrtStream stream) {
+    int rc = 0;
+    const size_t elemCount = tc.rows * tc.cols;
+    const size_t fileSize = elemCount * tc.elemSize;
+
+    std::printf("[INFO] === case: %s (shape=%zux%zu, valid=%zux%zu) ===\n",
+                tc.name, tc.rows, tc.cols, tc.validRows, tc.validCols);
+
+    // Per-case data directory
+    std::string caseDir = std::string("./") + tc.name;
+    size_t srcFileSize = fileSize;
+
+    void *srcHost = nullptr, *dstHost = nullptr;
+    void *srcDevice = nullptr, *dstDevice = nullptr;
+    // NOTE(review): ACL return codes are ignored here and below, so an allocation or copy failure is not detected; deviceId is unused.
+    aclrtMallocHost(&srcHost, fileSize);
+    aclrtMallocHost(&dstHost, fileSize);
+
+    aclrtMalloc(&srcDevice, fileSize, ACL_MEM_MALLOC_HUGE_FIRST);
+    aclrtMalloc(&dstDevice, fileSize, ACL_MEM_MALLOC_HUGE_FIRST);
+
+    if (!ReadFile((caseDir + "/input1.bin").c_str(), srcFileSize, srcHost, fileSize)) {
+        std::fprintf(stderr, "[ERROR] failed to read %s/input1.bin\n", caseDir.c_str());
+        rc = 1;
+    }
+
+    if (rc == 0) {
+        aclrtMemcpy(srcDevice, fileSize, srcHost, fileSize, ACL_MEMCPY_HOST_TO_DEVICE);
+
+        tc.launch(srcDevice, dstDevice, stream);
+
+        aclrtSynchronizeStream(stream);
+        aclrtMemcpy(dstHost, fileSize, dstDevice, fileSize, ACL_MEMCPY_DEVICE_TO_HOST);
+    }
+
+    if (rc == 0 && !WriteFile((caseDir + "/output.bin").c_str(), dstHost, fileSize)) {
+        std::fprintf(stderr, "[ERROR] failed to write %s/output.bin\n", caseDir.c_str());
+        rc = 1;
+    }
+    // Buffers are released on both the success and the failure path.
+    if (srcDevice != nullptr)
+        aclrtFree(srcDevice);
+    if (dstDevice != nullptr)
+        aclrtFree(dstDevice);
+    if (srcHost != nullptr)
+        aclrtFreeHost(srcHost);
+    if (dstHost != nullptr)
+        aclrtFreeHost(dstHost);
+
+    if (rc == 0)
+        std::printf("[INFO] case %s done\n", tc.name);
+    return rc;
+}
+// Entry point: runs every case in kCases (or only the one named by argv[1]) and stops at the first failure.
+int main(int argc, char *argv[]) {
+    // Optional case filter: ./tands [case_name]
+    const char *caseFilter = (argc > 1) ? argv[1] : nullptr;
+
+    int rc = 0;
+    int deviceId = 0;
+    aclrtStream stream = nullptr;
+
+    aclInit(nullptr);
+    if (const char *envDevice = std::getenv("ACL_DEVICE_ID")) {
+        deviceId = std::atoi(envDevice);
+    }
+    aclrtSetDevice(deviceId);
+    aclrtCreateStream(&stream);
+
+    for (size_t i = 0; i < kNumCases; ++i) {
+        if (caseFilter != nullptr && std::strcmp(kCases[i].name, caseFilter) != 0) {
+            continue;
+        }
+        int ret = RunCase(kCases[i], deviceId, stream);
+        if (ret != 0) {
+            std::fprintf(stderr, "[ERROR] case %s failed\n", kCases[i].name);
+            rc = 1;
+            break;
+        }
+    }
+
+    if (stream != nullptr)
+        aclrtDestroyStream(stream);
+    aclrtResetDevice(deviceId);
+    aclFinalize();
+
+    return rc;
+}
diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tands/tands.pto b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tands/tands.pto
new file mode 100644
index 000000000..220c03f4b
--- /dev/null
+++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tands/tands.pto
@@ -0,0 +1,96 @@
+// Copyright (c) 2026 Huawei Technologies Co., Ltd.
+// This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+// CANN Open Software License Agreement Version 2.0 (the "License").
+// Please refer to the License for details. You may not use this file except in compliance with the License.
+// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+// See LICENSE in the root of the software repository for the full text of the License.
+
+// TileLang ST kernels for pto.tands: tload(src) + tands(src, scalar)->dst + tstore(dst).
+// Multiple cases with different shapes/dtypes in a single module.
+// Compiled by ptoas --enable-insert-sync --enable-tile-op-expand --pto-backend=vpto
+// to produce a fatobj object.
+
+module attributes {pto.target_arch = "a5", pto.kernel_kind = #pto.kernel_kind} {
+
+  // Case 0: i32 32x64 (2048 elements)
+  func.func @TANDS_i32_32x64(%src_ptr: !pto.ptr, %dst_ptr: !pto.ptr, %scalar: i32) attributes {pto.kernel} {
+    %c0 = arith.constant 0 : index
+    %c1 = arith.constant 1 : index
+    %c32 = arith.constant 32 : index
+    %c64 = arith.constant 64 : index
+    %c2048 = arith.constant 2048 : index
+
+    %src_view = pto.make_tensor_view %src_ptr,
+      shape = [%c1, %c1, %c1, %c32, %c64],
+      strides = [%c2048, %c2048, %c2048, %c64, %c1]
+      : !pto.tensor_view<1x1x1x32x64xi32>
+    %dst_view = pto.make_tensor_view %dst_ptr,
+      shape = [%c1, %c1, %c1, %c32, %c64],
+      strides = [%c2048, %c2048, %c2048, %c64, %c1]
+      : !pto.tensor_view<1x1x1x32x64xi32>
+
+    %src_part = pto.partition_view %src_view,
+      offsets = [%c0, %c0, %c0, %c0, %c0],
+      sizes = [%c1, %c1, %c1, %c32, %c64]
+      : !pto.tensor_view<1x1x1x32x64xi32> -> !pto.partition_tensor_view<1x1x1x32x64xi32>
+    %dst_part = pto.partition_view %dst_view,
+      offsets = [%c0, %c0, %c0, %c0, %c0],
+      sizes = [%c1, %c1, %c1, %c32, %c64]
+      : !pto.tensor_view<1x1x1x32x64xi32> -> !pto.partition_tensor_view<1x1x1x32x64xi32>
+
+    %src = pto.alloc_tile
+      : !pto.tile_buf
+    %dst = pto.alloc_tile
+      : !pto.tile_buf
+
+    pto.tload ins(%src_part : !pto.partition_tensor_view<1x1x1x32x64xi32>)
+              outs(%src : !pto.tile_buf)
+    pto.tands ins(%src, %scalar : !pto.tile_buf, i32)
+              outs(%dst : !pto.tile_buf)
+    pto.tstore ins(%dst : !pto.tile_buf)
+               outs(%dst_part : !pto.partition_tensor_view<1x1x1x32x64xi32>)
+    return
+  }
+
+  // Case 1: i16 15x192 (2880 elements)
+
+  func.func @TANDS_i16_15x192(%src_ptr: !pto.ptr, %dst_ptr: !pto.ptr, %scalar: i16) attributes {pto.kernel} {
+    %c0 = arith.constant 0 : index
+    %c1 = arith.constant 1 : index
+    %c15 = arith.constant 15 : index
+    %c192 = arith.constant 192 : index
+    %c2880 = arith.constant 2880 : index
+
+    %src_view = pto.make_tensor_view %src_ptr,
+      shape = [%c1, %c1, %c1, %c15, %c192],
+      strides = [%c2880, %c2880, %c2880, %c192, %c1]
+      : !pto.tensor_view<1x1x1x15x192xi16>
+    %dst_view = pto.make_tensor_view %dst_ptr,
+      shape = [%c1, %c1, %c1, %c15, %c192],
+      strides = [%c2880, %c2880, %c2880, %c192, %c1]
+      : !pto.tensor_view<1x1x1x15x192xi16>
+
+    %src_part = pto.partition_view %src_view,
+      offsets = [%c0, %c0, %c0, %c0, %c0],
+      sizes = [%c1, %c1, %c1, %c15, %c192]
+      : !pto.tensor_view<1x1x1x15x192xi16> -> !pto.partition_tensor_view<1x1x1x15x192xi16>
+    %dst_part = pto.partition_view %dst_view,
+      offsets = [%c0, %c0, %c0, %c0, %c0],
+      sizes = [%c1, %c1, %c1, %c15, %c192]
+      : !pto.tensor_view<1x1x1x15x192xi16> -> !pto.partition_tensor_view<1x1x1x15x192xi16>
+
+    %src = pto.alloc_tile
+      : !pto.tile_buf
+    %dst = pto.alloc_tile
+      : !pto.tile_buf
+
+    pto.tload ins(%src_part : !pto.partition_tensor_view<1x1x1x15x192xi16>)
+              outs(%src : !pto.tile_buf)
+    pto.tands ins(%src, %scalar : !pto.tile_buf, i16)
+              outs(%dst : !pto.tile_buf)
+    pto.tstore ins(%dst : !pto.tile_buf)
+               outs(%dst_part : !pto.partition_tensor_view<1x1x1x15x192xi16>)
+    return
+  }
+}
diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tcmp/CMakeLists.txt
b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tcmp/CMakeLists.txt
new file mode 100644
index 000000000..ebb9c0dae
--- /dev/null
+++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tcmp/CMakeLists.txt
@@ -0,0 +1,9 @@
+# Copyright (c) 2026 Huawei Technologies Co., Ltd.
+# This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+# CANN Open Software License Agreement Version 2.0 (the "License").
+# Please refer to the License for details. You may not use this file except in compliance with the License.
+# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+# See LICENSE in the root of the software repository for the full text of the License.
+
+pto_tilelang_vec_st(tcmp)
diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tcmp/cases.py b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tcmp/cases.py
new file mode 100644
index 000000000..44e2eb225
--- /dev/null
+++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tcmp/cases.py
@@ -0,0 +1,147 @@
+# Copyright (c) 2026 Huawei Technologies Co., Ltd.
+# This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+# CANN Open Software License Agreement Version 2.0 (the "License").
+# Please refer to the License for details. You may not use this file except in compliance with the License.
+# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+# See LICENSE in the root of the software repository for the full text of the License.
+
+# coding=utf-8
+
+"""Single source of truth for tcmp ST test cases.
+
+Each case defines:
+  - name: case identifier, used as subdirectory name and by main.cpp kCases[].
+  - dtype: numpy dtype (e.g. np.float32).
+  - shape: (rows, cols) — allocated tile dimensions (same for src and dst).
+  - valid_shape: (valid_rows, valid_cols) — effective computation region.
+  - dst_dtype: output mask dtype (i8 - packed mask, same shape as input).
+  - cmp_mode: comparison mode: "eq", "ne", "lt", "gt", "ge", "le".
+  - eps: tolerance (exact match for masks, eps=0).
+
+Aligned with testcase/tcmp test cases.
+
+gen_data.py and compare.py both import this list to avoid redundant definitions.
+"""
+
+import numpy as np
+
+CASES = [
+    # Case 1: f16 32x32 EQ (half_32x32_32x32)
+    {
+        "name": "f16_32x32_eq",
+        "dtype": np.float16,
+        "shape": (32, 32),
+        "valid_shape": (32, 32),
+        "dst_dtype": np.int8,
+        "cmp_mode": "eq",
+        "eps": 0,
+    },
+    # Case 2: f32 8x64 GT (float_8x64_8x64)
+    {
+        "name": "f32_8x64_gt",
+        "dtype": np.float32,
+        "shape": (8, 64),
+        "valid_shape": (8, 64),
+        "dst_dtype": np.int8,
+        "cmp_mode": "gt",
+        "eps": 0,
+    },
+    # Case 3: i32 4x64 NE (int32_4x64_4x64)
+    {
+        "name": "i32_4x64_ne",
+        "dtype": np.int32,
+        "shape": (4, 64),
+        "valid_shape": (4, 64),
+        "dst_dtype": np.int8,
+        "cmp_mode": "ne",
+        "eps": 0,
+    },
+    # Case 4: i32 128x128 LT with valid 64x64 (int32_128x128_64x64)
+    {
+        "name": "i32_128x128_lt",
+        "dtype": np.int32,
+        "shape": (128, 128),
+        "valid_shape": (64, 64),
+        "dst_dtype": np.int8,
+        "cmp_mode": "lt",
+        "eps": 0,
+    },
+    # Case 5: i32 64x64 EQ with valid 32x32 (int32_64x64_32x32)
+    {
+        "name": "i32_64x64_eq",
+        "dtype": np.int32,
+        "shape": (64, 64),
+        "valid_shape": (32, 32),
+        "dst_dtype": np.int8,
+        "cmp_mode": "eq",
+        "eps": 0,
+    },
+    # Case 6: i32 16x32 EQ (int32_16x32_16x32)
+    {
+        "name": "i32_16x32_eq",
+        "dtype": np.int32,
+        "shape": (16, 32),
+        "valid_shape": (16, 32),
+        "dst_dtype": np.int8,
+        "cmp_mode": "eq",
+        "eps": 0,
+    },
+    # Case 7: f32 128x128 LE with valid 64x64 (float_128x128_64x64)
+    {
+        "name": "f32_128x128_le",
+        "dtype": np.float32,
+        "shape": (128, 128),
+        "valid_shape": (64, 64),
+        "dst_dtype": np.int8,
+        "cmp_mode": "le",
+        "eps": 0,
+    },
+    # Case 8: i32 77x96 EQ with valid 32x32 (int32_77x96_32x32)
+    {
+        "name": "i32_77x96_eq",
+        "dtype": np.int32,
+        "shape": (77, 96),
+        "valid_shape": (32, 32),
+        "dst_dtype": np.int8,
+        "cmp_mode": "eq",
+        "eps": 0,
+    },
+    # Case 9: i32 32x32 EQ (int32_32x32_32x32)
+    {
+        "name": "i32_32x32_eq",
+        "dtype": np.int32,
+        "shape": (32, 32),
+        "valid_shape": (32, 32),
+        "dst_dtype": np.int8,
+        "cmp_mode": "eq",
+        "eps": 0,
+    },
+    # Case 10: i16 32x32 EQ with valid 16x32 (int16_32x32_16x32)
+    {
+        "name": "i16_32x32_eq",
+        "dtype": np.int16,
+        "shape": (32, 32),
+        "valid_shape": (16, 32),
+        "dst_dtype": np.int8,
+        "cmp_mode": "eq",
+        "eps": 0,
+    },
+    # Case 11: i16 77x96 LE with valid 32x32 (int16_77x96_32x32)
+    {
+        "name": "i16_77x96_le",
+        "dtype": np.int16,
+        "shape": (77, 96),
+        "valid_shape": (32, 32),
+        "dst_dtype": np.int8,
+        "cmp_mode": "le",
+        "eps": 0,
+    },
+]
+# Smoke build: keep only the cases named below; a name with no matching entry above is an error.
+_SMOKE_CASE_NAMES = ['f32_8x64_gt', 'i32_4x64_ne', 'i16_32x32_eq']
+_SMOKE_CASE_NAME_SET = set(_SMOKE_CASE_NAMES)
+_missing = sorted(_SMOKE_CASE_NAME_SET.difference(case["name"] for case in CASES))  # one pass over CASES
+if _missing:
+    raise RuntimeError("unknown smoke case(s): " + ", ".join(_missing))
+CASES = [case for case in CASES if case["name"] in _SMOKE_CASE_NAME_SET]
diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tcmp/compare.py b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tcmp/compare.py
new file mode 100644
index 000000000..56b4729f6
--- /dev/null
+++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tcmp/compare.py
@@ -0,0 +1,54 @@
+# Copyright (c) 2026 Huawei Technologies Co., Ltd.
+# This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+# CANN Open Software License Agreement Version 2.0 (the "License").
+# Please refer to the License for details. You may not use this file except in compliance with the License.
+# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+# See LICENSE in the root of the software repository for the full text of the License.
+
+# coding=utf-8
+
+import os
+import sys
+import numpy as np
+
+from cases import CASES
+from st_common import result_cmp, style_fail, style_pass, validate_cases
+
+
+def main():
+    """Compare each selected case's packed mask output against its golden file.
+
+    An optional argv[1] restricts the run to one case name; exits 2 on any mismatch.
+    """
+    validate_cases(CASES)
+    wanted = sys.argv[1] if len(sys.argv) > 1 else None
+
+    failed = False
+    for case in CASES:
+        if wanted is not None and case["name"] != wanted:
+            continue
+
+        rows, cols = case["valid_shape"]
+        # Only the packed mask region is compared: rows x (cols // 8) byte columns.
+        mask_cols = cols // 8
+
+        # Both files are read as dst_dtype and reshaped to the full tile shape.
+        golden, output = (
+            np.fromfile(os.path.join(case["name"], fname), dtype=case["dst_dtype"]).reshape(case["shape"])
+            for fname in ("golden.bin", "output.bin")
+        )
+
+        if result_cmp(golden[:rows, :mask_cols], output[:rows, :mask_cols], case["eps"]):
+            print(style_pass(f"[INFO] {case['name']}: compare passed"))
+        else:
+            print(style_fail(f"[ERROR] {case['name']}: compare failed"))
+            failed = True
+
+    if failed:
+        sys.exit(2)
+    print(style_pass("[INFO] all cases passed"))
+
+
+if __name__ == "__main__":
+    main()
diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tcmp/gen_data.py b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tcmp/gen_data.py
new file mode 100644
index 000000000..f28e6e93d
--- /dev/null
+++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tcmp/gen_data.py
@@ -0,0 +1,70 @@
+# Copyright (c) 2026 Huawei Technologies Co., Ltd.
+# This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+# CANN Open Software License Agreement Version 2.0 (the "License").
+# Please refer to the License for details. You may not use this file except in compliance with the License.
+# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+# See LICENSE in the root of the software repository for the full text of the License.
+
+# coding=utf-8
+
+import operator
+
+import numpy as np
+from cases import CASES
+from st_common import validate_cases, setup_case_rng, save_case_data
+
+# Maps each supported cmp_mode string to its element-wise comparison.
+_CMP_OPS = {
+    "eq": operator.eq,
+    "ne": operator.ne,
+    "lt": operator.lt,
+    "gt": operator.gt,
+    "ge": operator.ge,
+    "le": operator.le,
+}
+
+validate_cases(CASES)
+
+for case in CASES:
+    setup_case_rng(case)
+
+    dtype = case["dtype"]
+    shape = case["shape"]
+    valid_shape = case["valid_shape"]
+    dst_dtype = case["dst_dtype"]
+    cmp_mode = case["cmp_mode"]
+
+    # Fail loudly: an unrecognised mode would otherwise leave the golden mask all zero
+    # and the case would be checked against a meaningless reference.
+    if cmp_mode not in _CMP_OPS:
+        raise ValueError(f"{case['name']}: unsupported cmp_mode {cmp_mode!r}")
+
+    # Generate random input data (integers in [1, 10), cast to the case dtype)
+    input1 = np.random.randint(1, 10, size=shape).astype(dtype)
+    input2 = np.random.randint(1, 10, size=shape).astype(dtype)
+
+    # Compute comparison mask (boolean); it stays False outside the valid region
+    vr, vc = valid_shape
+    mask_bits = np.zeros(shape, dtype=np.bool_)
+    mask_bits[:vr, :vc] = _CMP_OPS[cmp_mode](input1[:vr, :vc], input2[:vr, :vc])
+
+    # dst shape is same as src shape, but only first cols//8 columns store packed mask bytes
+    # remaining columns are padding (zeros)
+    # Use uint8 first to avoid overflow, then cast to int8
+    golden = np.zeros(shape, dtype=np.uint8)
+
+    # Pack mask bits: each byte stores 8 comparison results, element 0 in bit 0 (LSB first).
+    # Valid columns beyond packed_cols * 8 are not packed.
+    packed_cols = vc // 8  # number of byte columns that store actual packed data
+    # Nothing to pack when the valid width is under 8 columns.
+    if packed_cols:
+        golden[:vr, :packed_cols] = np.packbits(
+            mask_bits[:vr, :packed_cols * 8], axis=1, bitorder="little"
+        )
+
+    # Cast to int8 for final output
+    golden = golden.astype(dst_dtype)
+
+    save_case_data(case["name"], {"input1": input1, "input2": input2, "golden": golden})
+    print(f"[INFO] gen_data: {case['name']} shape={shape} valid_shape={valid_shape} cmp_mode={cmp_mode}")
diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tcmp/launch.cpp b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tcmp/launch.cpp
new file mode 100644
index 000000000..2f87169a0
--- /dev/null
+++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tcmp/launch.cpp
@@ -0,0 +1,35 @@
+// Copyright (c) 2026 Huawei Technologies Co., Ltd.
+// This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+// CANN Open Software License Agreement Version 2.0 (the "License").
+// Please refer to the License for details. You may not use this file except in compliance with the License.
+// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+// See LICENSE in the root of the software repository for the full text of the License.
+
+#include
+
+#ifndef AICORE
+#define AICORE [aicore]
+#endif
+
+// Device kernels kept in the smoke subset: f32 8x64 gt, i32 4x64 ne, i16 32x32 eq.
+
+extern "C" __global__ AICORE void TCMP_f32_8x64_gt(__gm__ float *a, __gm__ float *b, __gm__ int8_t *c);
+extern "C" __global__ AICORE void TCMP_i32_4x64_ne(__gm__ int32_t *a, __gm__ int32_t *b, __gm__ int8_t *c);
+extern "C" __global__ AICORE void TCMP_i16_32x32_eq(__gm__ int16_t *a, __gm__ int16_t *b, __gm__ int8_t *c);
+// Host-side wrappers: each launches its kernel on the given stream with two inputs (a, b) and one int8 output (c).
+void LaunchTCMP_f32_8x64_gt(float *a, float *b, int8_t *c, void *stream) {
+    TCMP_f32_8x64_gt<<<1, nullptr, stream>>>((__gm__ float *)a, (__gm__ float *)b, (__gm__ int8_t *)c);
+}
+
+
+
+void LaunchTCMP_i32_4x64_ne(int32_t *a, int32_t *b, int8_t *c, void *stream) {
+    TCMP_i32_4x64_ne<<<1, nullptr, stream>>>((__gm__ int32_t *)a, (__gm__ int32_t *)b, (__gm__ int8_t *)c);
+}
+
+
+
+void LaunchTCMP_i16_32x32_eq(int16_t *a, int16_t *b, int8_t *c, void *stream) {
+    TCMP_i16_32x32_eq<<<1, nullptr, stream>>>((__gm__ int16_t *)a, (__gm__ int16_t *)b, (__gm__ int8_t *)c);
+}
diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tcmp/main.cpp b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tcmp/main.cpp
new file mode 100644
index 000000000..da693715b
--- /dev/null
+++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tcmp/main.cpp
@@ -0,0 +1,149 @@
+// Copyright (c) 2026 Huawei Technologies Co., Ltd.
+// This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+// CANN Open Software License Agreement Version 2.0 (the "License").
+// Please refer to the License for details. You may not use this file except in compliance with the License.
+// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+// See LICENSE in the root of the software repository for the full text of the License.
+
+// Host driver for TileLang tcmp ST — case-table driven.
+// Each case launches a different kernel variant, reads/writes from per-case subdirectory.
+// Numerical comparison is done externally by compare.py.
+// Aligned with testcase/tcmp test cases.
+
+#include "acl/acl.h"
+#include "test_common.h"
+#include
+#include
+#include
+#include
+#include
+#include
+
+using namespace PtoTestCommon;
+
+// Kernel launch wrappers (defined in launch.cpp). NOTE(review): only the three used in kCases are defined there; the other three are declared only.
+void LaunchTCMP_f32_8x64_gt(float *a, float *b, int8_t *c, void *stream);
+void LaunchTCMP_i32_4x64_ne(int32_t *a, int32_t *b, int8_t *c, void *stream);
+void LaunchTCMP_i32_128x128_lt(int32_t *a, int32_t *b, int8_t *c, void *stream);
+void LaunchTCMP_i32_16x32_eq(int32_t *a, int32_t *b, int8_t *c, void *stream);
+void LaunchTCMP_i32_77x96_eq(int32_t *a, int32_t *b, int8_t *c, void *stream);
+void LaunchTCMP_i16_32x32_eq(int16_t *a, int16_t *b, int8_t *c, void *stream);
+
+struct TestCase {
+    const char *name;                               // case name, also the per-case data directory
+    void (*launch)(void *, void *, void *, void *); // src0, src1, dst, stream
+    size_t rows;                                    // tile rows (each buffer holds rows * cols elements)
+    size_t cols;                                    // tile cols
+    size_t srcElemSize;                             // bytes per input element
+    size_t dstElemSize;                             // bytes per output element
+};
+
+static const TestCase kCases[] = {
+    // Smoke subset of the tcmp cases; names match the entries kept by cases.py.
+{"f32_8x64_gt", (void (*)(void*, void*, void*, void*))LaunchTCMP_f32_8x64_gt, 8, 64, sizeof(float), sizeof(int8_t)},
+{"i32_4x64_ne", (void (*)(void*, void*, void*, void*))LaunchTCMP_i32_4x64_ne, 4, 64, sizeof(int32_t), sizeof(int8_t)},
+{"i16_32x32_eq", (void (*)(void*, void*, void*, void*))LaunchTCMP_i16_32x32_eq, 32, 32, sizeof(int16_t), sizeof(int8_t)},
+};
+static constexpr size_t kNumCases = sizeof(kCases) / sizeof(kCases[0]);
+// Runs one case: ./<name>/input1.bin + input2.bin -> device -> kernel -> ./<name>/output.bin. Returns 0 on success, 1 on file I/O failure.
+static int RunCase(const TestCase &tc, int deviceId, aclrtStream stream) {
+    int rc = 0;
+    const size_t srcFileSize = tc.rows * tc.cols * tc.srcElemSize;
+    const size_t dstFileSize = tc.rows * tc.cols * tc.dstElemSize;
+
+    std::printf("[INFO] === case: %s (shape=%zux%zu) ===\n",
+                tc.name, tc.rows, tc.cols);
+
+    std::string caseDir = std::string("./") + tc.name;
+
+    void *src0Host = nullptr, *src1Host = nullptr, *dstHost = nullptr;
+    void *src0Device = nullptr, *src1Device = nullptr, *dstDevice = nullptr;
+    // NOTE(review): ACL return codes are ignored here and below, so an allocation or copy failure is not detected; deviceId is unused.
+    aclrtMallocHost((void **)(&src0Host), srcFileSize);
+    aclrtMallocHost((void **)(&src1Host), srcFileSize);
+    aclrtMallocHost((void **)(&dstHost), dstFileSize);
+
+    aclrtMalloc((void **)&src0Device, srcFileSize, ACL_MEM_MALLOC_HUGE_FIRST);
+    aclrtMalloc((void **)&src1Device, srcFileSize, ACL_MEM_MALLOC_HUGE_FIRST);
+    aclrtMalloc((void **)&dstDevice, dstFileSize, ACL_MEM_MALLOC_HUGE_FIRST);
+
+    size_t src0FileSize = srcFileSize;
+    size_t src1FileSize = srcFileSize;
+    size_t dstFileSizeActual = dstFileSize;
+
+    if (!ReadFile((caseDir + "/input1.bin").c_str(), src0FileSize, src0Host, srcFileSize)) {
+        std::fprintf(stderr, "[ERROR] failed to read %s/input1.bin\n", caseDir.c_str());
+        rc = 1;
+    }
+    if (rc == 0 && !ReadFile((caseDir + "/input2.bin").c_str(), src1FileSize, src1Host, srcFileSize)) {
+        std::fprintf(stderr, "[ERROR] failed to read %s/input2.bin\n", caseDir.c_str());
+        rc = 1;
+    }
+
+    if (rc == 0) {
+        aclrtMemcpy(src0Device, srcFileSize, src0Host, srcFileSize, ACL_MEMCPY_HOST_TO_DEVICE);
+        aclrtMemcpy(src1Device, srcFileSize, src1Host, srcFileSize, ACL_MEMCPY_HOST_TO_DEVICE);
+
+        tc.launch(src0Device, src1Device, dstDevice, stream);
+
+        aclrtSynchronizeStream(stream);
+        aclrtMemcpy(dstHost, dstFileSize, dstDevice, dstFileSize, ACL_MEMCPY_DEVICE_TO_HOST);
+    }
+
+    if (rc == 0 && !WriteFile((caseDir + "/output.bin").c_str(), dstHost, dstFileSizeActual)) {
+        std::fprintf(stderr, "[ERROR] failed to write %s/output.bin\n", caseDir.c_str());
+        rc = 1;
+    }
+    // Buffers are released on both the success and the failure path.
+    if (src0Device != nullptr)
+        aclrtFree(src0Device);
+    if (src1Device != nullptr)
+        aclrtFree(src1Device);
+    if (dstDevice != nullptr)
+        aclrtFree(dstDevice);
+    if (src0Host != nullptr)
+        aclrtFreeHost(src0Host);
+    if (src1Host != nullptr)
+        aclrtFreeHost(src1Host);
+    if (dstHost != nullptr)
+        aclrtFreeHost(dstHost);
+
+    if (rc == 0)
+        std::printf("[INFO] case %s done\n", tc.name);
+    return rc;
+}
+// Entry point: runs every case in kCases (or only the one named by argv[1]) and stops at the first failure.
+int main(int argc, char *argv[]) {
+    const char *caseFilter = (argc > 1) ? argv[1] : nullptr;
+
+    int rc = 0;
+    int deviceId = 0;
+    aclrtStream stream = nullptr;
+
+    aclInit(nullptr);
+    if (const char *envDevice = std::getenv("ACL_DEVICE_ID")) {
+        deviceId = std::atoi(envDevice);
+    }
+    aclrtSetDevice(deviceId);
+    aclrtCreateStream(&stream);
+
+    for (size_t i = 0; i < kNumCases; ++i) {
+        if (caseFilter != nullptr && std::strcmp(kCases[i].name, caseFilter) != 0) {
+            continue;
+        }
+        int ret = RunCase(kCases[i], deviceId, stream);
+        if (ret != 0) {
+            std::fprintf(stderr, "[ERROR] case %s failed\n", kCases[i].name);
+            rc = 1;
+            break;
+        }
+    }
+
+    if (stream != nullptr)
+        aclrtDestroyStream(stream);
+    aclrtResetDevice(deviceId);
+    aclFinalize();
+
+    return rc;
+}
diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tcmp/tcmp.pto b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tcmp/tcmp.pto
new file mode 100644
index 000000000..0847b4c4e
--- /dev/null
+++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tcmp/tcmp.pto
@@ -0,0 +1,209 @@
+// Copyright (c) 2026 Huawei Technologies Co., Ltd.
+// This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+// CANN Open Software License Agreement Version 2.0 (the "License").
+// Please refer to the License for details. You may not use this file except in compliance with the License.
+// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+// See LICENSE in the root of the software repository for the full text of the License.
+
+// TileLang ST kernels for pto.tcmp: tload(a) + tload(b) + tcmp(a,b)->c(mask) + tstore(c).
+// Output mask is packed: 1 bit per element, stored as i8 array (same shape as input).
+// Aligned with testcase/tcmp test cases.
// Three smoke kernels, each doing: tload(a) + tload(b) + tcmp(a, b) -> c + tstore(c).
// NOTE(review): type parameters (e.g. on !pto.ptr / !pto.tile_buf / #pto) are missing in the
// reviewed text; the code below is reproduced as found -- restore them from the original file.
module attributes {pto.target_arch = "a5", pto.kernel_kind = #pto.kernel_kind} {
  // f32 8x64; comparison mode is GT per the kernel name.
  func.func @TCMP_f32_8x64_gt(%a_ptr: !pto.ptr, %b_ptr: !pto.ptr, %c_ptr: !pto.ptr) attributes {pto.kernel} {
    %c0 = arith.constant 0 : index
    %c1 = arith.constant 1 : index
    %c8 = arith.constant 8 : index
    %c64 = arith.constant 64 : index
    %c512 = arith.constant 512 : index

    // 5-D views over the flat buffers; the two innermost dims are the 8x64 tile (row stride 64).
    %a_view = pto.make_tensor_view %a_ptr,
      shape = [%c1, %c1, %c1, %c8, %c64],
      strides = [%c512, %c512, %c512, %c64, %c1]
      : !pto.tensor_view<1x1x1x8x64xf32>
    %b_view = pto.make_tensor_view %b_ptr,
      shape = [%c1, %c1, %c1, %c8, %c64],
      strides = [%c512, %c512, %c512, %c64, %c1]
      : !pto.tensor_view<1x1x1x8x64xf32>
    %c_view = pto.make_tensor_view %c_ptr,
      shape = [%c1, %c1, %c1, %c8, %c64],
      strides = [%c512, %c512, %c512, %c64, %c1]
      : !pto.tensor_view<1x1x1x8x64xi8>

    // Partitions cover each view in full (zero offsets, full sizes).
    %a_part = pto.partition_view %a_view,
      offsets = [%c0, %c0, %c0, %c0, %c0],
      sizes = [%c1, %c1, %c1, %c8, %c64]
      : !pto.tensor_view<1x1x1x8x64xf32> -> !pto.partition_tensor_view<1x1x1x8x64xf32>
    %b_part = pto.partition_view %b_view,
      offsets = [%c0, %c0, %c0, %c0, %c0],
      sizes = [%c1, %c1, %c1, %c8, %c64]
      : !pto.tensor_view<1x1x1x8x64xf32> -> !pto.partition_tensor_view<1x1x1x8x64xf32>
    %c_part = pto.partition_view %c_view,
      offsets = [%c0, %c0, %c0, %c0, %c0],
      sizes = [%c1, %c1, %c1, %c8, %c64]
      : !pto.tensor_view<1x1x1x8x64xi8> -> !pto.partition_tensor_view<1x1x1x8x64xi8>

    %a = pto.alloc_tile
      : !pto.tile_buf
    %b = pto.alloc_tile
      : !pto.tile_buf
    %c = pto.alloc_tile
      : !pto.tile_buf

    pto.tload ins(%a_part : !pto.partition_tensor_view<1x1x1x8x64xf32>)
              outs(%a : !pto.tile_buf)
    pto.tload ins(%b_part : !pto.partition_tensor_view<1x1x1x8x64xf32>)
              outs(%b : !pto.tile_buf)

    pto.tcmp ins(%a, %b {cmpMode = #pto} : !pto.tile_buf,
             !pto.tile_buf)
             outs(%c : !pto.tile_buf)

    pto.tstore ins(%c : !pto.tile_buf)
               outs(%c_part : !pto.partition_tensor_view<1x1x1x8x64xi8>)
    return
  }

  // i32 4x64; comparison mode is NE per the kernel name.

  func.func @TCMP_i32_4x64_ne(%a_ptr: !pto.ptr, %b_ptr: !pto.ptr, %c_ptr: !pto.ptr) attributes {pto.kernel} {
    %c0 = arith.constant 0 : index
    %c1 = arith.constant 1 : index
    %c4 = arith.constant 4 : index
    %c64 = arith.constant 64 : index
    %c256 = arith.constant 256 : index

    %a_view = pto.make_tensor_view %a_ptr,
      shape = [%c1, %c1, %c1, %c4, %c64],
      strides = [%c256, %c256, %c256, %c64, %c1]
      : !pto.tensor_view<1x1x1x4x64xi32>
    %b_view = pto.make_tensor_view %b_ptr,
      shape = [%c1, %c1, %c1, %c4, %c64],
      strides = [%c256, %c256, %c256, %c64, %c1]
      : !pto.tensor_view<1x1x1x4x64xi32>
    %c_view = pto.make_tensor_view %c_ptr,
      shape = [%c1, %c1, %c1, %c4, %c64],
      strides = [%c256, %c256, %c256, %c64, %c1]
      : !pto.tensor_view<1x1x1x4x64xi8>

    %a_part = pto.partition_view %a_view,
      offsets = [%c0, %c0, %c0, %c0, %c0],
      sizes = [%c1, %c1, %c1, %c4, %c64]
      : !pto.tensor_view<1x1x1x4x64xi32> -> !pto.partition_tensor_view<1x1x1x4x64xi32>
    %b_part = pto.partition_view %b_view,
      offsets = [%c0, %c0, %c0, %c0, %c0],
      sizes = [%c1, %c1, %c1, %c4, %c64]
      : !pto.tensor_view<1x1x1x4x64xi32> -> !pto.partition_tensor_view<1x1x1x4x64xi32>
    %c_part = pto.partition_view %c_view,
      offsets = [%c0, %c0, %c0, %c0, %c0],
      sizes = [%c1, %c1, %c1, %c4, %c64]
      : !pto.tensor_view<1x1x1x4x64xi8> -> !pto.partition_tensor_view<1x1x1x4x64xi8>

    %a = pto.alloc_tile
      : !pto.tile_buf
    %b = pto.alloc_tile
      : !pto.tile_buf
    %c = pto.alloc_tile
      : !pto.tile_buf

    pto.tload ins(%a_part : !pto.partition_tensor_view<1x1x1x4x64xi32>)
              outs(%a : !pto.tile_buf)
    pto.tload ins(%b_part : !pto.partition_tensor_view<1x1x1x4x64xi32>)
              outs(%b : !pto.tile_buf)

    pto.tcmp ins(%a, %b {cmpMode = #pto} : !pto.tile_buf,
             !pto.tile_buf)
             outs(%c : !pto.tile_buf)

    pto.tstore ins(%c : !pto.tile_buf)
               outs(%c_part : !pto.partition_tensor_view<1x1x1x4x64xi8>)
    return
  }

  // i16 32x32; comparison mode is EQ per the kernel name.

  func.func @TCMP_i16_32x32_eq(%a_ptr: !pto.ptr, %b_ptr: !pto.ptr, %c_ptr: !pto.ptr) attributes {pto.kernel} {
    %c0 = arith.constant 0 : index
    %c1 = arith.constant 1 : index
    %c32 = arith.constant 32 : index
    %c1024 = arith.constant 1024 : index

    %a_view = pto.make_tensor_view %a_ptr,
      shape = [%c1, %c1, %c1, %c32, %c32],
      strides = [%c1024, %c1024, %c1024, %c32, %c1]
      : !pto.tensor_view<1x1x1x32x32xi16>
    %b_view = pto.make_tensor_view %b_ptr,
      shape = [%c1, %c1, %c1, %c32, %c32],
      strides = [%c1024, %c1024, %c1024, %c32, %c1]
      : !pto.tensor_view<1x1x1x32x32xi16>
    %c_view = pto.make_tensor_view %c_ptr,
      shape = [%c1, %c1, %c1, %c32, %c32],
      strides = [%c1024, %c1024, %c1024, %c32, %c1]
      : !pto.tensor_view<1x1x1x32x32xi8>

    %a_part = pto.partition_view %a_view,
      offsets = [%c0, %c0, %c0, %c0, %c0],
      sizes = [%c1, %c1, %c1, %c32, %c32]
      : !pto.tensor_view<1x1x1x32x32xi16> -> !pto.partition_tensor_view<1x1x1x32x32xi16>
    %b_part = pto.partition_view %b_view,
      offsets = [%c0, %c0, %c0, %c0, %c0],
      sizes = [%c1, %c1, %c1, %c32, %c32]
      : !pto.tensor_view<1x1x1x32x32xi16> -> !pto.partition_tensor_view<1x1x1x32x32xi16>
    // Unlike the other two kernels, the destination partition covers only the first 16 rows
    // of the 32x32 i8 view. NOTE(review): presumably because the mask is packed -- confirm.
    %c16 = arith.constant 16 : index
    %c_part = pto.partition_view %c_view,
      offsets = [%c0, %c0, %c0, %c0, %c0],
      sizes = [%c1, %c1, %c1, %c16, %c32]
      : !pto.tensor_view<1x1x1x32x32xi8> -> !pto.partition_tensor_view<1x1x1x16x32xi8>

    %a = pto.alloc_tile
      : !pto.tile_buf
    %b = pto.alloc_tile
      : !pto.tile_buf
    %c = pto.alloc_tile
      : !pto.tile_buf

    pto.tload ins(%a_part : !pto.partition_tensor_view<1x1x1x32x32xi16>)
              outs(%a : !pto.tile_buf)
    pto.tload ins(%b_part : !pto.partition_tensor_view<1x1x1x32x32xi16>)
              outs(%b : !pto.tile_buf)

    pto.tcmp ins(%a, %b {cmpMode = #pto} : !pto.tile_buf,
             !pto.tile_buf)
             outs(%c : !pto.tile_buf)

    pto.tstore ins(%c : !pto.tile_buf)
               outs(%c_part : !pto.partition_tensor_view<1x1x1x16x32xi8>)
    return
  }

  // No further kernels in this smoke subset.
}
# coding=utf-8

"""Single source of truth for tcmps ST test cases.

tcmps: packed mask of (src < scalar); dst stores the packed predicate mask.
The full table lists 32-bit (f32, i32) and 16-bit (f16, i16) source types;
the output dtype is always uint8.

Cases reference testcase/tcmps with various shapes and valid regions. Only
the names in ``_SMOKE_CASE_NAMES`` are exported through ``CASES``.
"""

import numpy as np


def _case(name, dtype, shape, valid_shape=None):
    """Build one case dict; ``valid_shape`` defaults to the full ``shape``."""
    return {
        "name": name,
        "dtype": dtype,
        "out_dtype": np.uint8,
        "shape": shape,
        "valid_shape": shape if valid_shape is None else valid_shape,
        "eps": 0,
    }


CASES = [
    # float32 cases matching testcase/tcmps
    _case("f32_1x64", np.float32, (1, 64)),
    _case("f32_4x64", np.float32, (4, 64)),
    _case("f32_8x64", np.float32, (8, 64)),
    _case("f32_32x64", np.float32, (32, 64)),
    _case("f32_128x128", np.float32, (128, 128)),
    # int32 cases matching testcase/tcmps
    _case("i32_16x32", np.int32, (16, 32)),
    _case("i32_32x32", np.int32, (32, 32)),
    # NOTE(review): the name says 32x64 but the allocated shape is (64, 64) -- confirm.
    _case("i32_32x64_valid32x64", np.int32, (64, 64), valid_shape=(32, 64)),
    # Non-aligned cases
    _case("f32_7x448", np.float32, (7, 448)),
    _case("f32_256x16", np.float32, (256, 16)),
    _case("i32_31x128", np.int32, (31, 128)),
    # 16-bit cases (f16, i16)
    _case("f16_32x128", np.float16, (32, 128)),
    _case("i16_32x128", np.int16, (32, 128)),
]

# Smoke run: keep only the named subset, failing fast on a name missing from the table.
_SMOKE_CASE_NAMES = ['f32_1x64', 'i32_16x32']
_SMOKE_CASE_NAME_SET = set(_SMOKE_CASE_NAMES)
_known_names = {case["name"] for case in CASES}  # built once, not once per smoke name
_missing = [name for name in _SMOKE_CASE_NAMES if name not in _known_names]
if _missing:
    raise RuntimeError("unknown smoke case(s): " + ", ".join(_missing))
CASES = [case for case in CASES if case["name"] in _SMOKE_CASE_NAME_SET]
# coding=utf-8

import os
import sys
import numpy as np

from cases import CASES
from st_common import result_cmp, style_fail, style_pass, validate_cases


def _expected_output_bytes(elem_size, valid_rows, valid_cols):
    """Return the packed-mask size in bytes for one case (same math as gen_data.py).

    Args:
        elem_size: source element size in bytes (4, 2, or anything else for the 1-byte path).
        valid_rows: rows of the valid region.
        valid_cols: columns of the valid region.
    """
    lanes = 256 // elem_size
    if elem_size == 4:
        # 4-byte sources: sized from the linear element count, 16 bytes per two repeats.
        repeat_times = (valid_rows * valid_cols + lanes - 1) // lanes + 1
        return (repeat_times // 2) * 16
    # 2-byte sources use 16 bytes per iteration; the remaining path uses 32.
    bytes_per_iter = 16 if elem_size == 2 else 32
    iters_per_row = (valid_cols + lanes - 1) // lanes
    return valid_rows * iters_per_row * bytes_per_iter


def main():
    """Compare each case's output.bin with golden.bin byte-for-byte.

    An optional first CLI argument restricts the run to the case with that
    exact name. Exits with status 2 if any compared case mismatches.
    """
    validate_cases(CASES)
    case_filter = sys.argv[1] if len(sys.argv) > 1 else None

    all_passed = True
    for case in CASES:
        if case_filter is not None and case["name"] != case_filter:
            continue

        case_dir = case["name"]
        vr, vc = case["valid_shape"]
        elem_size = np.dtype(case["dtype"]).itemsize
        expected_bytes = _expected_output_bytes(elem_size, vr, vc)

        # Read golden (already correct size from gen_data.py)
        golden = np.fromfile(os.path.join(case_dir, "golden.bin"), dtype=np.uint8)

        # Read output and truncate/zero-pad to expected size
        output = np.fromfile(os.path.join(case_dir, "output.bin"), dtype=np.uint8)
        if len(output) > expected_bytes:
            output = output[:expected_bytes]
        elif len(output) < expected_bytes:
            output = np.pad(output, (0, expected_bytes - len(output)), mode='constant')

        if golden.shape != output.shape:
            # Check shape first: an elementwise compare of different-length arrays cannot broadcast.
            ok = False
            print(style_fail(f"[ERROR] Mismatch: shapes differ golden={golden.shape} output={output.shape}"))
        else:
            diff_indices = np.flatnonzero(golden != output)
            ok = diff_indices.size == 0
            if not ok:
                # Report the first mismatching byte for debugging.
                diff_idx = diff_indices[0]
                max_diff = int(np.max(np.abs(golden.astype(int) - output.astype(int))))
                print(style_fail(f"[ERROR] Mismatch: max diff={max_diff} at byte idx={diff_idx} "
                                 f"(golden=0x{golden[diff_idx]:02x}, output=0x{output[diff_idx]:02x})"))

        if ok:
            print(style_pass(f"[INFO] {case['name']}: compare passed"))
        else:
            print(style_fail(f"[ERROR] {case['name']}: compare failed"))
            all_passed = False

    if not all_passed:
        sys.exit(2)
    print(style_pass("[INFO] all cases passed"))


if __name__ == "__main__":
    main()
# coding=utf-8

import numpy as np
from cases import CASES
from st_common import validate_cases, setup_case_rng, save_case_data

# Scalar value for comparison (matches the scalar passed in launch.cpp)
SCALAR = 5.0


def _pack_mask(cmp_result, elem_size):
    """Pack a 2-D 0/1 comparison result into the golden byte layout.

    One bit is set per true element. Layout by source element size:
      - 4 bytes: elements are indexed linearly (row * cols + col); every
        2 * lanes elements fill one 16-byte iteration.
      - 2 bytes: each row is split into iterations of `lanes` columns,
        16 bytes per iteration.
      - otherwise: as for 2 bytes, but 32 bytes per iteration.

    Args:
        cmp_result: array of shape (valid_rows, valid_cols); nonzero means "set".
        elem_size: source element size in bytes.

    Returns:
        1-D uint8 array holding the packed mask.
    """
    vr, vc = cmp_result.shape
    # NOTE(review): 256 is presumably the vector width in bytes -- confirm.
    lanes = 256 // elem_size
    rows, cols = np.nonzero(cmp_result)

    if elem_size == 4:
        bytes_per_iter = 16
        repeat_times = (vr * vc + lanes - 1) // lanes + 1
        total_iters = repeat_times // 2
        linear_idx = rows * vc + cols
        iter_idx = linear_idx // (2 * lanes)
        bit_pos = linear_idx % (2 * lanes)
    else:
        bytes_per_iter = 16 if elem_size == 2 else 32
        iters_per_row = (vc + lanes - 1) // lanes
        total_iters = vr * iters_per_row
        iter_idx = rows * iters_per_row + cols // lanes
        bit_pos = cols % lanes

    golden = np.zeros(total_iters * bytes_per_iter, dtype=np.uint8)
    byte_idx = iter_idx * bytes_per_iter + bit_pos // 8
    in_range = byte_idx < golden.size  # same bounds guard as the per-element version
    # ufunc.at accumulates correctly when several bits land in the same byte.
    np.bitwise_or.at(golden, byte_idx[in_range], (1 << (bit_pos[in_range] % 8)).astype(np.uint8))
    return golden


validate_cases(CASES)

for case in CASES:
    setup_case_rng(case)

    dtype = case["dtype"]
    out_dtype = case["out_dtype"]
    shape = case["shape"]
    valid_shape = case["valid_shape"]

    # Generate random input matching testcase/tcmps pattern
    # NOTE(review): float inputs are drawn from [-5, 5), so every float element is < SCALAR
    # and the float golden mask is all ones over the valid region -- confirm this is intended.
    if np.issubdtype(dtype, np.floating):
        input1 = np.random.randint(-5, 5, size=shape).astype(dtype)
    else:
        input1 = np.random.randint(1, 10, size=shape).astype(dtype)

    vr, vc = valid_shape
    if np.issubdtype(dtype, np.floating):
        scalar_val = dtype(SCALAR)
    else:
        scalar_val = dtype(int(SCALAR))

    # Element-wise comparison result (0 or 1 per element), "lt" mode to match the template.
    cmp_result = (input1[:vr, :vc] < scalar_val).astype(np.uint8, copy=False)

    golden = _pack_mask(cmp_result, np.dtype(dtype).itemsize)

    save_case_data(case["name"], {"input1": input1, "golden": golden})
    print(f"[INFO] gen_data: {case['name']} shape={shape} valid_shape={valid_shape} dtype={dtype.__name__} out_dtype={out_dtype.__name__} scalar={SCALAR}")
+// This program is free software, you can redistribute it and/or modify it under the terms and conditions of +// CANN Open Software License Agreement Version 2.0 (the "License"). +// Please refer to the License for details. You may not use this file except in compliance with the License. +// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +// See LICENSE in the root of the software repository for the full text of the License. + +#include + +#ifndef AICORE +#define AICORE [aicore] +#endif + +// Scalar value for comparison (must match gen_data.py SCALAR) +static constexpr float TCMP_SCALAR_F32 = 5.0f; +static constexpr int32_t TCMP_SCALAR_I32 = 5; + +// Case 0: f32 1x64 + +extern "C" __global__ AICORE void TCMP_f32_1x64(__gm__ float *src, __gm__ uint8_t *dst, float scalar); +extern "C" __global__ AICORE void TCMP_i32_16x32(__gm__ int32_t *src, __gm__ uint8_t *dst, int32_t scalar); + +void LaunchTCMP_f32_1x64(float *src, uint8_t *dst, void *stream) { + TCMP_f32_1x64<<<1, nullptr, stream>>>((__gm__ float *)src, (__gm__ uint8_t *)dst, TCMP_SCALAR_F32); +} + + + +void LaunchTCMP_i32_16x32(int32_t *src, uint8_t *dst, void *stream) { + TCMP_i32_16x32<<<1, nullptr, stream>>>((__gm__ int32_t *)src, (__gm__ uint8_t *)dst, TCMP_SCALAR_I32); +} diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tcmps/main.cpp b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tcmps/main.cpp new file mode 100644 index 000000000..ab9a328a6 --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tcmps/main.cpp @@ -0,0 +1,139 @@ +// Copyright (c) 2026 Huawei Technologies Co., Ltd. +// This program is free software, you can redistribute it and/or modify it under the terms and conditions of +// CANN Open Software License Agreement Version 2.0 (the "License"). +// Please refer to the License for details. 
You may not use this file except in compliance with the License. +// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +// See LICENSE in the root of the software repository for the full text of the License. + +// Host driver for TileLang tcmps ST — case-table driven. +// tcmps: dst = packed mask of (src < scalar). +// Numerical comparison is done externally by compare.py. + +#include "acl/acl.h" +#include "test_common.h" +#include +#include +#include +#include +#include +#include + +using namespace PtoTestCommon; + +// Kernel launch wrappers (defined in launch.cpp) +void LaunchTCMP_f32_1x64(float *src, uint8_t *dst, void *stream); +void LaunchTCMP_f32_4x64(float *src, uint8_t *dst, void *stream); +void LaunchTCMP_f32_32x64(float *src, uint8_t *dst, void *stream); +void LaunchTCMP_i32_16x32(int32_t *src, uint8_t *dst, void *stream); +void LaunchTCMP_i32_32x64_valid32x64(int32_t *src, uint8_t *dst, void *stream); +void LaunchTCMP_f32_256x16(float *src, uint8_t *dst, void *stream); +void LaunchTCMP_f16_32x128(uint16_t *src, uint8_t *dst, void *stream); + +struct TestCase { + const char *name; + void (*launch)(void *, void *, void *); // src, dst, stream + size_t rows; // allocated tile rows + size_t cols; // allocated tile cols + size_t validRows; // effective computation rows (<= rows) + size_t validCols; // effective computation cols (<= cols) + size_t srcElemSize; // bytes per source element + size_t dstElemSize; // bytes per destination element +}; + +static const TestCase kCases[] = { +{"f32_1x64", (void (*)(void*,void*,void*))LaunchTCMP_f32_1x64, 1, 64, 1, 64, sizeof(float), sizeof(uint8_t)}, +{"i32_16x32", (void (*)(void*,void*,void*))LaunchTCMP_i32_16x32, 16, 32, 16, 32, sizeof(int32_t), sizeof(uint8_t)}, +}; +static constexpr size_t kNumCases = sizeof(kCases) / sizeof(kCases[0]); + +static int 
RunCase(const TestCase &tc, int deviceId, aclrtStream stream) { + int rc = 0; + const size_t srcElemCount = tc.rows * tc.cols; + const size_t dstElemCount = tc.rows * tc.cols; + const size_t srcFileSize = srcElemCount * tc.srcElemSize; + const size_t dstFileSize = dstElemCount * tc.dstElemSize; + + std::printf("[INFO] === case: %s (shape=%zux%zu, valid=%zux%zu) ===\n", + tc.name, tc.rows, tc.cols, tc.validRows, tc.validCols); + + // Per-case data directory + std::string caseDir = std::string("./") + tc.name; + size_t inputFileSize = srcFileSize; + + void *srcHost = nullptr, *dstHost = nullptr; + void *srcDevice = nullptr, *dstDevice = nullptr; + + aclrtMallocHost(&srcHost, srcFileSize); + aclrtMallocHost(&dstHost, dstFileSize); + + aclrtMalloc(&srcDevice, srcFileSize, ACL_MEM_MALLOC_HUGE_FIRST); + aclrtMalloc(&dstDevice, dstFileSize, ACL_MEM_MALLOC_HUGE_FIRST); + + if (!ReadFile((caseDir + "/input1.bin").c_str(), inputFileSize, srcHost, srcFileSize)) { + std::fprintf(stderr, "[ERROR] failed to read %s/input1.bin\n", caseDir.c_str()); + rc = 1; + } + + if (rc == 0) { + aclrtMemcpy(srcDevice, srcFileSize, srcHost, srcFileSize, ACL_MEMCPY_HOST_TO_DEVICE); + + tc.launch(srcDevice, dstDevice, stream); + + aclrtSynchronizeStream(stream); + aclrtMemcpy(dstHost, dstFileSize, dstDevice, dstFileSize, ACL_MEMCPY_DEVICE_TO_HOST); + } + + if (rc == 0 && !WriteFile((caseDir + "/output.bin").c_str(), dstHost, dstFileSize)) { + std::fprintf(stderr, "[ERROR] failed to write %s/output.bin\n", caseDir.c_str()); + rc = 1; + } + + if (srcDevice != nullptr) + aclrtFree(srcDevice); + if (dstDevice != nullptr) + aclrtFree(dstDevice); + if (srcHost != nullptr) + aclrtFreeHost(srcHost); + if (dstHost != nullptr) + aclrtFreeHost(dstHost); + + if (rc == 0) + std::printf("[INFO] case %s done\n", tc.name); + return rc; +} + +int main(int argc, char *argv[]) { + // Optional case filter: ./tcmps [case_name] + const char *caseFilter = (argc > 1) ? 
argv[1] : nullptr; + + int rc = 0; + int deviceId = 0; + aclrtStream stream = nullptr; + + aclInit(nullptr); + if (const char *envDevice = std::getenv("ACL_DEVICE_ID")) { + deviceId = std::atoi(envDevice); + } + aclrtSetDevice(deviceId); + aclrtCreateStream(&stream); + + for (size_t i = 0; i < kNumCases; ++i) { + if (caseFilter != nullptr && std::strcmp(kCases[i].name, caseFilter) != 0) { + continue; + } + int ret = RunCase(kCases[i], deviceId, stream); + if (ret != 0) { + std::fprintf(stderr, "[ERROR] case %s failed\n", kCases[i].name); + rc = 1; + break; + } + } + + if (stream != nullptr) + aclrtDestroyStream(stream); + aclrtResetDevice(deviceId); + aclFinalize(); + + return rc; +} diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tcmps/tcmps.pto b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tcmps/tcmps.pto new file mode 100644 index 000000000..15e2970f4 --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tcmps/tcmps.pto @@ -0,0 +1,98 @@ +// Copyright (c) 2026 Huawei Technologies Co., Ltd. +// This program is free software, you can redistribute it and/or modify it under the terms and conditions of +// CANN Open Software License Agreement Version 2.0 (the "License"). +// Please refer to the License for details. You may not use this file except in compliance with the License. +// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +// See LICENSE in the root of the software repository for the full text of the License. + +// TileLang ST kernels for pto.tcmps: tload(src) + tcmps(src, scalar {cmpMode=lt})->dst + tstore(dst). +// Packed mask of (src < scalar), output stored as packed predicate mask (uint8). +// Supports 32-bit source types: f32, i32. +// Compiled by ptoas --enable-insert-sync --enable-tile-op-expand --pto-backend=vpto +// to produce a fatobj object. 
// Two smoke kernels, each doing: tload(src) + tcmps(src, scalar) -> dst + tstore(dst).
// NOTE(review): type parameters (e.g. on !pto.ptr / !pto.tile_buf / #pto) are missing in the
// reviewed text; the code below is reproduced as found -- restore them from the original file.
module attributes {pto.target_arch = "a5", pto.kernel_kind = #pto.kernel_kind} {

  // f32 1x64 (64 elements), f32 scalar operand.
  func.func @TCMP_f32_1x64(%src_ptr: !pto.ptr, %dst_ptr: !pto.ptr, %scalar: f32) attributes {pto.kernel} {
    %c0 = arith.constant 0 : index
    %c1 = arith.constant 1 : index
    %c64 = arith.constant 64 : index
    // %c64i is not referenced anywhere in this function.
    %c64i = arith.constant 64 : index

    // 5-D views over the flat buffers; the innermost dim is the 64-element row.
    %src_view = pto.make_tensor_view %src_ptr,
      shape = [%c1, %c1, %c1, %c1, %c64],
      strides = [%c64, %c64, %c64, %c64, %c1]
      : !pto.tensor_view<1x1x1x1x64xf32>
    %dst_view = pto.make_tensor_view %dst_ptr,
      shape = [%c1, %c1, %c1, %c1, %c64],
      strides = [%c64, %c64, %c64, %c64, %c1]
      : !pto.tensor_view<1x1x1x1x64xui8>

    // Partitions cover each view in full (zero offsets, full sizes).
    %src_part = pto.partition_view %src_view,
      offsets = [%c0, %c0, %c0, %c0, %c0],
      sizes = [%c1, %c1, %c1, %c1, %c64]
      : !pto.tensor_view<1x1x1x1x64xf32> -> !pto.partition_tensor_view<1x1x1x1x64xf32>
    %dst_part = pto.partition_view %dst_view,
      offsets = [%c0, %c0, %c0, %c0, %c0],
      sizes = [%c1, %c1, %c1, %c1, %c64]
      : !pto.tensor_view<1x1x1x1x64xui8> -> !pto.partition_tensor_view<1x1x1x1x64xui8>

    %src = pto.alloc_tile
      : !pto.tile_buf
    %dst = pto.alloc_tile
      : !pto.tile_buf

    pto.tload ins(%src_part : !pto.partition_tensor_view<1x1x1x1x64xf32>)
              outs(%src : !pto.tile_buf)
    pto.tcmps ins(%src, %scalar {cmpMode = #pto} : !pto.tile_buf, f32)
              outs(%dst : !pto.tile_buf)
    pto.tstore ins(%dst : !pto.tile_buf)
               outs(%dst_part : !pto.partition_tensor_view<1x1x1x1x64xui8>)
    return
  }

  // i32 16x32 (512 elements), i32 scalar operand.

  func.func @TCMP_i32_16x32(%src_ptr: !pto.ptr, %dst_ptr: !pto.ptr, %scalar: i32) attributes {pto.kernel} {
    %c0 = arith.constant 0 : index
    %c1 = arith.constant 1 : index
    %c16 = arith.constant 16 : index
    %c32 = arith.constant 32 : index
    %c512 = arith.constant 512 : index

    %src_view = pto.make_tensor_view %src_ptr,
      shape = [%c1, %c1, %c1, %c16, %c32],
      strides = [%c512, %c512, %c512, %c32, %c1]
      : !pto.tensor_view<1x1x1x16x32xi32>
    %dst_view = pto.make_tensor_view %dst_ptr,
      shape = [%c1, %c1, %c1, %c16, %c32],
      strides = [%c512, %c512, %c512, %c32, %c1]
      : !pto.tensor_view<1x1x1x16x32xui8>

    %src_part = pto.partition_view %src_view,
      offsets = [%c0, %c0, %c0, %c0, %c0],
      sizes = [%c1, %c1, %c1, %c16, %c32]
      : !pto.tensor_view<1x1x1x16x32xi32> -> !pto.partition_tensor_view<1x1x1x16x32xi32>
    %dst_part = pto.partition_view %dst_view,
      offsets = [%c0, %c0, %c0, %c0, %c0],
      sizes = [%c1, %c1, %c1, %c16, %c32]
      : !pto.tensor_view<1x1x1x16x32xui8> -> !pto.partition_tensor_view<1x1x1x16x32xui8>

    %src = pto.alloc_tile
      : !pto.tile_buf
    %dst = pto.alloc_tile
      : !pto.tile_buf

    pto.tload ins(%src_part : !pto.partition_tensor_view<1x1x1x16x32xi32>)
              outs(%src : !pto.tile_buf)
    pto.tcmps ins(%src, %scalar {cmpMode = #pto} : !pto.tile_buf, i32)
              outs(%dst : !pto.tile_buf)
    pto.tstore ins(%dst : !pto.tile_buf)
               outs(%dst_part : !pto.partition_tensor_view<1x1x1x16x32xui8>)
    return
  }

  // No further kernels in this smoke subset.
}
# coding=utf-8

"""Single source of truth for tcolargmax ST test cases.

Each case defines:
  - name: case identifier, used as subdirectory name and by main.cpp kCases[].
  - dtype: numpy dtype for input data (e.g. np.float32).
  - shape: (rows, cols) — allocated tile dimensions for input (src and tmp).
  - valid_shape: (valid_rows, valid_cols) — effective computation region for input (src and tmp).
  - dst_shape: (1, cols) — allocated tile dimensions for output (indices).
  - dst_valid_shape: (1, valid_cols) — effective computation region for output (indices).
  - dst_dtype: numpy dtype for output indices (np.int32).
  - eps: tolerance for numpy.allclose (atol and rtol).

gen_data.py and compare.py both import this list to avoid redundant definitions.

The complete dtype x geometry table is built first and then narrowed to the
smoke subset listed in _SMOKE_CASE_NAMES, so the CASES name importers see
contains only the smoke cases.
"""

import numpy as np

# (case-name prefix, numpy input dtype), in the order cases are emitted.
_INPUT_DTYPES = (
    ("f32", np.float32),
    ("f16", np.float16),
    ("ui32", np.uint32),
    ("ui16", np.uint16),
    ("ui8", np.uint8),
    ("i8", np.int8),
)

# (rows, cols, valid_rows, valid_cols) of the input tile; every dtype is paired
# with every geometry.
_GEOMETRIES = (
    (1, 256, 1, 255),
    (16, 128, 16, 127),
    (16, 256, 15, 255),
)


def _make_case(prefix, dtype, rows, cols, valid_rows, valid_cols):
    """Return the case dict for one dtype/geometry pair.

    The output tile always has a single row: one int32 index per input column.
    """
    return {
        "name": f"{prefix}_{rows}x{cols}",
        "dtype": dtype,
        "shape": (rows, cols),
        "valid_shape": (valid_rows, valid_cols),
        "dst_shape": (1, cols),
        "dst_valid_shape": (1, valid_cols),
        "dst_dtype": np.int32,
        "eps": 0,
    }


CASES = [
    _make_case(prefix, dtype, *geometry)
    for prefix, dtype in _INPUT_DTYPES
    for geometry in _GEOMETRIES
]

_SMOKE_CASE_NAMES = ['f32_1x256', 'f16_1x256']
_SMOKE_CASE_NAME_SET = set(_SMOKE_CASE_NAMES)
# Build the lookup set once instead of once per smoke name.
_KNOWN_CASE_NAMES = {case["name"] for case in CASES}
_missing = [name for name in _SMOKE_CASE_NAMES if name not in _KNOWN_CASE_NAMES]
if _missing:
    # A smoke name that is not in the full table is a configuration error.
    raise RuntimeError("unknown smoke case(s): " + ", ".join(_missing))
CASES = [case for case in CASES if case["name"] in _SMOKE_CASE_NAME_SET]
# coding=utf-8

"""Compare kernel output against golden data for the tcolargmax ST cases."""

import os
import sys
import numpy as np

from cases import CASES
from st_common import result_cmp, style_fail, style_pass, validate_cases


def main():
    """Compare output.bin with golden.bin for each selected case.

    An optional first command-line argument restricts the run to the case with
    that exact name. Only the region given by the case's dst_valid_shape is
    compared. Exits with status 2 when any case fails, when a data file cannot
    be read or has the wrong size, or when a filter was given but matches no
    case.
    """
    validate_cases(CASES)
    case_filter = sys.argv[1] if len(sys.argv) > 1 else None

    selected = [
        case for case in CASES
        if case_filter is None or case["name"] == case_filter
    ]
    if case_filter is not None and not selected:
        # Without this guard a mistyped case name would report success
        # although nothing was compared.
        print(style_fail(f"[ERROR] no case named {case_filter!r}"))
        sys.exit(2)

    all_passed = True
    for case in selected:
        case_dir = case["name"]
        dst_shape = case["dst_shape"]
        dst_dtype = case["dst_dtype"]
        vr, vc = case["dst_valid_shape"]

        try:
            golden = np.fromfile(os.path.join(case_dir, "golden.bin"), dtype=dst_dtype).reshape(dst_shape)
            output = np.fromfile(os.path.join(case_dir, "output.bin"), dtype=dst_dtype).reshape(dst_shape)
        except (OSError, ValueError) as err:
            # Missing file (OSError) or wrong element count (ValueError from
            # reshape): report it as a failure of this case and keep going.
            print(style_fail(f"[ERROR] {case['name']}: compare failed ({err})"))
            all_passed = False
            continue

        ok = result_cmp(golden[:vr, :vc], output[:vr, :vc], case["eps"])
        if ok:
            print(style_pass(f"[INFO] {case['name']}: compare passed"))
        else:
            print(style_fail(f"[ERROR] {case['name']}: compare failed"))
            all_passed = False

    if not all_passed:
        sys.exit(2)
    print(style_pass("[INFO] all cases passed"))


if __name__ == "__main__":
    main()
# coding=utf-8

"""Generate input and golden data for the tcolargmax ST cases."""

import numpy as np
from cases import CASES
from st_common import validate_cases, setup_case_rng, save_case_data

validate_cases(CASES)

for case in CASES:
    setup_case_rng(case)

    dtype = case["dtype"]
    shape = case["shape"]
    valid_shape = case["valid_shape"]
    dst_shape = case["dst_shape"]
    dst_valid_shape = case["dst_valid_shape"]
    dst_dtype = case["dst_dtype"]

    # Integers in [1, 10) converted to the case dtype.
    input1 = np.random.randint(1, 10, size=shape).astype(dtype)

    vr, vc = valid_shape
    dvr, dvc = dst_valid_shape
    golden = np.zeros(dst_shape, dtype=dst_dtype)
    # Per-column index of the maximum over the valid input rows; the result
    # has shape (1, vc).
    golden_result = np.argmax(input1[:vr, :vc], axis=0, keepdims=True).astype(dst_dtype)
    # Write into the region compare.py reads (dst_valid_shape). If a case's
    # input and output valid widths ever disagree, this assignment raises
    # instead of silently producing a golden of the wrong extent.
    golden[:dvr, :dvc] = golden_result

    save_case_data(case["name"], {"input": input1, "golden": golden})
    # np.dtype(...).name works for both scalar type classes and dtype instances.
    print(f"[INFO] gen_data: {case['name']} shape={shape} valid_shape={valid_shape} dst_shape={dst_shape} dtype={np.dtype(dtype).name}")
+ +#include + +#ifndef AICORE +#define AICORE [aicore] +#endif + +// Case 0: f32 1x256 (input: 1x256, tmp: 1x256, output: 1x256 indices) + +extern "C" __global__ AICORE void TCOLARGMAX_f32_1x256(__gm__ int32_t *dst, __gm__ float *tmp, __gm__ float *src); +extern "C" __global__ AICORE void TCOLARGMAX_f16_1x256(__gm__ int32_t *dst, __gm__ half *tmp, __gm__ half *src); + +void LaunchTCOLARGMAX_f16_1x256(void *dst, void *tmp, void *src, void *stream) { + TCOLARGMAX_f16_1x256<<<1, nullptr, stream>>>((__gm__ int32_t *)dst, (__gm__ half *)tmp, (__gm__ half *)src); +} + + + +void LaunchTCOLARGMAX_f32_1x256(int32_t *dst, float *tmp, float *src, void *stream) { + TCOLARGMAX_f32_1x256<<<1, nullptr, stream>>>((__gm__ int32_t *)dst, (__gm__ float *)tmp, (__gm__ float *)src); +} diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tcolargmax/main.cpp b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tcolargmax/main.cpp new file mode 100644 index 000000000..1034565d3 --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tcolargmax/main.cpp @@ -0,0 +1,171 @@ +// Copyright (c) 2026 Huawei Technologies Co., Ltd. +// This program is free software, you can redistribute it and/or modify it under the terms and conditions of +// CANN Open Software License Agreement Version 2.0 (the "License"). +// Please refer to the License for details. You may not use this file except in compliance with the License. +// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +// See LICENSE in the root of the software repository for the full text of the License. + +// Host driver for TileLang tcolargmax ST — case-table driven. +// Each case launches a different kernel variant, reads/writes from per-case subdirectory. +// Numerical comparison is done externally by compare.py. 
+ +#include "acl/acl.h" +#include "test_common.h" +#include +#include +#include +#include +#include +#include + +using namespace PtoTestCommon; + +// Kernel launch wrappers (defined in launch.cpp) +void LaunchTCOLARGMAX_f32_1x256(int32_t *dst, float *tmp, float *src, void *stream); +void LaunchTCOLARGMAX_f32_16x128(int32_t *dst, float *tmp, float *src, void *stream); +void LaunchTCOLARGMAX_f16_1x256(void *dst, void *tmp, void *src, void *stream); +void LaunchTCOLARGMAX_f16_16x256(void *dst, void *tmp, void *src, void *stream); +void LaunchTCOLARGMAX_ui32_16x128(void *dst, void *tmp, void *src, void *stream); +void LaunchTCOLARGMAX_ui16_1x256(void *dst, void *tmp, void *src, void *stream); +void LaunchTCOLARGMAX_ui16_16x256(void *dst, void *tmp, void *src, void *stream); +void LaunchTCOLARGMAX_ui8_16x128(void *dst, void *tmp, void *src, void *stream); +void LaunchTCOLARGMAX_i8_1x256(void *dst, void *tmp, void *src, void *stream); +void LaunchTCOLARGMAX_i8_16x256(void *dst, void *tmp, void *src, void *stream); + +using LaunchFnFloat = void (*)(int32_t *, float *, float *, void *); +using LaunchFnVoid = void (*)(void *, void *, void *, void *); + +struct TestCase { + const char *name; + void *launch; + size_t srcRows; + size_t srcCols; + size_t srcValidRows; + size_t srcValidCols; + size_t tmpRows; + size_t tmpCols; + size_t tmpValidRows; + size_t tmpValidCols; + size_t dstRows; + size_t dstCols; + size_t dstValidCols; + size_t srcElemSize; + size_t dstElemSize; + bool isFp16; + bool isUi32; + bool isUi16; + bool isUi8; + bool isI8; +}; + +static const TestCase kCases[] = { +{"f32_1x256", (void*)LaunchTCOLARGMAX_f32_1x256, 1, 256, 1, 255, 1, 256, 1, 255, 1, 256, 255, sizeof(float), sizeof(int32_t), false, false, false, false, false}, +{"f16_1x256", (void*)LaunchTCOLARGMAX_f16_1x256, 1, 256, 1, 255, 1, 256, 1, 255, 1, 256, 255, 2, sizeof(int32_t), true, false, false, false, false}, +}; +static constexpr size_t kNumCases = sizeof(kCases) / sizeof(kCases[0]); + +static 
int RunCase(const TestCase &tc, int deviceId, aclrtStream stream) { + int rc = 0; + const size_t srcElemCount = tc.srcRows * tc.srcCols; + const size_t srcFileSize = srcElemCount * tc.srcElemSize; + const size_t tmpElemCount = tc.tmpRows * tc.tmpCols; + const size_t tmpFileSize = tmpElemCount * tc.srcElemSize; + const size_t dstElemCount = tc.dstRows * tc.dstCols; + const size_t dstFileSize = dstElemCount * tc.dstElemSize; + + std::printf("[INFO] === case: %s (src=%zux%zu, tmp=%zux%zu, dst=%zux%zu, fp16=%d) ===\n", + tc.name, tc.srcRows, tc.srcCols, tc.tmpRows, tc.tmpCols, tc.dstRows, tc.dstCols, tc.isFp16); + + std::string caseDir = std::string("./") + tc.name; + size_t srcFileSizeVar = srcFileSize; + size_t tmpFileSizeVar = tmpFileSize; + size_t dstFileSizeVar = dstFileSize; + + void *srcHost = nullptr, *tmpHost = nullptr, *dstHost = nullptr; + void *srcDevice = nullptr, *tmpDevice = nullptr, *dstDevice = nullptr; + + aclrtMallocHost(&srcHost, srcFileSize); + aclrtMallocHost(&tmpHost, tmpFileSize); + aclrtMallocHost(&dstHost, dstFileSize); + + aclrtMalloc(&srcDevice, srcFileSize, ACL_MEM_MALLOC_HUGE_FIRST); + aclrtMalloc(&tmpDevice, tmpFileSize, ACL_MEM_MALLOC_HUGE_FIRST); + aclrtMalloc(&dstDevice, dstFileSize, ACL_MEM_MALLOC_HUGE_FIRST); + + if (!ReadFile((caseDir + "/input.bin").c_str(), srcFileSizeVar, srcHost, srcFileSize)) { + std::fprintf(stderr, "[ERROR] failed to read %s/input.bin\n", caseDir.c_str()); + rc = 1; + } + + if (rc == 0) { + aclrtMemcpy(srcDevice, srcFileSize, srcHost, srcFileSize, ACL_MEMCPY_HOST_TO_DEVICE); + + if (tc.isFp16 || tc.isUi32 || tc.isUi16 || tc.isUi8 || tc.isI8) { + LaunchFnVoid launch = (LaunchFnVoid)tc.launch; + launch(dstDevice, tmpDevice, srcDevice, stream); + } else { + LaunchFnFloat launch = (LaunchFnFloat)tc.launch; + launch((int32_t*)dstDevice, (float*)tmpDevice, (float*)srcDevice, stream); + } + + aclrtSynchronizeStream(stream); + aclrtMemcpy(dstHost, dstFileSize, dstDevice, dstFileSize, ACL_MEMCPY_DEVICE_TO_HOST); + } + 
+ if (rc == 0 && !WriteFile((caseDir + "/output.bin").c_str(), dstHost, dstFileSizeVar)) { + std::fprintf(stderr, "[ERROR] failed to write %s/output.bin\n", caseDir.c_str()); + rc = 1; + } + + if (srcDevice != nullptr) + aclrtFree(srcDevice); + if (tmpDevice != nullptr) + aclrtFree(tmpDevice); + if (dstDevice != nullptr) + aclrtFree(dstDevice); + if (srcHost != nullptr) + aclrtFreeHost(srcHost); + if (tmpHost != nullptr) + aclrtFreeHost(tmpHost); + if (dstHost != nullptr) + aclrtFreeHost(dstHost); + + if (rc == 0) + std::printf("[INFO] case %s done\n", tc.name); + return rc; +} + +int main(int argc, char *argv[]) { + const char *caseFilter = (argc > 1) ? argv[1] : nullptr; + + int rc = 0; + int deviceId = 0; + aclrtStream stream = nullptr; + + aclInit(nullptr); + if (const char *envDevice = std::getenv("ACL_DEVICE_ID")) { + deviceId = std::atoi(envDevice); + } + aclrtSetDevice(deviceId); + aclrtCreateStream(&stream); + + for (size_t i = 0; i < kNumCases; ++i) { + if (caseFilter != nullptr && std::strcmp(kCases[i].name, caseFilter) != 0) { + continue; + } + int ret = RunCase(kCases[i], deviceId, stream); + if (ret != 0) { + std::fprintf(stderr, "[ERROR] case %s failed\n", kCases[i].name); + rc = 1; + break; + } + } + + if (stream != nullptr) + aclrtDestroyStream(stream); + aclrtResetDevice(deviceId); + aclFinalize(); + + return rc; +} diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tcolargmax/tcolargmax.pto b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tcolargmax/tcolargmax.pto new file mode 100644 index 000000000..f48cdc10d --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tcolargmax/tcolargmax.pto @@ -0,0 +1,115 @@ +// Copyright (c) 2026 Huawei Technologies Co., Ltd. +// This program is free software, you can redistribute it and/or modify it under the terms and conditions of +// CANN Open Software License Agreement Version 2.0 (the "License"). +// Please refer to the License for details. 
You may not use this file except in compliance with the License. +// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +// See LICENSE in the root of the software repository for the full text of the License. + +// TileLang ST kernels for pto.tcolargmax: tload(src) + tcolargmax(src, tmp, dst) + tstore(dst). +// Multiple cases with different shapes in a single module. +// Compiled by ptoas --enable-insert-sync --enable-tile-op-expand --pto-backend=vpto +// to produce a fatobj object. + +module attributes {pto.target_arch = "a5", pto.kernel_kind = #pto.kernel_kind} { +// Case 0: f32 1x256 (input: 1x256, tmp: 1x256, output: 1x256 indices) + func.func @TCOLARGMAX_f32_1x256(%dst_ptr: !pto.ptr, %tmp_ptr: !pto.ptr, %src_ptr: !pto.ptr) attributes {pto.kernel} { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c256 = arith.constant 256 : index + %c255 = arith.constant 255 : index + + %src_view = pto.make_tensor_view %src_ptr, + shape = [%c1, %c1, %c1, %c1, %c256], + strides = [%c256, %c256, %c256, %c256, %c1] + : !pto.tensor_view<1x1x1x1x256xf32> + %tmp_view = pto.make_tensor_view %tmp_ptr, + shape = [%c1, %c1, %c1, %c1, %c256], + strides = [%c256, %c256, %c256, %c256, %c1] + : !pto.tensor_view<1x1x1x1x256xf32> + %dst_view = pto.make_tensor_view %dst_ptr, + shape = [%c1, %c1, %c1, %c1, %c256], + strides = [%c256, %c256, %c256, %c256, %c1] + : !pto.tensor_view<1x1x1x1x256xi32> + + %src_part = pto.partition_view %src_view, + offsets = [%c0, %c0, %c0, %c0, %c0], + sizes = [%c1, %c1, %c1, %c1, %c255] + : !pto.tensor_view<1x1x1x1x256xf32> -> !pto.partition_tensor_view<1x1x1x1x255xf32> + %tmp_part = pto.partition_view %tmp_view, + offsets = [%c0, %c0, %c0, %c0, %c0], + sizes = [%c1, %c1, %c1, %c1, %c255] + : !pto.tensor_view<1x1x1x1x256xf32> -> !pto.partition_tensor_view<1x1x1x1x255xf32> + %dst_part = 
pto.partition_view %dst_view, + offsets = [%c0, %c0, %c0, %c0, %c0], + sizes = [%c1, %c1, %c1, %c1, %c255] + : !pto.tensor_view<1x1x1x1x256xi32> -> !pto.partition_tensor_view<1x1x1x1x255xi32> + + %src = pto.alloc_tile : !pto.tile_buf + %tmp = pto.alloc_tile : !pto.tile_buf + %dst = pto.alloc_tile : !pto.tile_buf + + pto.tload ins(%src_part : !pto.partition_tensor_view<1x1x1x1x255xf32>) + outs(%src : !pto.tile_buf) + + pto.tcolargmax ins(%src, %tmp : !pto.tile_buf, + !pto.tile_buf) + outs(%dst : !pto.tile_buf) + + pto.tstore ins(%dst : !pto.tile_buf) + outs(%dst_part : !pto.partition_tensor_view<1x1x1x1x255xi32>) + return + } + + // Case 1: f32 16x128 (input: 16x128, tmp: 16x128, output: 1x128 indices) + + func.func @TCOLARGMAX_f16_1x256(%dst_ptr: !pto.ptr, %tmp_ptr: !pto.ptr, %src_ptr: !pto.ptr) attributes {pto.kernel} { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c256 = arith.constant 256 : index + %c255 = arith.constant 255 : index + + %src_view = pto.make_tensor_view %src_ptr, + shape = [%c1, %c1, %c1, %c1, %c256], + strides = [%c256, %c256, %c256, %c256, %c1] + : !pto.tensor_view<1x1x1x1x256xf16> + %tmp_view = pto.make_tensor_view %tmp_ptr, + shape = [%c1, %c1, %c1, %c1, %c256], + strides = [%c256, %c256, %c256, %c256, %c1] + : !pto.tensor_view<1x1x1x1x256xf16> + %dst_view = pto.make_tensor_view %dst_ptr, + shape = [%c1, %c1, %c1, %c1, %c256], + strides = [%c256, %c256, %c256, %c256, %c1] + : !pto.tensor_view<1x1x1x1x256xi32> + + %src_part = pto.partition_view %src_view, + offsets = [%c0, %c0, %c0, %c0, %c0], + sizes = [%c1, %c1, %c1, %c1, %c255] + : !pto.tensor_view<1x1x1x1x256xf16> -> !pto.partition_tensor_view<1x1x1x1x255xf16> + %tmp_part = pto.partition_view %tmp_view, + offsets = [%c0, %c0, %c0, %c0, %c0], + sizes = [%c1, %c1, %c1, %c1, %c255] + : !pto.tensor_view<1x1x1x1x256xf16> -> !pto.partition_tensor_view<1x1x1x1x255xf16> + %dst_part = pto.partition_view %dst_view, + offsets = [%c0, %c0, %c0, %c0, %c0], + sizes = [%c1, %c1, 
%c1, %c1, %c255] + : !pto.tensor_view<1x1x1x1x256xi32> -> !pto.partition_tensor_view<1x1x1x1x255xi32> + + %src = pto.alloc_tile : !pto.tile_buf + %tmp = pto.alloc_tile : !pto.tile_buf + %dst = pto.alloc_tile : !pto.tile_buf + + pto.tload ins(%src_part : !pto.partition_tensor_view<1x1x1x1x255xf16>) + outs(%src : !pto.tile_buf) + + pto.tcolargmax ins(%src, %tmp : !pto.tile_buf, + !pto.tile_buf) + outs(%dst : !pto.tile_buf) + + pto.tstore ins(%dst : !pto.tile_buf) + outs(%dst_part : !pto.partition_tensor_view<1x1x1x1x255xi32>) + return + } + + // Case 4: f16 16x128 (input: 16x128, tmp: 16x128, output: 1x128 indices) +} diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tcolargmin/CMakeLists.txt b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tcolargmin/CMakeLists.txt new file mode 100644 index 000000000..efdef88ac --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tcolargmin/CMakeLists.txt @@ -0,0 +1,9 @@ +# Copyright (c) 2026 Huawei Technologies Co., Ltd. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. + +pto_tilelang_vec_st(tcolargmin) diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tcolargmin/cases.py b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tcolargmin/cases.py new file mode 100644 index 000000000..b5400428c --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tcolargmin/cases.py @@ -0,0 +1,217 @@ +#!/usr/bin/python3 +# Copyright (c) 2026 Huawei Technologies Co., Ltd. 
# coding=utf-8

"""Single source of truth for tcolargmin ST test cases.

Each case defines:
  - name: case identifier, used as subdirectory name and by main.cpp kCases[].
  - dtype: numpy dtype for input data (e.g. np.float32).
  - shape: (rows, cols) — allocated tile dimensions for input (src and tmp).
  - valid_shape: (valid_rows, valid_cols) — effective computation region for input (src and tmp).
  - dst_shape: (1, cols) — allocated tile dimensions for output (indices).
  - dst_valid_shape: (1, valid_cols) — effective computation region for output (indices).
  - dst_dtype: numpy dtype for output indices (np.int32).
  - eps: tolerance for numpy.allclose (atol and rtol).

gen_data.py and compare.py both import this list to avoid redundant definitions.

The complete dtype x geometry table is built first and then narrowed to the
smoke subset listed in _SMOKE_CASE_NAMES, so the CASES name importers see
contains only the smoke cases.
"""

import numpy as np

# (case-name prefix, numpy input dtype), in the order cases are emitted.
_INPUT_DTYPES = (
    ("f32", np.float32),
    ("f16", np.float16),
    ("ui32", np.uint32),
    ("ui16", np.uint16),
    ("ui8", np.uint8),
    ("i8", np.int8),
)

# (rows, cols, valid_rows, valid_cols) of the input tile; every dtype is paired
# with every geometry.
_GEOMETRIES = (
    (1, 256, 1, 255),
    (16, 128, 16, 127),
    (16, 256, 15, 255),
)


def _make_case(prefix, dtype, rows, cols, valid_rows, valid_cols):
    """Return the case dict for one dtype/geometry pair.

    The output tile always has a single row: one int32 index per input column.
    """
    return {
        "name": f"{prefix}_{rows}x{cols}",
        "dtype": dtype,
        "shape": (rows, cols),
        "valid_shape": (valid_rows, valid_cols),
        "dst_shape": (1, cols),
        "dst_valid_shape": (1, valid_cols),
        "dst_dtype": np.int32,
        "eps": 0,
    }


CASES = [
    _make_case(prefix, dtype, *geometry)
    for prefix, dtype in _INPUT_DTYPES
    for geometry in _GEOMETRIES
]

_SMOKE_CASE_NAMES = ['f32_1x256', 'f16_1x256']
_SMOKE_CASE_NAME_SET = set(_SMOKE_CASE_NAMES)
# Build the lookup set once instead of once per smoke name.
_KNOWN_CASE_NAMES = {case["name"] for case in CASES}
_missing = [name for name in _SMOKE_CASE_NAMES if name not in _KNOWN_CASE_NAMES]
if _missing:
    # A smoke name that is not in the full table is a configuration error.
    raise RuntimeError("unknown smoke case(s): " + ", ".join(_missing))
CASES = [case for case in CASES if case["name"] in _SMOKE_CASE_NAME_SET]
# coding=utf-8

"""Compare kernel output against golden data for the tcolargmin ST cases."""

import os
import sys
import numpy as np

from cases import CASES
from st_common import result_cmp, style_fail, style_pass, validate_cases


def main():
    """Compare output.bin with golden.bin for each selected case.

    An optional first command-line argument restricts the run to the case with
    that exact name. Only the region given by the case's dst_valid_shape is
    compared. Exits with status 2 when any case fails, when a data file cannot
    be read or has the wrong size, or when a filter was given but matches no
    case.
    """
    validate_cases(CASES)
    case_filter = sys.argv[1] if len(sys.argv) > 1 else None

    selected = [
        case for case in CASES
        if case_filter is None or case["name"] == case_filter
    ]
    if case_filter is not None and not selected:
        # Without this guard a mistyped case name would report success
        # although nothing was compared.
        print(style_fail(f"[ERROR] no case named {case_filter!r}"))
        sys.exit(2)

    all_passed = True
    for case in selected:
        case_dir = case["name"]
        dst_shape = case["dst_shape"]
        dst_dtype = case["dst_dtype"]
        vr, vc = case["dst_valid_shape"]

        try:
            golden = np.fromfile(os.path.join(case_dir, "golden.bin"), dtype=dst_dtype).reshape(dst_shape)
            output = np.fromfile(os.path.join(case_dir, "output.bin"), dtype=dst_dtype).reshape(dst_shape)
        except (OSError, ValueError) as err:
            # Missing file (OSError) or wrong element count (ValueError from
            # reshape): report it as a failure of this case and keep going.
            print(style_fail(f"[ERROR] {case['name']}: compare failed ({err})"))
            all_passed = False
            continue

        ok = result_cmp(golden[:vr, :vc], output[:vr, :vc], case["eps"])
        if ok:
            print(style_pass(f"[INFO] {case['name']}: compare passed"))
        else:
            print(style_fail(f"[ERROR] {case['name']}: compare failed"))
            all_passed = False

    if not all_passed:
        sys.exit(2)
    print(style_pass("[INFO] all cases passed"))


if __name__ == "__main__":
    main()
# coding=utf-8

import numpy as np
from cases import CASES
from st_common import validate_cases, setup_case_rng, save_case_data

validate_cases(CASES)

# For every case: draw a random source tile, compute the per-column argmin over
# the valid region, and store it in row 0 of an otherwise zero destination tile.
for case in CASES:
    # Presumably seeds the RNG per case so the data is reproducible; verify in st_common.
    setup_case_rng(case)

    src_dtype = case["dtype"]
    src_shape = case["shape"]
    src_valid = case["valid_shape"]
    out_shape = case["dst_shape"]
    out_valid = case["dst_valid_shape"]  # read as in the original; not used below
    out_dtype = case["dst_dtype"]

    # Integer values in [1, 10) cast to the case dtype.
    source = np.random.randint(1, 10, size=src_shape).astype(src_dtype)

    rows, cols = src_valid
    expected = np.zeros(out_shape, dtype=out_dtype)
    column_minima = np.argmin(source[:rows, :cols], axis=0, keepdims=True)
    expected[:1, :cols] = column_minima.astype(out_dtype)

    save_case_data(case["name"], {"input": source, "golden": expected})
    print(f"[INFO] gen_data: {case['name']} shape={src_shape} valid_shape={src_valid} dst_shape={out_shape} dtype={src_dtype.__name__}")
+ +#include + +#ifndef AICORE +#define AICORE [aicore] +#endif + +// Case 0: f32 1x256 (input: 1x256, tmp: 1x256, output: 1x256 indices) + +extern "C" __global__ AICORE void TCOLARGMIN_f32_1x256(__gm__ int32_t *dst, __gm__ float *tmp, __gm__ float *src); +extern "C" __global__ AICORE void TCOLARGMIN_f16_1x256(__gm__ int32_t *dst, __gm__ half *tmp, __gm__ half *src); + +void LaunchTCOLARGMIN_f32_1x256(int32_t *dst, float *tmp, float *src, void *stream) { + TCOLARGMIN_f32_1x256<<<1, nullptr, stream>>>((__gm__ int32_t *)dst, (__gm__ float *)tmp, (__gm__ float *)src); +} + + + +void LaunchTCOLARGMIN_f16_1x256(void *dst, void *tmp, void *src, void *stream) { + TCOLARGMIN_f16_1x256<<<1, nullptr, stream>>>((__gm__ int32_t *)dst, (__gm__ half *)tmp, (__gm__ half *)src); +} diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tcolargmin/main.cpp b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tcolargmin/main.cpp new file mode 100644 index 000000000..195759464 --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tcolargmin/main.cpp @@ -0,0 +1,171 @@ +// Copyright (c) 2026 Huawei Technologies Co., Ltd. +// This program is free software, you can redistribute it and/or modify it under the terms and conditions of +// CANN Open Software License Agreement Version 2.0 (the "License"). +// Please refer to the License for details. You may not use this file except in compliance with the License. +// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +// See LICENSE in the root of the software repository for the full text of the License. + +// Host driver for TileLang tcolargmin ST — case-table driven. +// Each case launches a different kernel variant, reads/writes from per-case subdirectory. +// Numerical comparison is done externally by compare.py. 
+ +#include "acl/acl.h" +#include "test_common.h" +#include +#include +#include +#include +#include +#include + +using namespace PtoTestCommon; + +// Kernel launch wrappers (defined in launch.cpp) +void LaunchTCOLARGMIN_f32_1x256(int32_t *dst, float *tmp, float *src, void *stream); +void LaunchTCOLARGMIN_f32_16x128(int32_t *dst, float *tmp, float *src, void *stream); +void LaunchTCOLARGMIN_f16_1x256(void *dst, void *tmp, void *src, void *stream); +void LaunchTCOLARGMIN_f16_16x256(void *dst, void *tmp, void *src, void *stream); +void LaunchTCOLARGMIN_ui32_16x128(void *dst, void *tmp, void *src, void *stream); +void LaunchTCOLARGMIN_ui16_1x256(void *dst, void *tmp, void *src, void *stream); +void LaunchTCOLARGMIN_ui16_16x256(void *dst, void *tmp, void *src, void *stream); +void LaunchTCOLARGMIN_ui8_16x128(void *dst, void *tmp, void *src, void *stream); +void LaunchTCOLARGMIN_i8_1x256(void *dst, void *tmp, void *src, void *stream); +void LaunchTCOLARGMIN_i8_16x256(void *dst, void *tmp, void *src, void *stream); + +using LaunchFnFloat = void (*)(int32_t *, float *, float *, void *); +using LaunchFnVoid = void (*)(void *, void *, void *, void *); + +struct TestCase { + const char *name; + void *launch; + size_t srcRows; + size_t srcCols; + size_t srcValidRows; + size_t srcValidCols; + size_t tmpRows; + size_t tmpCols; + size_t tmpValidRows; + size_t tmpValidCols; + size_t dstRows; + size_t dstCols; + size_t dstValidCols; + size_t srcElemSize; + size_t dstElemSize; + bool isFp16; + bool isUi32; + bool isUi16; + bool isUi8; + bool isI8; +}; + +static const TestCase kCases[] = { +{"f32_1x256", (void*)LaunchTCOLARGMIN_f32_1x256, 1, 256, 1, 255, 1, 256, 1, 255, 1, 256, 255, sizeof(float), sizeof(int32_t), false, false, false, false, false}, +{"f16_1x256", (void*)LaunchTCOLARGMIN_f16_1x256, 1, 256, 1, 255, 1, 256, 1, 255, 1, 256, 255, 2, sizeof(int32_t), true, false, false, false, false}, +}; +static constexpr size_t kNumCases = sizeof(kCases) / sizeof(kCases[0]); + +static 
int RunCase(const TestCase &tc, int deviceId, aclrtStream stream) { + int rc = 0; + const size_t srcElemCount = tc.srcRows * tc.srcCols; + const size_t srcFileSize = srcElemCount * tc.srcElemSize; + const size_t tmpElemCount = tc.tmpRows * tc.tmpCols; + const size_t tmpFileSize = tmpElemCount * tc.srcElemSize; + const size_t dstElemCount = tc.dstRows * tc.dstCols; + const size_t dstFileSize = dstElemCount * tc.dstElemSize; + + std::printf("[INFO] === case: %s (src=%zux%zu, tmp=%zux%zu, dst=%zux%zu, fp16=%d) ===\n", + tc.name, tc.srcRows, tc.srcCols, tc.tmpRows, tc.tmpCols, tc.dstRows, tc.dstCols, tc.isFp16); + + std::string caseDir = std::string("./") + tc.name; + size_t srcFileSizeVar = srcFileSize; + size_t tmpFileSizeVar = tmpFileSize; + size_t dstFileSizeVar = dstFileSize; + + void *srcHost = nullptr, *tmpHost = nullptr, *dstHost = nullptr; + void *srcDevice = nullptr, *tmpDevice = nullptr, *dstDevice = nullptr; + + aclrtMallocHost(&srcHost, srcFileSize); + aclrtMallocHost(&tmpHost, tmpFileSize); + aclrtMallocHost(&dstHost, dstFileSize); + + aclrtMalloc(&srcDevice, srcFileSize, ACL_MEM_MALLOC_HUGE_FIRST); + aclrtMalloc(&tmpDevice, tmpFileSize, ACL_MEM_MALLOC_HUGE_FIRST); + aclrtMalloc(&dstDevice, dstFileSize, ACL_MEM_MALLOC_HUGE_FIRST); + + if (!ReadFile((caseDir + "/input.bin").c_str(), srcFileSizeVar, srcHost, srcFileSize)) { + std::fprintf(stderr, "[ERROR] failed to read %s/input.bin\n", caseDir.c_str()); + rc = 1; + } + + if (rc == 0) { + aclrtMemcpy(srcDevice, srcFileSize, srcHost, srcFileSize, ACL_MEMCPY_HOST_TO_DEVICE); + + if (tc.isFp16 || tc.isUi32 || tc.isUi16 || tc.isUi8 || tc.isI8) { + LaunchFnVoid launch = (LaunchFnVoid)tc.launch; + launch(dstDevice, tmpDevice, srcDevice, stream); + } else { + LaunchFnFloat launch = (LaunchFnFloat)tc.launch; + launch((int32_t*)dstDevice, (float*)tmpDevice, (float*)srcDevice, stream); + } + + aclrtSynchronizeStream(stream); + aclrtMemcpy(dstHost, dstFileSize, dstDevice, dstFileSize, ACL_MEMCPY_DEVICE_TO_HOST); + } + 
+ if (rc == 0 && !WriteFile((caseDir + "/output.bin").c_str(), dstHost, dstFileSizeVar)) { + std::fprintf(stderr, "[ERROR] failed to write %s/output.bin\n", caseDir.c_str()); + rc = 1; + } + + if (srcDevice != nullptr) + aclrtFree(srcDevice); + if (tmpDevice != nullptr) + aclrtFree(tmpDevice); + if (dstDevice != nullptr) + aclrtFree(dstDevice); + if (srcHost != nullptr) + aclrtFreeHost(srcHost); + if (tmpHost != nullptr) + aclrtFreeHost(tmpHost); + if (dstHost != nullptr) + aclrtFreeHost(dstHost); + + if (rc == 0) + std::printf("[INFO] case %s done\n", tc.name); + return rc; +} + +int main(int argc, char *argv[]) { + const char *caseFilter = (argc > 1) ? argv[1] : nullptr; + + int rc = 0; + int deviceId = 0; + aclrtStream stream = nullptr; + + aclInit(nullptr); + if (const char *envDevice = std::getenv("ACL_DEVICE_ID")) { + deviceId = std::atoi(envDevice); + } + aclrtSetDevice(deviceId); + aclrtCreateStream(&stream); + + for (size_t i = 0; i < kNumCases; ++i) { + if (caseFilter != nullptr && std::strcmp(kCases[i].name, caseFilter) != 0) { + continue; + } + int ret = RunCase(kCases[i], deviceId, stream); + if (ret != 0) { + std::fprintf(stderr, "[ERROR] case %s failed\n", kCases[i].name); + rc = 1; + break; + } + } + + if (stream != nullptr) + aclrtDestroyStream(stream); + aclrtResetDevice(deviceId); + aclFinalize(); + + return rc; +} diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tcolargmin/tcolargmin.pto b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tcolargmin/tcolargmin.pto new file mode 100644 index 000000000..8a2905642 --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tcolargmin/tcolargmin.pto @@ -0,0 +1,115 @@ +// Copyright (c) 2026 Huawei Technologies Co., Ltd. +// This program is free software, you can redistribute it and/or modify it under the terms and conditions of +// CANN Open Software License Agreement Version 2.0 (the "License"). +// Please refer to the License for details. 
You may not use this file except in compliance with the License. +// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +// See LICENSE in the root of the software repository for the full text of the License. + +// TileLang ST kernels for pto.tcolargmin: tload(src) + tcolargmin(src, tmp, dst) + tstore(dst). +// Multiple cases with different shapes in a single module. +// Compiled by ptoas --enable-insert-sync --enable-tile-op-expand --pto-backend=vpto +// to produce a fatobj object. + +module attributes {pto.target_arch = "a5", pto.kernel_kind = #pto.kernel_kind} { +// Case 0: f32 1x256 (input: 1x256, tmp: 1x256, output: 1x256 indices) + func.func @TCOLARGMIN_f32_1x256(%dst_ptr: !pto.ptr, %tmp_ptr: !pto.ptr, %src_ptr: !pto.ptr) attributes {pto.kernel} { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c256 = arith.constant 256 : index + %c255 = arith.constant 255 : index + + %src_view = pto.make_tensor_view %src_ptr, + shape = [%c1, %c1, %c1, %c1, %c256], + strides = [%c256, %c256, %c256, %c256, %c1] + : !pto.tensor_view<1x1x1x1x256xf32> + %tmp_view = pto.make_tensor_view %tmp_ptr, + shape = [%c1, %c1, %c1, %c1, %c256], + strides = [%c256, %c256, %c256, %c256, %c1] + : !pto.tensor_view<1x1x1x1x256xf32> + %dst_view = pto.make_tensor_view %dst_ptr, + shape = [%c1, %c1, %c1, %c1, %c256], + strides = [%c256, %c256, %c256, %c256, %c1] + : !pto.tensor_view<1x1x1x1x256xi32> + + %src_part = pto.partition_view %src_view, + offsets = [%c0, %c0, %c0, %c0, %c0], + sizes = [%c1, %c1, %c1, %c1, %c255] + : !pto.tensor_view<1x1x1x1x256xf32> -> !pto.partition_tensor_view<1x1x1x1x255xf32> + %tmp_part = pto.partition_view %tmp_view, + offsets = [%c0, %c0, %c0, %c0, %c0], + sizes = [%c1, %c1, %c1, %c1, %c255] + : !pto.tensor_view<1x1x1x1x256xf32> -> !pto.partition_tensor_view<1x1x1x1x255xf32> + %dst_part = 
pto.partition_view %dst_view, + offsets = [%c0, %c0, %c0, %c0, %c0], + sizes = [%c1, %c1, %c1, %c1, %c255] + : !pto.tensor_view<1x1x1x1x256xi32> -> !pto.partition_tensor_view<1x1x1x1x255xi32> + + %src = pto.alloc_tile : !pto.tile_buf + %tmp = pto.alloc_tile : !pto.tile_buf + %dst = pto.alloc_tile : !pto.tile_buf + + pto.tload ins(%src_part : !pto.partition_tensor_view<1x1x1x1x255xf32>) + outs(%src : !pto.tile_buf) + + pto.tcolargmin ins(%src, %tmp : !pto.tile_buf, + !pto.tile_buf) + outs(%dst : !pto.tile_buf) + + pto.tstore ins(%dst : !pto.tile_buf) + outs(%dst_part : !pto.partition_tensor_view<1x1x1x1x255xi32>) + return + } + + // Case 1: f32 16x128 (input: 16x128, tmp: 16x128, output: 1x128 indices) + + func.func @TCOLARGMIN_f16_1x256(%dst_ptr: !pto.ptr, %tmp_ptr: !pto.ptr, %src_ptr: !pto.ptr) attributes {pto.kernel} { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c256 = arith.constant 256 : index + %c255 = arith.constant 255 : index + + %src_view = pto.make_tensor_view %src_ptr, + shape = [%c1, %c1, %c1, %c1, %c256], + strides = [%c256, %c256, %c256, %c256, %c1] + : !pto.tensor_view<1x1x1x1x256xf16> + %tmp_view = pto.make_tensor_view %tmp_ptr, + shape = [%c1, %c1, %c1, %c1, %c256], + strides = [%c256, %c256, %c256, %c256, %c1] + : !pto.tensor_view<1x1x1x1x256xf16> + %dst_view = pto.make_tensor_view %dst_ptr, + shape = [%c1, %c1, %c1, %c1, %c256], + strides = [%c256, %c256, %c256, %c256, %c1] + : !pto.tensor_view<1x1x1x1x256xi32> + + %src_part = pto.partition_view %src_view, + offsets = [%c0, %c0, %c0, %c0, %c0], + sizes = [%c1, %c1, %c1, %c1, %c255] + : !pto.tensor_view<1x1x1x1x256xf16> -> !pto.partition_tensor_view<1x1x1x1x255xf16> + %tmp_part = pto.partition_view %tmp_view, + offsets = [%c0, %c0, %c0, %c0, %c0], + sizes = [%c1, %c1, %c1, %c1, %c255] + : !pto.tensor_view<1x1x1x1x256xf16> -> !pto.partition_tensor_view<1x1x1x1x255xf16> + %dst_part = pto.partition_view %dst_view, + offsets = [%c0, %c0, %c0, %c0, %c0], + sizes = [%c1, %c1, 
%c1, %c1, %c255] + : !pto.tensor_view<1x1x1x1x256xi32> -> !pto.partition_tensor_view<1x1x1x1x255xi32> + + %src = pto.alloc_tile : !pto.tile_buf + %tmp = pto.alloc_tile : !pto.tile_buf + %dst = pto.alloc_tile : !pto.tile_buf + + pto.tload ins(%src_part : !pto.partition_tensor_view<1x1x1x1x255xf16>) + outs(%src : !pto.tile_buf) + + pto.tcolargmin ins(%src, %tmp : !pto.tile_buf, + !pto.tile_buf) + outs(%dst : !pto.tile_buf) + + pto.tstore ins(%dst : !pto.tile_buf) + outs(%dst_part : !pto.partition_tensor_view<1x1x1x1x255xi32>) + return + } + + // Case 4: f16 16x128 (input: 16x128, tmp: 16x128, output: 1x128 indices) +} diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tcolexpand/CMakeLists.txt b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tcolexpand/CMakeLists.txt new file mode 100644 index 000000000..6d4b3c0b3 --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tcolexpand/CMakeLists.txt @@ -0,0 +1,16 @@ +# Copyright (c) 2026 Huawei Technologies Co., Ltd. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. +# Copyright (c) 2026 Huawei Technologies Co., Ltd. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file in the compliance with the License. 
#!/usr/bin/python3
# Copyright (c) 2026 Huawei Technologies Co., Ltd.
# This program is free software, you can redistribute it and/or modify it under the terms and conditions of
# CANN Open Software License Agreement Version 2.0 (the "License").
# Please refer to the License for details. You may not use this file except in compliance with the License.
# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
# See LICENSE in the root of the software repository for the full text of the License.

# coding=utf-8

"""Single source of truth for tcolexpand ST test cases.

Matches the PTO-ISA testcase definitions under
tests/npu/a5/src/st/testcase/tcolexpand/ in the pto-isa repository.

TCOLEXPAND: expand src first row to dst all rows by broadcasting.

Keys of each case dict:
  - name: case identifier, see "Case naming" below
  - dtype: NumPy element type shared by source and destination
  - src_shape: (src_row, cols) - input tile (only first row is used for broadcast)
  - shape: (dst_row, cols) - expanded output (the destination shape)
  - valid_shape: (valid_row, valid_col) - effective computation region
  - eps: tolerance passed to the comparison

Case naming: {dtype}_{src_row}_{dst_row}_{cols}_{valid_col}

Only the cases listed in ``_SMOKE_CASE_NAMES`` remain in ``CASES`` after import.
"""

import numpy as np

CASES = [
    {
        "name": "float_1_8_128_63",
        "dtype": np.float32,
        "src_shape": (1, 128),
        "shape": (8, 128),
        "valid_shape": (8, 63),
        "eps": 1e-6,
    },
    {
        "name": "half_1_16_512_512",
        "dtype": np.float16,
        "src_shape": (1, 512),
        "shape": (16, 512),
        "valid_shape": (16, 512),
        "eps": 1e-3,
    },
    {
        "name": "int8_2_32_256_255",
        "dtype": np.int8,
        "src_shape": (2, 256),
        "shape": (32, 256),
        "valid_shape": (32, 255),
        "eps": 0,
    },
    {
        "name": "half_1_33_512_512",
        "dtype": np.float16,
        "src_shape": (1, 512),
        "shape": (33, 512),
        "valid_shape": (33, 512),
        "eps": 1e-3,
    },
    {
        "name": "int8_2_17_256_44",
        "dtype": np.int8,
        "src_shape": (2, 256),
        "shape": (17, 256),
        "valid_shape": (17, 44),
        "eps": 0,
    },
    {
        "name": "float_1_54_64_63",
        "dtype": np.float32,
        "src_shape": (1, 64),
        "shape": (54, 64),
        "valid_shape": (54, 63),
        "eps": 1e-6,
    },
]

# Smoke subset: fail fast on a name that matches no case, then narrow CASES.
_SMOKE_CASE_NAMES = ['float_1_8_128_63', 'int8_2_17_256_44']
_SMOKE_CASE_NAME_SET = set(_SMOKE_CASE_NAMES)
# Built once instead of once per smoke name.
_KNOWN_CASE_NAMES = {case["name"] for case in CASES}
_missing = [name for name in _SMOKE_CASE_NAMES if name not in _KNOWN_CASE_NAMES]
if _missing:
    raise RuntimeError("unknown smoke case(s): " + ", ".join(_missing))
CASES = [case for case in CASES if case["name"] in _SMOKE_CASE_NAME_SET]
Ltd. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. +#!/usr/bin/python3 +# Copyright (c) 2026 Huawei Technologies Co., Ltd. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. 
# coding=utf-8

import os
import sys
import numpy as np

from cases import CASES
from st_common import result_cmp, style_fail, style_pass, validate_cases


def main():
    """Compare each selected case's output.bin with its golden.bin.

    An optional first command-line argument restricts the run to the case
    whose name equals it. Both files are read from a directory named after
    the case, reshaped to the case's ``shape``, and only the region given by
    ``valid_shape`` is handed to ``result_cmp``. The process exits with
    status 2 when at least one compared case fails.
    """
    validate_cases(CASES)
    wanted = sys.argv[1] if len(sys.argv) > 1 else None
    # Lazy selection: cases are filtered one at a time, as the loop reaches them.
    selected = (entry for entry in CASES if wanted is None or entry["name"] == wanted)

    failed = False
    for entry in selected:
        name = entry["name"]
        full_shape = entry["shape"]
        rows, cols = entry["valid_shape"]

        # golden.bin is loaded first, then output.bin; the dtype is looked up per file.
        golden, output = (
            np.fromfile(os.path.join(name, file_name), dtype=entry["dtype"]).reshape(full_shape)
            for file_name in ("golden.bin", "output.bin")
        )

        if result_cmp(golden[:rows, :cols], output[:rows, :cols], entry["eps"]):
            print(style_pass(f"[INFO] {name}: compare passed"))
            continue
        print(style_fail(f"[ERROR] {name}: compare failed"))
        failed = True

    if failed:
        sys.exit(2)
    print(style_pass("[INFO] all cases passed"))


if __name__ == "__main__":
    main()
# coding=utf-8

import numpy as np
from cases import CASES
from st_common import validate_cases, setup_case_rng, save_case_data

validate_cases(CASES)

# For every case: draw a random source tile and build the golden tile by
# copying the first source row into each valid destination row.
for case in CASES:
    # Presumably seeds the RNG per case so the data is reproducible; verify in st_common.
    setup_case_rng(case)

    elem_dtype = case["dtype"]
    in_shape = case["src_shape"]
    out_shape = case["shape"]
    valid_extent = case["valid_shape"]

    # Integer values in [1, 10) cast to the case dtype.
    source = np.random.randint(1, 10, size=in_shape).astype(elem_dtype)

    expected = np.zeros(out_shape, dtype=elem_dtype)
    n_rows, n_cols = valid_extent
    row = 0
    while row < n_rows:
        # Everything outside the valid region stays zero.
        expected[row, :n_cols] = source[0, :n_cols]
        row += 1

    save_case_data(case["name"], {"input0": source, "golden": expected})
    print(f"[INFO] gen_data: {case['name']} src={in_shape} dst={out_shape} valid={valid_extent} dtype={elem_dtype.__name__}")
+ +#include + +#ifndef AICORE +#define AICORE [aicore] +#endif + +// Case 1: half_1_16_512_512 + +extern "C" __global__ AICORE void TCOLEXPAND_float_1_8_128_63(__gm__ float *src, __gm__ float *dst); +extern "C" __global__ AICORE void TCOLEXPAND_int8_2_17_256_44(__gm__ int8_t *src, __gm__ int8_t *dst); + +void LaunchTCOLEXPAND_int8_2_17_256_44(int8_t *src, int8_t *dst, void *stream) { + TCOLEXPAND_int8_2_17_256_44<<<1, nullptr, stream>>>((__gm__ int8_t *)src, (__gm__ int8_t *)dst); +} + + + +void LaunchTCOLEXPAND_float_1_8_128_63(float *src, float *dst, void *stream) { + TCOLEXPAND_float_1_8_128_63<<<1, nullptr, stream>>>((__gm__ float *)src, (__gm__ float *)dst); +} diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tcolexpand/main.cpp b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tcolexpand/main.cpp new file mode 100644 index 000000000..f52010788 --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tcolexpand/main.cpp @@ -0,0 +1,138 @@ +// Copyright (c) 2026 Huawei Technologies Co., Ltd. +// This program is free software, you can redistribute it and/or modify it under the terms and conditions of +// CANN Open Software License Agreement Version 2.0 (the "License"). +// Please refer to the License for details. You may not use this file except in compliance with the License. +// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +// See LICENSE in the root of the software repository for the full text of the License. 
+ +// Host driver for TileLang tcolexpand ST +// Test cases match PTO-ISA: /home/zhoushaofan/code/pto-isa/tests/npu/a5/src/st/testcase/tcolexpand/ +// TCOLEXPAND: expand src first row to dst all rows by broadcasting + +#include "acl/acl.h" +#include "test_common.h" +#include +#include +#include +#include +#include +#include + +using namespace PtoTestCommon; + +// Kernel launch wrappers (defined in launch.cpp) +void LaunchTCOLEXPAND_int8_2_32_256_255(int8_t *src, int8_t *dst, void *stream); +void LaunchTCOLEXPAND_float_1_8_128_63(float *src, float *dst, void *stream); +void LaunchTCOLEXPAND_half_1_33_512_512(uint16_t *src, uint16_t *dst, void *stream); +void LaunchTCOLEXPAND_int8_2_17_256_44(int8_t *src, int8_t *dst, void *stream); +void LaunchTCOLEXPAND_float_1_54_64_63(float *src, float *dst, void *stream); + +using LaunchFn = void (*)(void *, void *, void *); + +struct TestCase { + const char *name; + LaunchFn launch; + size_t srcRows; + size_t srcCols; + size_t dstRows; + size_t dstCols; + size_t validRows; + size_t validCols; + size_t elemSize; +}; + +static const TestCase kCases[] = { +{"float_1_8_128_63", (LaunchFn)LaunchTCOLEXPAND_float_1_8_128_63, 1, 128, 8, 128, 8, 63, sizeof(float)}, +{"int8_2_17_256_44", (LaunchFn)LaunchTCOLEXPAND_int8_2_17_256_44, 2, 256, 17, 256, 17, 44, sizeof(int8_t)}, +}; +static constexpr size_t kNumCases = sizeof(kCases) / sizeof(kCases[0]); + +static int RunCase(const TestCase &tc, int deviceId, aclrtStream stream) { + int rc = 0; + const size_t srcElemCount = tc.srcRows * tc.srcCols; + const size_t dstElemCount = tc.dstRows * tc.dstCols; + const size_t srcFileSize = srcElemCount * tc.elemSize; + const size_t dstFileSize = dstElemCount * tc.elemSize; + + std::printf("[INFO] === case: %s (src=%zux%zu -> dst=%zux%zu, valid=%zux%zu) ===\n", + tc.name, tc.srcRows, tc.srcCols, tc.dstRows, tc.dstCols, tc.validRows, tc.validCols); + + std::string caseDir = std::string("./") + tc.name; + size_t actualSrcFileSize = srcFileSize; + + void 
*srcHost = nullptr, *dstHost = nullptr; + void *srcDevice = nullptr, *dstDevice = nullptr; + + aclrtMallocHost((void **)(&srcHost), srcFileSize); + aclrtMallocHost((void **)(&dstHost), dstFileSize); + + aclrtMalloc((void **)&srcDevice, srcFileSize, ACL_MEM_MALLOC_HUGE_FIRST); + aclrtMalloc((void **)&dstDevice, dstFileSize, ACL_MEM_MALLOC_HUGE_FIRST); + + if (!ReadFile((caseDir + "/input0.bin").c_str(), actualSrcFileSize, srcHost, srcFileSize)) { + std::fprintf(stderr, "[ERROR] failed to read %s/input0.bin\n", caseDir.c_str()); + rc = 1; + } + + if (rc == 0) { + aclrtMemcpy(srcDevice, srcFileSize, srcHost, srcFileSize, ACL_MEMCPY_HOST_TO_DEVICE); + + tc.launch(srcDevice, dstDevice, stream); + + aclrtSynchronizeStream(stream); + aclrtMemcpy(dstHost, dstFileSize, dstDevice, dstFileSize, ACL_MEMCPY_DEVICE_TO_HOST); + } + + if (rc == 0 && !WriteFile((caseDir + "/output.bin").c_str(), dstHost, dstFileSize)) { + std::fprintf(stderr, "[ERROR] failed to write %s/output.bin\n", caseDir.c_str()); + rc = 1; + } + + if (srcDevice != nullptr) + aclrtFree(srcDevice); + if (dstDevice != nullptr) + aclrtFree(dstDevice); + if (srcHost != nullptr) + aclrtFreeHost(srcHost); + if (dstHost != nullptr) + aclrtFreeHost(dstHost); + + if (rc == 0) + std::printf("[INFO] case %s done\n", tc.name); + return rc; +} + +int main(int argc, char *argv[]) { + const char *caseFilter = (argc > 1) ? 
argv[1] : nullptr; + + int rc = 0; + int deviceId = 0; + aclrtStream stream = nullptr; + + aclInit(nullptr); + if (const char *envDevice = std::getenv("ACL_DEVICE_ID")) { + deviceId = std::atoi(envDevice); + } + aclrtSetDevice(deviceId); + aclrtCreateStream(&stream); + + for (size_t i = 0; i < kNumCases; ++i) { + if (caseFilter != nullptr && std::strcmp(kCases[i].name, caseFilter) != 0) { + continue; + } + int ret = RunCase(kCases[i], deviceId, stream); + if (ret != 0) { + std::fprintf(stderr, "[ERROR] case %s failed\n", kCases[i].name); + rc = 1; + break; + } + } + + if (stream != nullptr) + aclrtDestroyStream(stream); + aclrtResetDevice(deviceId); + aclFinalize(); + + return rc; +} diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tcolexpand/tcolexpand.pto b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tcolexpand/tcolexpand.pto new file mode 100644 index 000000000..6e246534a --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tcolexpand/tcolexpand.pto @@ -0,0 +1,110 @@ +// Copyright (c) 2026 Huawei Technologies Co., Ltd. +// This program is free software, you can redistribute it and/or modify it under the terms and conditions of +// CANN Open Software License Agreement Version 2.0 (the "License"). +// Please refer to the License for details. You may not use this file except in compliance with the License. +// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +// See LICENSE in the root of the software repository for the full text of the License. + +// TileLang ST kernels for pto.tcolexpand: expand src (src_row x valid_col) to dst (dst_row x valid_col). +// Matches PTO-ISA testcase parameters. 
+// Key: tile_buf cols = full tensor width, v_col = valid portion + +module attributes {pto.target_arch = "a5", pto.kernel_kind = #pto.kernel_kind} { + // Case 1: half_1_16_512_512 (fp16, valid_col=512, cols=512) + func.func @TCOLEXPAND_float_1_8_128_63(%src_ptr: !pto.ptr, %dst_ptr: !pto.ptr) attributes {pto.kernel} { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c8 = arith.constant 8 : index + %c63 = arith.constant 63 : index + %c128 = arith.constant 128 : index + %c1024 = arith.constant 1024 : index + + %src_view = pto.make_tensor_view %src_ptr, + shape = [%c1, %c1, %c1, %c1, %c128], + strides = [%c128, %c128, %c128, %c128, %c1] + : !pto.tensor_view<1x1x1x1x128xf32> + + %dst_view = pto.make_tensor_view %dst_ptr, + shape = [%c1, %c1, %c1, %c8, %c128], + strides = [%c1024, %c1024, %c1024, %c128, %c1] + : !pto.tensor_view<1x1x1x8x128xf32> + + %src_part = pto.partition_view %src_view, + offsets = [%c0, %c0, %c0, %c0, %c0], + sizes = [%c1, %c1, %c1, %c1, %c63] + : !pto.tensor_view<1x1x1x1x128xf32> -> !pto.partition_tensor_view<1x1x1x1x63xf32> + + %dst_part = pto.partition_view %dst_view, + offsets = [%c0, %c0, %c0, %c0, %c0], + sizes = [%c1, %c1, %c1, %c8, %c63] + : !pto.tensor_view<1x1x1x8x128xf32> -> !pto.partition_tensor_view<1x1x1x8x63xf32> + + %src = pto.alloc_tile + : !pto.tile_buf + + %dst = pto.alloc_tile + : !pto.tile_buf + + pto.tload ins(%src_part : !pto.partition_tensor_view<1x1x1x1x63xf32>) + outs(%src : !pto.tile_buf) + + pto.tcolexpand ins(%src : !pto.tile_buf) + outs(%dst : !pto.tile_buf) + + pto.tstore ins(%dst : !pto.tile_buf) + outs(%dst_part : !pto.partition_tensor_view<1x1x1x8x63xf32>) + return + } + + // Case 4: half_1_33_512_512 (fp16, cols=512, valid_col=512) + + func.func @TCOLEXPAND_int8_2_17_256_44(%src_ptr: !pto.ptr, %dst_ptr: !pto.ptr) attributes {pto.kernel} { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c2 = arith.constant 2 : index + %c17 = arith.constant 17 : index + %c44 = arith.constant 
44 : index + %c256 = arith.constant 256 : index + %c512 = arith.constant 512 : index + %c4352 = arith.constant 4352 : index + + %src_view = pto.make_tensor_view %src_ptr, + shape = [%c1, %c1, %c1, %c2, %c256], + strides = [%c512, %c512, %c512, %c256, %c1] + : !pto.tensor_view<1x1x1x2x256xi8> + + %dst_view = pto.make_tensor_view %dst_ptr, + shape = [%c1, %c1, %c1, %c17, %c256], + strides = [%c4352, %c4352, %c4352, %c256, %c1] + : !pto.tensor_view<1x1x1x17x256xi8> + + %src_part = pto.partition_view %src_view, + offsets = [%c0, %c0, %c0, %c0, %c0], + sizes = [%c1, %c1, %c1, %c2, %c44] + : !pto.tensor_view<1x1x1x2x256xi8> -> !pto.partition_tensor_view<1x1x1x2x44xi8> + + %dst_part = pto.partition_view %dst_view, + offsets = [%c0, %c0, %c0, %c0, %c0], + sizes = [%c1, %c1, %c1, %c17, %c44] + : !pto.tensor_view<1x1x1x17x256xi8> -> !pto.partition_tensor_view<1x1x1x17x44xi8> + + %src = pto.alloc_tile + : !pto.tile_buf + + %dst = pto.alloc_tile + : !pto.tile_buf + + pto.tload ins(%src_part : !pto.partition_tensor_view<1x1x1x2x44xi8>) + outs(%src : !pto.tile_buf) + + pto.tcolexpand ins(%src : !pto.tile_buf) + outs(%dst : !pto.tile_buf) + + pto.tstore ins(%dst : !pto.tile_buf) + outs(%dst_part : !pto.partition_tensor_view<1x1x1x17x44xi8>) + return + } + + // Case 6: float_1_54_64_63 (float32, cols=64, valid_col=63) +} diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tcolexpandadd/CMakeLists.txt b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tcolexpandadd/CMakeLists.txt new file mode 100644 index 000000000..0ff96f2b1 --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tcolexpandadd/CMakeLists.txt @@ -0,0 +1,9 @@ +# Copyright (c) 2026 Huawei Technologies Co., Ltd. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. 
+# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. + +pto_tilelang_vec_st(tcolexpandadd) diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tcolexpandadd/cases.py b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tcolexpandadd/cases.py new file mode 100644 index 000000000..0cc0be849 --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tcolexpandadd/cases.py @@ -0,0 +1,84 @@ +#!/usr/bin/python3 +# Copyright (c) 2026 Huawei Technologies Co., Ltd. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. + +# coding=utf-8 + +"""Single source of truth for tcolexpandadd ST test cases. + +TCOLEXPANDADD: expand src1 then add with src0. 
+ - src0_shape: (dst_row, dst_col) - already expanded (src0_shape = shape) + - src1_shape: (src1_row, src1_col) - to be expanded (usually src1_row=1) + - shape: (dst_row, dst_col) - output shape +""" + +import numpy as np + +CASES = [ + { + "name": "fp32_32_32_1_32", + "dtype": np.float32, + "src0_shape": (32, 32), + "src1_shape": (1, 32), + "shape": (32, 32), + "valid_shape": (32, 32), + "eps": 1e-3, + }, + { + "name": "fp32_16_128_1_128", + "dtype": np.float32, + "src0_shape": (16, 128), + "src1_shape": (1, 128), + "shape": (16, 128), + "valid_shape": (16, 128), + "eps": 1e-3, + }, + { + "name": "fp16_4_256_1_256", + "dtype": np.float16, + "src0_shape": (4, 256), + "src1_shape": (1, 256), + "shape": (4, 256), + "valid_shape": (4, 256), + "eps": 1e-3, + }, + { + "name": "fp16_10_64_1_64", + "dtype": np.float16, + "src0_shape": (10, 64), + "src1_shape": (1, 64), + "shape": (10, 64), + "valid_shape": (10, 64), + "eps": 1e-3, + }, + { + "name": "int32_16_32_1_32", + "dtype": np.int32, + "src0_shape": (16, 32), + "src1_shape": (1, 32), + "shape": (16, 32), + "valid_shape": (16, 32), + "eps": 0, + }, + { + "name": "int16_16_64_1_64", + "dtype": np.int16, + "src0_shape": (16, 64), + "src1_shape": (1, 64), + "shape": (16, 64), + "valid_shape": (16, 64), + "eps": 0, + }, +] + +_SMOKE_CASE_NAMES = ['fp16_10_64_1_64', 'int32_16_32_1_32'] +_SMOKE_CASE_NAME_SET = set(_SMOKE_CASE_NAMES) +_missing = [name for name in _SMOKE_CASE_NAMES if name not in {case["name"] for case in CASES}] +if _missing: + raise RuntimeError("unknown smoke case(s): " + ", ".join(_missing)) +CASES = [case for case in CASES if case["name"] in _SMOKE_CASE_NAME_SET] diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tcolexpandadd/compare.py b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tcolexpandadd/compare.py new file mode 100644 index 000000000..b8ddb1131 --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tcolexpandadd/compare.py @@ -0,0 +1,56 @@ +# Copyright (c) 2026 Huawei 
Technologies Co., Ltd. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. +#!/usr/bin/python3 +# Copyright (c) 2026 Huawei Technologies Co., Ltd. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. 
+ +# coding=utf-8 + +import os +import sys +import numpy as np + +from cases import CASES +from st_common import result_cmp, style_fail, style_pass, validate_cases + + +def main(): + validate_cases(CASES) + case_filter = sys.argv[1] if len(sys.argv) > 1 else None + + all_passed = True + for case in CASES: + if case_filter is not None and case["name"] != case_filter: + continue + + case_dir = case["name"] + shape = case["shape"] + vr, vc = case["valid_shape"] + + golden = np.fromfile(os.path.join(case_dir, "golden.bin"), dtype=case["dtype"]).reshape(shape) + output = np.fromfile(os.path.join(case_dir, "output.bin"), dtype=case["dtype"]).reshape(shape) + + ok = result_cmp(golden[:vr, :vc], output[:vr, :vc], case["eps"]) + if ok: + print(style_pass(f"[INFO] {case['name']}: compare passed")) + else: + print(style_fail(f"[ERROR] {case['name']}: compare failed")) + all_passed = False + + if not all_passed: + sys.exit(2) + print(style_pass("[INFO] all cases passed")) + + +if __name__ == "__main__": + main() diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tcolexpandadd/gen_data.py b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tcolexpandadd/gen_data.py new file mode 100644 index 000000000..dafddf639 --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tcolexpandadd/gen_data.py @@ -0,0 +1,38 @@ +#!/usr/bin/python3 +# Copyright (c) 2026 Huawei Technologies Co., Ltd. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. 
+ +# coding=utf-8 + +import numpy as np +from cases import CASES +from st_common import validate_cases, setup_case_rng, save_case_data + +validate_cases(CASES) + +for case in CASES: + setup_case_rng(case) + + dtype = case["dtype"] + src0_shape = case["src0_shape"] + dst_shape = case["shape"] + src1_shape = case["src1_shape"] + valid_shape = case["valid_shape"] + + src0 = np.random.randint(1, 10, size=src0_shape).astype(dtype) + src1 = np.random.randint(1, 10, size=src1_shape).astype(dtype) + + golden = np.zeros(dst_shape, dtype=dtype) + valid_row, valid_col = valid_shape + src1_row, src1_col = src1_shape + reps = dst_shape[0] // src1_row + expanded_src1 = np.tile(src1, (reps, 1))[:, :valid_col] + golden[:valid_row, :valid_col] = (src0[:valid_row, :valid_col] + expanded_src1[:valid_row, :valid_col]).astype(dtype, copy=False) + + save_case_data(case["name"], {"input0": src0, "input1": src1, "golden": golden}) + print(f"[INFO] gen_data: {case['name']} src0={src0_shape} src1={src1_shape} dst={dst_shape} valid={valid_shape} dtype={dtype.__name__}") diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tcolexpandadd/launch.cpp b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tcolexpandadd/launch.cpp new file mode 100644 index 000000000..7a8e921c8 --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tcolexpandadd/launch.cpp @@ -0,0 +1,28 @@ +// Copyright (c) 2026 Huawei Technologies Co., Ltd. +// This program is free software, you can redistribute it and/or modify it under the terms and conditions of +// CANN Open Software License Agreement Version 2.0 (the "License"). +// Please refer to the License for details. You may not use this file except in compliance with the License. +// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. 
+// See LICENSE in the root of the software repository for the full text of the License. + +#include + +#ifndef AICORE +#define AICORE [aicore] +#endif + +// Smoke cases: fp16_10_64_1_64, int32_16_32_1_32 + +extern "C" __global__ AICORE void TCOLEXPANDADD_fp16_10_64_1_64(__gm__ uint16_t *src0, __gm__ uint16_t *src1, __gm__ uint16_t *dst); +extern "C" __global__ AICORE void TCOLEXPANDADD_int32_16_32_1_32(__gm__ int32_t *src0, __gm__ int32_t *src1, __gm__ int32_t *dst); + +void LaunchTCOLEXPANDADD_int32_16_32_1_32(int32_t *src0, int32_t *src1, int32_t *dst, void *stream) { + TCOLEXPANDADD_int32_16_32_1_32<<<1, nullptr, stream>>>((__gm__ int32_t *)src0, (__gm__ int32_t *)src1, (__gm__ int32_t *)dst); +} + + + +void LaunchTCOLEXPANDADD_fp16_10_64_1_64(uint16_t *src0, uint16_t *src1, uint16_t *dst, void *stream) { + TCOLEXPANDADD_fp16_10_64_1_64<<<1, nullptr, stream>>>((__gm__ uint16_t *)src0, (__gm__ uint16_t *)src1, (__gm__ uint16_t *)dst); +} diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tcolexpandadd/main.cpp b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tcolexpandadd/main.cpp new file mode 100644 index 000000000..50abe0354 --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tcolexpandadd/main.cpp @@ -0,0 +1,152 @@ +// Copyright (c) 2026 Huawei Technologies Co., Ltd. +// This program is free software, you can redistribute it and/or modify it under the terms and conditions of +// CANN Open Software License Agreement Version 2.0 (the "License"). +// Please refer to the License for details. You may not use this file except in compliance with the License. +// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. 
+ +// Host driver for TileLang tcolexpandadd ST +// TCOLEXPANDADD: src0 + expand(src1) -> dst + +#include "acl/acl.h" +#include "test_common.h" +#include +#include +#include +#include +#include +#include + +using namespace PtoTestCommon; + +// Kernel launch wrappers (defined in launch.cpp) +void LaunchTCOLEXPANDADD_fp32_32_32_1_32(float *src0, float *src1, float *dst, void *stream); +void LaunchTCOLEXPANDADD_fp16_10_64_1_64(uint16_t *src0, uint16_t *src1, uint16_t *dst, void *stream); +void LaunchTCOLEXPANDADD_int32_16_32_1_32(int32_t *src0, int32_t *src1, int32_t *dst, void *stream); +void LaunchTCOLEXPANDADD_int16_16_64_1_64(int16_t *src0, int16_t *src1, int16_t *dst, void *stream); + +using LaunchFn = void (*)(void *, void *, void *, void *); + +struct TestCase { + const char *name; + LaunchFn launch; + size_t src0Rows; + size_t src0Cols; + size_t src1Rows; + size_t src1Cols; + size_t dstRows; + size_t dstCols; + size_t validRows; + size_t validCols; + size_t elemSize; +}; + +static const TestCase kCases[] = { +{"fp16_10_64_1_64", (LaunchFn)LaunchTCOLEXPANDADD_fp16_10_64_1_64, 10, 64, 1, 64, 10, 64, 10, 64, sizeof(uint16_t)}, +{"int32_16_32_1_32", (LaunchFn)LaunchTCOLEXPANDADD_int32_16_32_1_32, 16, 32, 1, 32, 16, 32, 16, 32, sizeof(int32_t)}, +}; +static constexpr size_t kNumCases = sizeof(kCases) / sizeof(kCases[0]); + +static int RunCase(const TestCase &tc, int deviceId, aclrtStream stream) { + int rc = 0; + const size_t src0ElemCount = tc.src0Rows * tc.src0Cols; + const size_t src1ElemCount = tc.src1Rows * tc.src1Cols; + const size_t dstElemCount = tc.dstRows * tc.dstCols; + const size_t src0FileSize = src0ElemCount * tc.elemSize; + const size_t src1FileSize = src1ElemCount * tc.elemSize; + const size_t dstFileSize = dstElemCount * tc.elemSize; + + std::printf("[INFO] === case: %s (src0=%zux%zu, src1=%zux%zu -> dst=%zux%zu) ===\n", + tc.name, tc.src0Rows, tc.src0Cols, tc.src1Rows, tc.src1Cols, tc.dstRows, tc.dstCols); + + std::string caseDir = 
std::string("./") + tc.name; + size_t actualSrc0FileSize = src0FileSize; + size_t actualSrc1FileSize = src1FileSize; + + void *src0Host = nullptr, *src1Host = nullptr, *dstHost = nullptr; + void *src0Device = nullptr, *src1Device = nullptr, *dstDevice = nullptr; + + aclrtMallocHost((void **)(&src0Host), src0FileSize); + aclrtMallocHost((void **)(&src1Host), src1FileSize); + aclrtMallocHost((void **)(&dstHost), dstFileSize); + + aclrtMalloc((void **)&src0Device, src0FileSize, ACL_MEM_MALLOC_HUGE_FIRST); + aclrtMalloc((void **)&src1Device, src1FileSize, ACL_MEM_MALLOC_HUGE_FIRST); + aclrtMalloc((void **)&dstDevice, dstFileSize, ACL_MEM_MALLOC_HUGE_FIRST); + + if (!ReadFile((caseDir + "/input0.bin").c_str(), actualSrc0FileSize, src0Host, src0FileSize)) { + std::fprintf(stderr, "[ERROR] failed to read %s/input0.bin\n", caseDir.c_str()); + rc = 1; + } + if (rc == 0 && !ReadFile((caseDir + "/input1.bin").c_str(), actualSrc1FileSize, src1Host, src1FileSize)) { + std::fprintf(stderr, "[ERROR] failed to read %s/input1.bin\n", caseDir.c_str()); + rc = 1; + } + + if (rc == 0) { + aclrtMemcpy(src0Device, src0FileSize, src0Host, src0FileSize, ACL_MEMCPY_HOST_TO_DEVICE); + aclrtMemcpy(src1Device, src1FileSize, src1Host, src1FileSize, ACL_MEMCPY_HOST_TO_DEVICE); + + tc.launch(src0Device, src1Device, dstDevice, stream); + + aclrtSynchronizeStream(stream); + aclrtMemcpy(dstHost, dstFileSize, dstDevice, dstFileSize, ACL_MEMCPY_DEVICE_TO_HOST); + } + + if (rc == 0 && !WriteFile((caseDir + "/output.bin").c_str(), dstHost, dstFileSize)) { + std::fprintf(stderr, "[ERROR] failed to write %s/output.bin\n", caseDir.c_str()); + rc = 1; + } + + if (src0Device != nullptr) + aclrtFree(src0Device); + if (src1Device != nullptr) + aclrtFree(src1Device); + if (dstDevice != nullptr) + aclrtFree(dstDevice); + if (src0Host != nullptr) + aclrtFreeHost(src0Host); + if (src1Host != nullptr) + aclrtFreeHost(src1Host); + if (dstHost != nullptr) + aclrtFreeHost(dstHost); + + if (rc == 0) + 
std::printf("[INFO] case %s done\n", tc.name); + return rc; +} + +int main(int argc, char *argv[]) { + const char *caseFilter = (argc > 1) ? argv[1] : nullptr; + + int rc = 0; + int deviceId = 0; + aclrtStream stream = nullptr; + + aclInit(nullptr); + if (const char *envDevice = std::getenv("ACL_DEVICE_ID")) { + deviceId = std::atoi(envDevice); + } + aclrtSetDevice(deviceId); + aclrtCreateStream(&stream); + + for (size_t i = 0; i < kNumCases; ++i) { + if (caseFilter != nullptr && std::strcmp(kCases[i].name, caseFilter) != 0) { + continue; + } + int ret = RunCase(kCases[i], deviceId, stream); + if (ret != 0) { + std::fprintf(stderr, "[ERROR] case %s failed\n", kCases[i].name); + rc = 1; + break; + } + } + + if (stream != nullptr) + aclrtDestroyStream(stream); + aclrtResetDevice(deviceId); + aclFinalize(); + + return rc; +} diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tcolexpandadd/tcolexpandadd.pto b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tcolexpandadd/tcolexpandadd.pto new file mode 100644 index 000000000..2d8c72898 --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tcolexpandadd/tcolexpandadd.pto @@ -0,0 +1,142 @@ +// Copyright (c) 2026 Huawei Technologies Co., Ltd. +// This program is free software, you can redistribute it and/or modify it under the terms and conditions of +// CANN Open Software License Agreement Version 2.0 (the "License"). +// Please refer to the License for details. You may not use this file except in compliance with the License. +// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +// See LICENSE in the root of the software repository for the full text of the License. + +// TileLang ST kernels for pto.tcolexpandadd: expand src1 and add with src0. 
+// TCOLEXPANDADD: src0 + expand(src1) -> dst +// - src0: (dst_row, dst_col) +// - src1: (src1_row, src1_col), usually src1_row=1 +// - dst: (dst_row, dst_col) + +module attributes {pto.target_arch = "a5", pto.kernel_kind = #pto.kernel_kind} { + // Case: fp16_10_64_1_64 (float16, src0=(10,64), src1=(1,64), dst=(10,64)) + func.func @TCOLEXPANDADD_fp16_10_64_1_64(%src0_ptr: !pto.ptr, %src1_ptr: !pto.ptr, %dst_ptr: !pto.ptr) attributes {pto.kernel} { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c10 = arith.constant 10 : index + %c64 = arith.constant 64 : index + %c640 = arith.constant 640 : index + + %src0_view = pto.make_tensor_view %src0_ptr, + shape = [%c1, %c1, %c1, %c10, %c64], + strides = [%c640, %c640, %c640, %c64, %c1] + : !pto.tensor_view<1x1x1x10x64xf16> + + %src1_view = pto.make_tensor_view %src1_ptr, + shape = [%c1, %c1, %c1, %c1, %c64], + strides = [%c64, %c64, %c64, %c64, %c1] + : !pto.tensor_view<1x1x1x1x64xf16> + + %dst_view = pto.make_tensor_view %dst_ptr, + shape = [%c1, %c1, %c1, %c10, %c64], + strides = [%c640, %c640, %c640, %c64, %c1] + : !pto.tensor_view<1x1x1x10x64xf16> + + %src0_part = pto.partition_view %src0_view, + offsets = [%c0, %c0, %c0, %c0, %c0], + sizes = [%c1, %c1, %c1, %c10, %c64] + : !pto.tensor_view<1x1x1x10x64xf16> -> !pto.partition_tensor_view<1x1x1x10x64xf16> + + %src1_part = pto.partition_view %src1_view, + offsets = [%c0, %c0, %c0, %c0, %c0], + sizes = [%c1, %c1, %c1, %c1, %c64] + : !pto.tensor_view<1x1x1x1x64xf16> -> !pto.partition_tensor_view<1x1x1x1x64xf16> + + %dst_part = pto.partition_view %dst_view, + offsets = [%c0, %c0, %c0, %c0, %c0], + sizes = [%c1, %c1, %c1, %c10, %c64] + : !pto.tensor_view<1x1x1x10x64xf16> -> !pto.partition_tensor_view<1x1x1x10x64xf16> + + %src0_tile = pto.alloc_tile + : !pto.tile_buf + + %src1_tile = pto.alloc_tile + : !pto.tile_buf + + %dst_tile = pto.alloc_tile + : !pto.tile_buf + + pto.tload ins(%src0_part : !pto.partition_tensor_view<1x1x1x10x64xf16>) + 
outs(%src0_tile : !pto.tile_buf) + + pto.tload ins(%src1_part : !pto.partition_tensor_view<1x1x1x1x64xf16>) + outs(%src1_tile : !pto.tile_buf) + + pto.tcolexpandadd ins(%src0_tile, %src1_tile : !pto.tile_buf, + !pto.tile_buf) + outs(%dst_tile : !pto.tile_buf) + + pto.tstore ins(%dst_tile : !pto.tile_buf) + outs(%dst_part : !pto.partition_tensor_view<1x1x1x10x64xf16>) + return + } + + // Case 5: int32_16_32_1_32 (int32, src0=(16,32), src1=(1,32), dst=(16,32)) + + func.func @TCOLEXPANDADD_int32_16_32_1_32(%src0_ptr: !pto.ptr, %src1_ptr: !pto.ptr, %dst_ptr: !pto.ptr) attributes {pto.kernel} { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c16 = arith.constant 16 : index + %c32 = arith.constant 32 : index + %c512 = arith.constant 512 : index + + %src0_view = pto.make_tensor_view %src0_ptr, + shape = [%c1, %c1, %c1, %c16, %c32], + strides = [%c512, %c512, %c512, %c32, %c1] + : !pto.tensor_view<1x1x1x16x32xi32> + + %src1_view = pto.make_tensor_view %src1_ptr, + shape = [%c1, %c1, %c1, %c1, %c32], + strides = [%c32, %c32, %c32, %c32, %c1] + : !pto.tensor_view<1x1x1x1x32xi32> + + %dst_view = pto.make_tensor_view %dst_ptr, + shape = [%c1, %c1, %c1, %c16, %c32], + strides = [%c512, %c512, %c512, %c32, %c1] + : !pto.tensor_view<1x1x1x16x32xi32> + + %src0_part = pto.partition_view %src0_view, + offsets = [%c0, %c0, %c0, %c0, %c0], + sizes = [%c1, %c1, %c1, %c16, %c32] + : !pto.tensor_view<1x1x1x16x32xi32> -> !pto.partition_tensor_view<1x1x1x16x32xi32> + + %src1_part = pto.partition_view %src1_view, + offsets = [%c0, %c0, %c0, %c0, %c0], + sizes = [%c1, %c1, %c1, %c1, %c32] + : !pto.tensor_view<1x1x1x1x32xi32> -> !pto.partition_tensor_view<1x1x1x1x32xi32> + + %dst_part = pto.partition_view %dst_view, + offsets = [%c0, %c0, %c0, %c0, %c0], + sizes = [%c1, %c1, %c1, %c16, %c32] + : !pto.tensor_view<1x1x1x16x32xi32> -> !pto.partition_tensor_view<1x1x1x16x32xi32> + + %src0_tile = pto.alloc_tile + : !pto.tile_buf + + %src1_tile = pto.alloc_tile + : 
!pto.tile_buf + + %dst_tile = pto.alloc_tile + : !pto.tile_buf + + pto.tload ins(%src0_part : !pto.partition_tensor_view<1x1x1x16x32xi32>) + outs(%src0_tile : !pto.tile_buf) + + pto.tload ins(%src1_part : !pto.partition_tensor_view<1x1x1x1x32xi32>) + outs(%src1_tile : !pto.tile_buf) + + pto.tcolexpandadd ins(%src0_tile, %src1_tile : !pto.tile_buf, + !pto.tile_buf) + outs(%dst_tile : !pto.tile_buf) + + pto.tstore ins(%dst_tile : !pto.tile_buf) + outs(%dst_part : !pto.partition_tensor_view<1x1x1x16x32xi32>) + return + } + + // Case 6: int16_16_64_1_64 (int16, src0=(16,64), src1=(1,64), dst=(16,64)) +} diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tcolexpanddiv/CMakeLists.txt b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tcolexpanddiv/CMakeLists.txt new file mode 100644 index 000000000..2f64c395b --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tcolexpanddiv/CMakeLists.txt @@ -0,0 +1,9 @@ +# Copyright (c) 2026 Huawei Technologies Co., Ltd. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. + +pto_tilelang_vec_st(tcolexpanddiv) diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tcolexpanddiv/cases.py b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tcolexpanddiv/cases.py new file mode 100644 index 000000000..554fde09a --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tcolexpanddiv/cases.py @@ -0,0 +1,122 @@ +# Copyright (c) 2026 Huawei Technologies Co., Ltd. 
+# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. +#!/usr/bin/python3 +# Copyright (c) 2026 Huawei Technologies Co., Ltd. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. + +# coding=utf-8 + +"""Single source of truth for tcolexpanddiv ST test cases. 
+Matches PTO-ISA testcase definitions in tests/npu/a5/src/st/testcase/tcolexpanddiv/ (relative to the pto-isa repository root) + +TCOLEXPANDDIV: column-wise broadcast divide - dst[i,j] = src0[i,j] / src1[0,j] + - src0_shape: (src0_row, cols) - dividend input tile + - src1_shape: (1, cols) - divisor input tile (single row, broadcast) + - dst_shape: (dst_row, cols) - output tile + - valid_shape: (valid_row, valid_col) - effective computation region + +Case naming: {dtype}_{src0_row}_{src0_col}_{src1_row}_{src1_col} +""" + +import numpy as np + +CASES = [ + { + "name": "fp32_32_64_1_64", + "dtype": np.float32, + "src0_shape": (32, 64), + "src1_shape": (1, 64), + "shape": (32, 64), + "valid_shape": (32, 64), + "eps": 1e-6, + }, + { + "name": "fp32_8_32_1_32", + "dtype": np.float32, + "src0_shape": (8, 32), + "src1_shape": (1, 32), + "shape": (8, 32), + "valid_shape": (8, 32), + "eps": 1e-6, + }, + { + "name": "fp16_16_64_1_64", + "dtype": np.float16, + "src0_shape": (16, 64), + "src1_shape": (1, 64), + "shape": (16, 64), + "valid_shape": (16, 64), + "eps": 1e-3, + }, + { + "name": "fp16_4_128_1_128", + "dtype": np.float16, + "src0_shape": (4, 128), + "src1_shape": (1, 128), + "shape": (4, 128), + "valid_shape": (4, 128), + "eps": 1e-3, + }, + { + "name": "int32_16_32_1_32", + "dtype": np.int32, + "src0_shape": (16, 32), + "src1_shape": (1, 32), + "shape": (16, 32), + "valid_shape": (16, 32), + "eps": 0, + }, + { + "name": "int16_16_64_1_64", + "dtype": np.int16, + "src0_shape": (16, 64), + "src1_shape": (1, 64), + "shape": (16, 64), + "valid_shape": (16, 64), + "eps": 0, + }, + { + "name": "fp32_40_32_1_32", + "dtype": np.float32, + "src0_shape": (40, 32), + "src1_shape": (1, 32), + "shape": (40, 32), + "valid_shape": (40, 32), + "eps": 1e-6, + }, + { + "name": "fp16_16_128_1_128", + "dtype": np.float16, + "src0_shape": (16, 128), + "src1_shape": (1, 128), + "shape": (16, 128), + "valid_shape": (16, 128), + "eps": 1e-3, + }, + { + "name": "fp32_20_64_1_64", + "dtype": np.float32, 
+ "src0_shape": (20, 64), + "src1_shape": (1, 64), + "shape": (20, 64), + "valid_shape": (20, 64), + "eps": 1e-6, + }, +] + +_SMOKE_CASE_NAMES = ['fp32_8_32_1_32', 'int32_16_32_1_32'] +_SMOKE_CASE_NAME_SET = set(_SMOKE_CASE_NAMES) +_missing = [name for name in _SMOKE_CASE_NAMES if name not in {case["name"] for case in CASES}] +if _missing: + raise RuntimeError("unknown smoke case(s): " + ", ".join(_missing)) +CASES = [case for case in CASES if case["name"] in _SMOKE_CASE_NAME_SET] diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tcolexpanddiv/compare.py b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tcolexpanddiv/compare.py new file mode 100644 index 000000000..b8ddb1131 --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tcolexpanddiv/compare.py @@ -0,0 +1,56 @@ +# Copyright (c) 2026 Huawei Technologies Co., Ltd. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. +#!/usr/bin/python3 +# Copyright (c) 2026 Huawei Technologies Co., Ltd. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. 
+# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. + +# coding=utf-8 + +import os +import sys +import numpy as np + +from cases import CASES +from st_common import result_cmp, style_fail, style_pass, validate_cases + + +def main(): + validate_cases(CASES) + case_filter = sys.argv[1] if len(sys.argv) > 1 else None + + all_passed = True + for case in CASES: + if case_filter is not None and case["name"] != case_filter: + continue + + case_dir = case["name"] + shape = case["shape"] + vr, vc = case["valid_shape"] + + golden = np.fromfile(os.path.join(case_dir, "golden.bin"), dtype=case["dtype"]).reshape(shape) + output = np.fromfile(os.path.join(case_dir, "output.bin"), dtype=case["dtype"]).reshape(shape) + + ok = result_cmp(golden[:vr, :vc], output[:vr, :vc], case["eps"]) + if ok: + print(style_pass(f"[INFO] {case['name']}: compare passed")) + else: + print(style_fail(f"[ERROR] {case['name']}: compare failed")) + all_passed = False + + if not all_passed: + sys.exit(2) + print(style_pass("[INFO] all cases passed")) + + +if __name__ == "__main__": + main() diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tcolexpanddiv/gen_data.py b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tcolexpanddiv/gen_data.py new file mode 100644 index 000000000..8fb0486f0 --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tcolexpanddiv/gen_data.py @@ -0,0 +1,38 @@ +#!/usr/bin/python3 +# Copyright (c) 2026 Huawei Technologies Co., Ltd. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. 
+# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. + +# coding=utf-8 + +import numpy as np +from cases import CASES +from st_common import validate_cases, setup_case_rng, save_case_data + +validate_cases(CASES) + +for case in CASES: + setup_case_rng(case) + + dtype = case["dtype"] + src0_shape = case["src0_shape"] + dst_shape = case["shape"] + src1_shape = case["src1_shape"] + valid_shape = case["valid_shape"] + + src0 = np.random.uniform(1.0, 10.0, size=src0_shape).astype(dtype) + src1 = np.random.uniform(1.0, 10.0, size=src1_shape).astype(dtype) + + valid_row, valid_col = valid_shape + reps = dst_shape[0] // src1_shape[0] + + golden = np.zeros(dst_shape, dtype=dtype) + expanded_src1 = np.tile(src1, (reps, 1))[:, :valid_col] + golden[:valid_row, :valid_col] = src0[:valid_row, :valid_col] / expanded_src1 + + save_case_data(case["name"], {"input0": src0, "input1": src1, "golden": golden}) + print(f"[INFO] gen_data: {case['name']} src0={src0_shape} src1={src1_shape} dst={dst_shape} valid={valid_shape} dtype={dtype.__name__}") diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tcolexpanddiv/launch.cpp b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tcolexpanddiv/launch.cpp new file mode 100644 index 000000000..90c5d6332 --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tcolexpanddiv/launch.cpp @@ -0,0 +1,28 @@ +// Copyright (c) 2026 Huawei Technologies Co., Ltd. +// This program is free software, you can redistribute it and/or modify it under the terms and conditions of +// CANN Open Software License Agreement Version 2.0 (the "License"). +// Please refer to the License for details. You may not use this file except in compliance with the License. 
+// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +// See LICENSE in the root of the software repository for the full text of the License. + +#include + +#ifndef AICORE +#define AICORE [aicore] +#endif + +// Case 1: fp32_32_64_1_64 + +extern "C" __global__ AICORE void TCOLEXPANDDIV_fp32_8_32_1_32(__gm__ float *src0, __gm__ float *src1, __gm__ float *dst); +extern "C" __global__ AICORE void TCOLEXPANDDIV_int32_16_32_1_32(__gm__ int32_t *src0, __gm__ int32_t *src1, __gm__ int32_t *dst); + +void LaunchTCOLEXPANDDIV_fp32_8_32_1_32(float *src0, float *src1, float *dst, void *stream) { + TCOLEXPANDDIV_fp32_8_32_1_32<<<1, nullptr, stream>>>((__gm__ float *)src0, (__gm__ float *)src1, (__gm__ float *)dst); +} + + + +void LaunchTCOLEXPANDDIV_int32_16_32_1_32(int32_t *src0, int32_t *src1, int32_t *dst, void *stream) { + TCOLEXPANDDIV_int32_16_32_1_32<<<1, nullptr, stream>>>((__gm__ int32_t *)src0, (__gm__ int32_t *)src1, (__gm__ int32_t *)dst); +} diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tcolexpanddiv/main.cpp b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tcolexpanddiv/main.cpp new file mode 100644 index 000000000..0516cd07f --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tcolexpanddiv/main.cpp @@ -0,0 +1,156 @@ +// Copyright (c) 2026 Huawei Technologies Co., Ltd. +// This program is free software, you can redistribute it and/or modify it under the terms and conditions of +// CANN Open Software License Agreement Version 2.0 (the "License"). +// Please refer to the License for details. You may not use this file except in compliance with the License. +// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. 
+// See LICENSE in the root of the software repository for the full text of the License. + +// Host driver for TileLang tcolexpanddiv ST +// Test cases match PTO-ISA: /home/zhoushaofan/code/pto-isa/tests/npu/a5/src/st/testcase/tcolexpanddiv/ +// TCOLEXPANDDIV: column-wise broadcast divide - dst[i,j] = src0[i,j] / src1[0,j] + +#include "acl/acl.h" +#include "test_common.h" +#include +#include +#include +#include +#include +#include + +using namespace PtoTestCommon; + +// Kernel launch wrappers (defined in launch.cpp) +void LaunchTCOLEXPANDDIV_fp32_8_32_1_32(float *src0, float *src1, float *dst, void *stream); +void LaunchTCOLEXPANDDIV_fp16_4_128_1_128(uint16_t *src0, uint16_t *src1, uint16_t *dst, void *stream); +void LaunchTCOLEXPANDDIV_int32_16_32_1_32(int32_t *src0, int32_t *src1, int32_t *dst, void *stream); +void LaunchTCOLEXPANDDIV_int16_16_64_1_64(int16_t *src0, int16_t *src1, int16_t *dst, void *stream); +void LaunchTCOLEXPANDDIV_fp16_16_128_1_128(uint16_t *src0, uint16_t *src1, uint16_t *dst, void *stream); + +using LaunchFn = void (*)(void *, void *, void *, void *); + +struct TestCase { + const char *name; + LaunchFn launch; + size_t src0Rows; + size_t src0Cols; + size_t src1Rows; + size_t src1Cols; + size_t dstRows; + size_t dstCols; + size_t validRows; + size_t validCols; + size_t elemSize; +}; + +static const TestCase kCases[] = { +{"fp32_8_32_1_32", (LaunchFn)LaunchTCOLEXPANDDIV_fp32_8_32_1_32, 8, 32, 1, 32, 8, 32, 8, 32, sizeof(float)}, +{"int32_16_32_1_32", (LaunchFn)LaunchTCOLEXPANDDIV_int32_16_32_1_32, 16, 32, 1, 32, 16, 32, 16, 32, sizeof(int32_t)}, +}; +static constexpr size_t kNumCases = sizeof(kCases) / sizeof(kCases[0]); + +static int RunCase(const TestCase &tc, int deviceId, aclrtStream stream) { + int rc = 0; + const size_t src0ElemCount = tc.src0Rows * tc.src0Cols; + const size_t src1ElemCount = tc.src1Rows * tc.src1Cols; + const size_t dstElemCount = tc.dstRows * tc.dstCols; + const size_t src0FileSize = src0ElemCount * tc.elemSize; + 
const size_t src1FileSize = src1ElemCount * tc.elemSize; + const size_t dstFileSize = dstElemCount * tc.elemSize; + + std::printf("[INFO] === case: %s (src0=%zux%zu, src1=%zux%zu -> dst=%zux%zu, valid=%zux%zu) ===\n", + tc.name, tc.src0Rows, tc.src0Cols, tc.src1Rows, tc.src1Cols, + tc.dstRows, tc.dstCols, tc.validRows, tc.validCols); + + std::string caseDir = std::string("./") + tc.name; + size_t actualSrc0FileSize = src0FileSize; + size_t actualSrc1FileSize = src1FileSize; + + void *src0Host = nullptr, *src1Host = nullptr, *dstHost = nullptr; + void *src0Device = nullptr, *src1Device = nullptr, *dstDevice = nullptr; + + aclrtMallocHost((void **)(&src0Host), src0FileSize); + aclrtMallocHost((void **)(&src1Host), src1FileSize); + aclrtMallocHost((void **)(&dstHost), dstFileSize); + + aclrtMalloc((void **)&src0Device, src0FileSize, ACL_MEM_MALLOC_HUGE_FIRST); + aclrtMalloc((void **)&src1Device, src1FileSize, ACL_MEM_MALLOC_HUGE_FIRST); + aclrtMalloc((void **)&dstDevice, dstFileSize, ACL_MEM_MALLOC_HUGE_FIRST); + + if (!ReadFile((caseDir + "/input0.bin").c_str(), actualSrc0FileSize, src0Host, src0FileSize)) { + std::fprintf(stderr, "[ERROR] failed to read %s/input0.bin\n", caseDir.c_str()); + rc = 1; + } + + if (rc == 0 && !ReadFile((caseDir + "/input1.bin").c_str(), actualSrc1FileSize, src1Host, src1FileSize)) { + std::fprintf(stderr, "[ERROR] failed to read %s/input1.bin\n", caseDir.c_str()); + rc = 1; + } + + if (rc == 0) { + aclrtMemcpy(src0Device, src0FileSize, src0Host, src0FileSize, ACL_MEMCPY_HOST_TO_DEVICE); + aclrtMemcpy(src1Device, src1FileSize, src1Host, src1FileSize, ACL_MEMCPY_HOST_TO_DEVICE); + + tc.launch(src0Device, src1Device, dstDevice, stream); + + aclrtSynchronizeStream(stream); + aclrtMemcpy(dstHost, dstFileSize, dstDevice, dstFileSize, ACL_MEMCPY_DEVICE_TO_HOST); + } + + if (rc == 0 && !WriteFile((caseDir + "/output.bin").c_str(), dstHost, dstFileSize)) { + std::fprintf(stderr, "[ERROR] failed to write %s/output.bin\n", caseDir.c_str()); + rc = 
1; + } + + if (src0Device != nullptr) + aclrtFree(src0Device); + if (src1Device != nullptr) + aclrtFree(src1Device); + if (dstDevice != nullptr) + aclrtFree(dstDevice); + if (src0Host != nullptr) + aclrtFreeHost(src0Host); + if (src1Host != nullptr) + aclrtFreeHost(src1Host); + if (dstHost != nullptr) + aclrtFreeHost(dstHost); + + if (rc == 0) + std::printf("[INFO] case %s done\n", tc.name); + return rc; +} + +int main(int argc, char *argv[]) { + const char *caseFilter = (argc > 1) ? argv[1] : nullptr; + + int rc = 0; + int deviceId = 0; + aclrtStream stream = nullptr; + + aclInit(nullptr); + if (const char *envDevice = std::getenv("ACL_DEVICE_ID")) { + deviceId = std::atoi(envDevice); + } + aclrtSetDevice(deviceId); + aclrtCreateStream(&stream); + + for (size_t i = 0; i < kNumCases; ++i) { + if (caseFilter != nullptr && std::strcmp(kCases[i].name, caseFilter) != 0) { + continue; + } + int ret = RunCase(kCases[i], deviceId, stream); + if (ret != 0) { + std::fprintf(stderr, "[ERROR] case %s failed\n", kCases[i].name); + rc = 1; + break; + } + } + + if (stream != nullptr) + aclrtDestroyStream(stream); + aclrtResetDevice(deviceId); + aclFinalize(); + + return rc; +} diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tcolexpanddiv/tcolexpanddiv.pto b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tcolexpanddiv/tcolexpanddiv.pto new file mode 100644 index 000000000..80dc2792a --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tcolexpanddiv/tcolexpanddiv.pto @@ -0,0 +1,140 @@ +// Copyright (c) 2026 Huawei Technologies Co., Ltd. +// This program is free software, you can redistribute it and/or modify it under the terms and conditions of +// CANN Open Software License Agreement Version 2.0 (the "License"). +// Please refer to the License for details. You may not use this file except in compliance with the License. 
+// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +// See LICENSE in the root of the software repository for the full text of the License. + +// TileLang ST kernels for pto.tcolexpanddiv: column-wise broadcast divide. +// Matches PTO-ISA testcase parameters. +// Key: tile_buf cols = full tensor width, v_col = valid portion + +module attributes {pto.target_arch = "a5", pto.kernel_kind = #pto.kernel_kind} { + // Case 1: fp32_32_64_1_64 (float32, src0=(32,64), src1=(1,64), dst=(32,64)) + func.func @TCOLEXPANDDIV_fp32_8_32_1_32(%src0_ptr: !pto.ptr, %src1_ptr: !pto.ptr, %dst_ptr: !pto.ptr) attributes {pto.kernel} { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c8 = arith.constant 8 : index + %c32 = arith.constant 32 : index + %c256 = arith.constant 256 : index + + %src0_view = pto.make_tensor_view %src0_ptr, + shape = [%c1, %c1, %c1, %c8, %c32], + strides = [%c256, %c256, %c256, %c32, %c1] + : !pto.tensor_view<1x1x1x8x32xf32> + + %src1_view = pto.make_tensor_view %src1_ptr, + shape = [%c1, %c1, %c1, %c1, %c32], + strides = [%c32, %c32, %c32, %c32, %c1] + : !pto.tensor_view<1x1x1x1x32xf32> + + %dst_view = pto.make_tensor_view %dst_ptr, + shape = [%c1, %c1, %c1, %c8, %c32], + strides = [%c256, %c256, %c256, %c32, %c1] + : !pto.tensor_view<1x1x1x8x32xf32> + + %src0_part = pto.partition_view %src0_view, + offsets = [%c0, %c0, %c0, %c0, %c0], + sizes = [%c1, %c1, %c1, %c8, %c32] + : !pto.tensor_view<1x1x1x8x32xf32> -> !pto.partition_tensor_view<1x1x1x8x32xf32> + + %src1_part = pto.partition_view %src1_view, + offsets = [%c0, %c0, %c0, %c0, %c0], + sizes = [%c1, %c1, %c1, %c1, %c32] + : !pto.tensor_view<1x1x1x1x32xf32> -> !pto.partition_tensor_view<1x1x1x1x32xf32> + + %dst_part = pto.partition_view %dst_view, + offsets = [%c0, %c0, %c0, %c0, %c0], + sizes = [%c1, %c1, %c1, %c8, %c32] + : 
!pto.tensor_view<1x1x1x8x32xf32> -> !pto.partition_tensor_view<1x1x1x8x32xf32> + + %src0 = pto.alloc_tile + : !pto.tile_buf + + %src1 = pto.alloc_tile + : !pto.tile_buf + + %dst = pto.alloc_tile + : !pto.tile_buf + + pto.tload ins(%src0_part : !pto.partition_tensor_view<1x1x1x8x32xf32>) + outs(%src0 : !pto.tile_buf) + + pto.tload ins(%src1_part : !pto.partition_tensor_view<1x1x1x1x32xf32>) + outs(%src1 : !pto.tile_buf) + + pto.tcolexpanddiv ins(%src0, %src1 : !pto.tile_buf, + !pto.tile_buf) + outs(%dst : !pto.tile_buf) + + pto.tstore ins(%dst : !pto.tile_buf) + outs(%dst_part : !pto.partition_tensor_view<1x1x1x8x32xf32>) + return + } + + // Case 3: fp16_16_64_1_64 (float16, src0=(16,64), src1=(1,64), dst=(16,64)) + + func.func @TCOLEXPANDDIV_int32_16_32_1_32(%src0_ptr: !pto.ptr, %src1_ptr: !pto.ptr, %dst_ptr: !pto.ptr) attributes {pto.kernel} { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c16 = arith.constant 16 : index + %c32 = arith.constant 32 : index + %c512 = arith.constant 512 : index + + %src0_view = pto.make_tensor_view %src0_ptr, + shape = [%c1, %c1, %c1, %c16, %c32], + strides = [%c512, %c512, %c512, %c32, %c1] + : !pto.tensor_view<1x1x1x16x32xi32> + + %src1_view = pto.make_tensor_view %src1_ptr, + shape = [%c1, %c1, %c1, %c1, %c32], + strides = [%c32, %c32, %c32, %c32, %c1] + : !pto.tensor_view<1x1x1x1x32xi32> + + %dst_view = pto.make_tensor_view %dst_ptr, + shape = [%c1, %c1, %c1, %c16, %c32], + strides = [%c512, %c512, %c512, %c32, %c1] + : !pto.tensor_view<1x1x1x16x32xi32> + + %src0_part = pto.partition_view %src0_view, + offsets = [%c0, %c0, %c0, %c0, %c0], + sizes = [%c1, %c1, %c1, %c16, %c32] + : !pto.tensor_view<1x1x1x16x32xi32> -> !pto.partition_tensor_view<1x1x1x16x32xi32> + + %src1_part = pto.partition_view %src1_view, + offsets = [%c0, %c0, %c0, %c0, %c0], + sizes = [%c1, %c1, %c1, %c1, %c32] + : !pto.tensor_view<1x1x1x1x32xi32> -> !pto.partition_tensor_view<1x1x1x1x32xi32> + + %dst_part = pto.partition_view %dst_view, 
+ offsets = [%c0, %c0, %c0, %c0, %c0], + sizes = [%c1, %c1, %c1, %c16, %c32] + : !pto.tensor_view<1x1x1x16x32xi32> -> !pto.partition_tensor_view<1x1x1x16x32xi32> + + %src0 = pto.alloc_tile + : !pto.tile_buf + + %src1 = pto.alloc_tile + : !pto.tile_buf + + %dst = pto.alloc_tile + : !pto.tile_buf + + pto.tload ins(%src0_part : !pto.partition_tensor_view<1x1x1x16x32xi32>) + outs(%src0 : !pto.tile_buf) + + pto.tload ins(%src1_part : !pto.partition_tensor_view<1x1x1x1x32xi32>) + outs(%src1 : !pto.tile_buf) + + pto.tcolexpanddiv ins(%src0, %src1 : !pto.tile_buf, + !pto.tile_buf) + outs(%dst : !pto.tile_buf) + + pto.tstore ins(%dst : !pto.tile_buf) + outs(%dst_part : !pto.partition_tensor_view<1x1x1x16x32xi32>) + return + } + + // Case 6: int16_16_64_1_64 (int16, src0=(16,64), src1=(1,64), dst=(16,64)) +} diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tcolexpandexpdif/CMakeLists.txt b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tcolexpandexpdif/CMakeLists.txt new file mode 100644 index 000000000..75e520a81 --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tcolexpandexpdif/CMakeLists.txt @@ -0,0 +1,9 @@ +# Copyright (c) 2026 Huawei Technologies Co., Ltd. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. 
+
+pto_tilelang_vec_st(tcolexpandexpdif)
diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tcolexpandexpdif/cases.py b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tcolexpandexpdif/cases.py
new file mode 100644
index 000000000..a5268c679
--- /dev/null
+++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tcolexpandexpdif/cases.py
@@ -0,0 +1,74 @@
+#!/usr/bin/python3
+# Copyright (c) 2026 Huawei Technologies Co., Ltd.
+# This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+# CANN Open Software License Agreement Version 2.0 (the "License").
+# Please refer to the License for details. You may not use this file except in compliance with the License.
+# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+# See LICENSE in the root of the software repository for the full text of the License.
+
+# coding=utf-8
+
+"""Single source of truth for tcolexpandexpdif ST test cases.
+Matches the PTO-ISA testcase definitions in tests/npu/a5/src/st/testcase/tcolexpandexpdif/ of the pto-isa repository.
+
+TCOLEXPANDEXPDIF: compute exp(src0 - expanded_src1) where src1 is expanded by tiling along the rows.
+  - src0_shape: (src0_row, cols) - first input tile
+  - src1_shape: (src1_row, cols) - second input tile (tiled to match src0 rows)
+  - dst_shape: (dst_row, dst_col) - output tile
+  - shape: (dst_row, dst_col) - alias of dst_shape, for compare.py compatibility
+  - valid_shape: (valid_row, valid_col) - effective computation region
+
+Golden: np.exp(src0 - np.tile(src1, (reps, 1))[:dst_row, :dst_col])
+    where reps = ceil(dst_row / src1_row), as computed in gen_data.py
+
+Case naming: {dtype}_{src0_row}_{src0_col}_{src1_row}_{src1_col}
+"""
+
+import numpy as np
+
+CASES = [
+    {
+        "name": "fp32_32_16_1_16",
+        "dtype": np.float32,
+        "src0_shape": (32, 16),
+        "src1_shape": (1, 16),
+        "shape": (32, 16),
+        "valid_shape": (32, 16),
+        "eps": 1e-5,
+    },
+    {
+        "name": "fp32_16_32_1_32",
+        "dtype": np.float32,
+        "src0_shape": (16, 32),
+        "src1_shape": (1, 32),
+        "shape": (16, 32),
+        "valid_shape": (16, 32),
+        "eps": 1e-5,
+    },
+    {
+        "name": "fp16_32_32_1_32",
+        "dtype": np.float16,
+        "src0_shape": (32, 32),
+        "src1_shape": (1, 32),
+        "shape": (32, 32),
+        "valid_shape": (32, 32),
+        "eps": 1e-2,
+    },
+    {
+        "name": "fp16_16_128_1_128",
+        "dtype": np.float16,
+        "src0_shape": (16, 128),
+        "src1_shape": (1, 128),
+        "shape": (16, 128),
+        "valid_shape": (16, 128),
+        "eps": 1e-2,
+    },
+]
+# Smoke subset: CASES is narrowed to the names below; a name missing from the table above aborts the import.
+_SMOKE_CASE_NAMES = ['fp32_32_16_1_16', 'fp16_32_32_1_32']
+_SMOKE_CASE_NAME_SET = set(_SMOKE_CASE_NAMES)
+_missing = [name for name in _SMOKE_CASE_NAMES if name not in {case["name"] for case in CASES}]
+if _missing:
+    raise RuntimeError("unknown smoke case(s): " + ", ".join(_missing))
+CASES = [case for case in CASES if case["name"] in _SMOKE_CASE_NAME_SET]
diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tcolexpandexpdif/compare.py b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tcolexpandexpdif/compare.py
new file mode 100644
index 000000000..b8ddb1131
--- /dev/null
+++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tcolexpandexpdif/compare.py
@@ -0,0 +1,56 @@
+# Copyright (c) 2026 Huawei Technologies Co., Ltd.
+# This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+# CANN Open Software License Agreement Version 2.0 (the "License").
+# Please refer to the License for details. You may not use this file except in compliance with the License.
+# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+# See LICENSE in the root of the software repository for the full text of the License.
+#!/usr/bin/python3
+# Copyright (c) 2026 Huawei Technologies Co., Ltd.
+# This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+# CANN Open Software License Agreement Version 2.0 (the "License").
+# Please refer to the License for details. You may not use this file except in compliance with the License.
+# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+# See LICENSE in the root of the software repository for the full text of the License.
+
+# coding=utf-8
+
+import os
+import sys
+import numpy as np
+
+from cases import CASES
+from st_common import result_cmp, style_fail, style_pass, validate_cases
+
+
+def main():  # Check each selected case's output.bin against golden.bin; exit with status 2 if any case fails.
+    validate_cases(CASES)
+    case_filter = sys.argv[1] if len(sys.argv) > 1 else None  # optional exact case name from the command line
+    # NOTE(review): when the filter matches no case the loop body never runs and "all cases passed" is printed.
+    all_passed = True
+    for case in CASES:
+        if case_filter is not None and case["name"] != case_filter:
+            continue
+        # Case files live in a directory named after the case, resolved against the working directory.
+        case_dir = case["name"]
+        shape = case["shape"]
+        vr, vc = case["valid_shape"]
+        # Raw binary dumps in the case dtype; reshape raises if a file does not hold exactly `shape` elements.
+        golden = np.fromfile(os.path.join(case_dir, "golden.bin"), dtype=case["dtype"]).reshape(shape)
+        output = np.fromfile(os.path.join(case_dir, "output.bin"), dtype=case["dtype"]).reshape(shape)
+        # Restrict the comparison to the leading vr x vc region and pass the per-case eps to result_cmp.
+        ok = result_cmp(golden[:vr, :vc], output[:vr, :vc], case["eps"])
+        if ok:
+            print(style_pass(f"[INFO] {case['name']}: compare passed"))
+        else:
+            print(style_fail(f"[ERROR] {case['name']}: compare failed"))
+            all_passed = False  # continue so that every failing case gets reported
+
+    if not all_passed:
+        sys.exit(2)
+    print(style_pass("[INFO] all cases passed"))
+
+
+if __name__ == "__main__":
+    main()
diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tcolexpandexpdif/gen_data.py b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tcolexpandexpdif/gen_data.py
new file mode 100644
index 000000000..f02eb9ffb
--- /dev/null
+++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tcolexpandexpdif/gen_data.py
@@ -0,0 +1,39 @@
+#!/usr/bin/python3
+# Copyright (c) 2026 Huawei Technologies Co., Ltd.
+# This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+# CANN Open Software License Agreement Version 2.0 (the "License").
+# Please refer to the License for details. You may not use this file except in compliance with the License.
+# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+# See LICENSE in the root of the software repository for the full text of the License.
+
+# coding=utf-8
+
+import numpy as np
+from cases import CASES
+from st_common import validate_cases, setup_case_rng, save_case_data
+
+validate_cases(CASES)
+# Write input0/input1/golden for every case; golden = exp(src0 - src1) with src1 repeated along the rows.
+for case in CASES:
+    setup_case_rng(case)
+    # Per-case parameters from cases.py; "shape" is the destination tile shape.
+    dtype = case["dtype"]
+    src0_shape = case["src0_shape"]
+    dst_shape = case["shape"]
+    src1_shape = case["src1_shape"]
+    valid_shape = case["valid_shape"]  # read but not used below: golden covers the whole dst tile
+    # NOTE(review): src0 - src1 can reach about 254, whose exp overflows float16/float32 to inf - confirm intended.
+    src0 = np.random.uniform(-255, 255, size=src0_shape).astype(dtype)
+    src1 = np.random.uniform(1, 255, size=src1_shape).astype(dtype)
+    # Ceil-divide so the tiled src1 has at least dst_row rows before it is cropped to the dst shape.
+    dst_row, dst_col = dst_shape
+    src1_row = src1_shape[0]
+    reps = (dst_row + src1_row - 1) // src1_row
+    # The exponential is evaluated in float64 and only then cast back to the case dtype.
+    expanded_src1 = np.tile(src1, (reps, 1))[:dst_row, :dst_col]
+    golden = np.exp((src0.astype(np.float64) - expanded_src1.astype(np.float64)))
+    golden = golden.astype(dtype)
+
+    save_case_data(case["name"], {"input0": src0, "input1": src1, "golden": golden})
+    print(f"[INFO] gen_data: {case['name']} src0={src0_shape} src1={src1_shape} dst={dst_shape} dtype={dtype.__name__}")
diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tcolexpandexpdif/launch.cpp b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tcolexpandexpdif/launch.cpp
new file mode 100644
index 000000000..9e3f3c3f6
--- /dev/null
+++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tcolexpandexpdif/launch.cpp
@@ -0,0 +1,28 @@
+// Copyright (c) 2026 Huawei Technologies Co., Ltd.
+// This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+// CANN Open Software License Agreement Version 2.0 (the "License").
+// Please refer to the License for details. You may not use this file except in compliance with the License.
+// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+// See LICENSE in the root of the software repository for the full text of the License. + +#include + +#ifndef AICORE +#define AICORE [aicore] +#endif + +// Case 1: fp32_32_16_1_16 + +extern "C" __global__ AICORE void TCOLEXPANDEXPDIF_fp32_32_16_1_16(__gm__ float *src0, __gm__ float *src1, __gm__ float *dst); +extern "C" __global__ AICORE void TCOLEXPANDEXPDIF_fp16_32_32_1_32(__gm__ uint16_t *src0, __gm__ uint16_t *src1, __gm__ uint16_t *dst); + +void LaunchTCOLEXPANDEXPDIF_fp32_32_16_1_16(float *src0, float *src1, float *dst, void *stream) { + TCOLEXPANDEXPDIF_fp32_32_16_1_16<<<1, nullptr, stream>>>((__gm__ float *)src0, (__gm__ float *)src1, (__gm__ float *)dst); +} + + + +void LaunchTCOLEXPANDEXPDIF_fp16_32_32_1_32(uint16_t *src0, uint16_t *src1, uint16_t *dst, void *stream) { + TCOLEXPANDEXPDIF_fp16_32_32_1_32<<<1, nullptr, stream>>>((__gm__ uint16_t *)src0, (__gm__ uint16_t *)src1, (__gm__ uint16_t *)dst); +} diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tcolexpandexpdif/main.cpp b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tcolexpandexpdif/main.cpp new file mode 100644 index 000000000..ecb503026 --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tcolexpandexpdif/main.cpp @@ -0,0 +1,155 @@ +// Copyright (c) 2026 Huawei Technologies Co., Ltd. +// This program is free software, you can redistribute it and/or modify it under the terms and conditions of +// CANN Open Software License Agreement Version 2.0 (the "License"). +// Please refer to the License for details. You may not use this file except in compliance with the License. +// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +// See LICENSE in the root of the software repository for the full text of the License. 
+ +// Host driver for TileLang tcolexpandexpdif ST +// Test cases match PTO-ISA: /home/zhoushaofan/code/pto-isa/tests/npu/a5/src/st/testcase/tcolexpandexpdif/ +// TCOLEXPANDEXPDIF: compute exp(src0) - exp(tiled_src1) + +#include "acl/acl.h" +#include "test_common.h" +#include +#include +#include +#include +#include +#include + +using namespace PtoTestCommon; + +// Kernel launch wrappers (defined in launch.cpp) +void LaunchTCOLEXPANDEXPDIF_fp32_32_16_1_16(float *src0, float *src1, float *dst, void *stream); +void LaunchTCOLEXPANDEXPDIF_fp32_16_32_1_32(float *src0, float *src1, float *dst, void *stream); +void LaunchTCOLEXPANDEXPDIF_fp16_32_32_1_32(uint16_t *src0, uint16_t *src1, uint16_t *dst, void *stream); +void LaunchTCOLEXPANDEXPDIF_fp16_16_128_1_128(uint16_t *src0, uint16_t *src1, uint16_t *dst, void *stream); + +using LaunchFn = void (*)(void *, void *, void *, void *); + +struct TestCase { + const char *name; + LaunchFn launch; + size_t src0Rows; + size_t src0Cols; + size_t src1Rows; + size_t src1Cols; + size_t dstRows; + size_t dstCols; + size_t validRows; + size_t validCols; + size_t elemSize; +}; + +static const TestCase kCases[] = { +{"fp32_32_16_1_16", (LaunchFn)LaunchTCOLEXPANDEXPDIF_fp32_32_16_1_16, 32, 16, 1, 16, 32, 16, 32, 16, sizeof(float)}, +{"fp16_32_32_1_32", (LaunchFn)LaunchTCOLEXPANDEXPDIF_fp16_32_32_1_32, 32, 32, 1, 32, 32, 32, 32, 32, sizeof(uint16_t)}, +}; +static constexpr size_t kNumCases = sizeof(kCases) / sizeof(kCases[0]); + +static int RunCase(const TestCase &tc, int deviceId, aclrtStream stream) { + int rc = 0; + const size_t src0ElemCount = tc.src0Rows * tc.src0Cols; + const size_t src1ElemCount = tc.src1Rows * tc.src1Cols; + const size_t dstElemCount = tc.dstRows * tc.dstCols; + const size_t src0FileSize = src0ElemCount * tc.elemSize; + const size_t src1FileSize = src1ElemCount * tc.elemSize; + const size_t dstFileSize = dstElemCount * tc.elemSize; + + std::printf("[INFO] === case: %s (src0=%zux%zu, src1=%zux%zu -> dst=%zux%zu, 
valid=%zux%zu) ===\n", + tc.name, tc.src0Rows, tc.src0Cols, tc.src1Rows, tc.src1Cols, tc.dstRows, tc.dstCols, tc.validRows, tc.validCols); + + std::string caseDir = std::string("./") + tc.name; + + void *src0Host = nullptr, *src1Host = nullptr, *dstHost = nullptr; + void *src0Device = nullptr, *src1Device = nullptr, *dstDevice = nullptr; + + aclrtMallocHost((void **)(&src0Host), src0FileSize); + aclrtMallocHost((void **)(&src1Host), src1FileSize); + aclrtMallocHost((void **)(&dstHost), dstFileSize); + + aclrtMalloc((void **)&src0Device, src0FileSize, ACL_MEM_MALLOC_HUGE_FIRST); + aclrtMalloc((void **)&src1Device, src1FileSize, ACL_MEM_MALLOC_HUGE_FIRST); + aclrtMalloc((void **)&dstDevice, dstFileSize, ACL_MEM_MALLOC_HUGE_FIRST); + + size_t actualSrc0FileSize = src0FileSize; + size_t actualSrc1FileSize = src1FileSize; + + if (!ReadFile((caseDir + "/input0.bin").c_str(), actualSrc0FileSize, src0Host, src0FileSize)) { + std::fprintf(stderr, "[ERROR] failed to read %s/input0.bin\n", caseDir.c_str()); + rc = 1; + } + + if (rc == 0 && !ReadFile((caseDir + "/input1.bin").c_str(), actualSrc1FileSize, src1Host, src1FileSize)) { + std::fprintf(stderr, "[ERROR] failed to read %s/input1.bin\n", caseDir.c_str()); + rc = 1; + } + + if (rc == 0) { + aclrtMemcpy(src0Device, src0FileSize, src0Host, src0FileSize, ACL_MEMCPY_HOST_TO_DEVICE); + aclrtMemcpy(src1Device, src1FileSize, src1Host, src1FileSize, ACL_MEMCPY_HOST_TO_DEVICE); + + tc.launch(src0Device, src1Device, dstDevice, stream); + + aclrtSynchronizeStream(stream); + aclrtMemcpy(dstHost, dstFileSize, dstDevice, dstFileSize, ACL_MEMCPY_DEVICE_TO_HOST); + } + + if (rc == 0 && !WriteFile((caseDir + "/output.bin").c_str(), dstHost, dstFileSize)) { + std::fprintf(stderr, "[ERROR] failed to write %s/output.bin\n", caseDir.c_str()); + rc = 1; + } + + if (src0Device != nullptr) + aclrtFree(src0Device); + if (src1Device != nullptr) + aclrtFree(src1Device); + if (dstDevice != nullptr) + aclrtFree(dstDevice); + if (src0Host != nullptr) 
+ aclrtFreeHost(src0Host); + if (src1Host != nullptr) + aclrtFreeHost(src1Host); + if (dstHost != nullptr) + aclrtFreeHost(dstHost); + + if (rc == 0) + std::printf("[INFO] case %s done\n", tc.name); + return rc; +} + +int main(int argc, char *argv[]) { + const char *caseFilter = (argc > 1) ? argv[1] : nullptr; + + int rc = 0; + int deviceId = 0; + aclrtStream stream = nullptr; + + aclInit(nullptr); + if (const char *envDevice = std::getenv("ACL_DEVICE_ID")) { + deviceId = std::atoi(envDevice); + } + aclrtSetDevice(deviceId); + aclrtCreateStream(&stream); + + for (size_t i = 0; i < kNumCases; ++i) { + if (caseFilter != nullptr && std::strcmp(kCases[i].name, caseFilter) != 0) { + continue; + } + int ret = RunCase(kCases[i], deviceId, stream); + if (ret != 0) { + std::fprintf(stderr, "[ERROR] case %s failed\n", kCases[i].name); + rc = 1; + break; + } + } + + if (stream != nullptr) + aclrtDestroyStream(stream); + aclrtResetDevice(deviceId); + aclFinalize(); + + return rc; +} diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tcolexpandexpdif/tcolexpandexpdif.pto b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tcolexpandexpdif/tcolexpandexpdif.pto new file mode 100644 index 000000000..b4fe2f3c6 --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tcolexpandexpdif/tcolexpandexpdif.pto @@ -0,0 +1,139 @@ +// Copyright (c) 2026 Huawei Technologies Co., Ltd. +// This program is free software, you can redistribute it and/or modify it under the terms and conditions of +// CANN Open Software License Agreement Version 2.0 (the "License"). +// Please refer to the License for details. You may not use this file except in compliance with the License. +// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +// See LICENSE in the root of the software repository for the full text of the License. 
+ +// TileLang ST kernels for pto.tcolexpandexpdif: compute exp(src0) - exp(tiled_src1). +// Matches PTO-ISA testcase parameters. +// Key: tile_buf cols = full tensor width, v_col = valid portion + +module attributes {pto.target_arch = "a5", pto.kernel_kind = #pto.kernel_kind} { + // Case 1: fp32_32_16_1_16 (float32, src0=(32,16), src1=(1,16), dst=(32,16)) + func.func @TCOLEXPANDEXPDIF_fp32_32_16_1_16(%src0_ptr: !pto.ptr, %src1_ptr: !pto.ptr, %dst_ptr: !pto.ptr) attributes {pto.kernel} { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c16 = arith.constant 16 : index + %c32 = arith.constant 32 : index + %c512 = arith.constant 512 : index + + %src0_view = pto.make_tensor_view %src0_ptr, + shape = [%c1, %c1, %c1, %c32, %c16], + strides = [%c512, %c512, %c512, %c16, %c1] + : !pto.tensor_view<1x1x1x32x16xf32> + + %src1_view = pto.make_tensor_view %src1_ptr, + shape = [%c1, %c1, %c1, %c1, %c16], + strides = [%c16, %c16, %c16, %c16, %c1] + : !pto.tensor_view<1x1x1x1x16xf32> + + %dst_view = pto.make_tensor_view %dst_ptr, + shape = [%c1, %c1, %c1, %c32, %c16], + strides = [%c512, %c512, %c512, %c16, %c1] + : !pto.tensor_view<1x1x1x32x16xf32> + + %src0_part = pto.partition_view %src0_view, + offsets = [%c0, %c0, %c0, %c0, %c0], + sizes = [%c1, %c1, %c1, %c32, %c16] + : !pto.tensor_view<1x1x1x32x16xf32> -> !pto.partition_tensor_view<1x1x1x32x16xf32> + + %src1_part = pto.partition_view %src1_view, + offsets = [%c0, %c0, %c0, %c0, %c0], + sizes = [%c1, %c1, %c1, %c1, %c16] + : !pto.tensor_view<1x1x1x1x16xf32> -> !pto.partition_tensor_view<1x1x1x1x16xf32> + + %dst_part = pto.partition_view %dst_view, + offsets = [%c0, %c0, %c0, %c0, %c0], + sizes = [%c1, %c1, %c1, %c32, %c16] + : !pto.tensor_view<1x1x1x32x16xf32> -> !pto.partition_tensor_view<1x1x1x32x16xf32> + + %src0_tile = pto.alloc_tile + : !pto.tile_buf + + %src1_tile = pto.alloc_tile + : !pto.tile_buf + + %dst_tile = pto.alloc_tile + : !pto.tile_buf + + pto.tload ins(%src0_part : 
!pto.partition_tensor_view<1x1x1x32x16xf32>) + outs(%src0_tile : !pto.tile_buf) + + pto.tload ins(%src1_part : !pto.partition_tensor_view<1x1x1x1x16xf32>) + outs(%src1_tile : !pto.tile_buf) + + pto.tcolexpandexpdif ins(%src0_tile, %src1_tile : !pto.tile_buf, + !pto.tile_buf) + outs(%dst_tile : !pto.tile_buf) + + pto.tstore ins(%dst_tile : !pto.tile_buf) + outs(%dst_part : !pto.partition_tensor_view<1x1x1x32x16xf32>) + return + } + + // Case 2: fp32_16_32_1_32 (float32, src0=(16,32), src1=(1,32), dst=(16,32)) + + func.func @TCOLEXPANDEXPDIF_fp16_32_32_1_32(%src0_ptr: !pto.ptr, %src1_ptr: !pto.ptr, %dst_ptr: !pto.ptr) attributes {pto.kernel} { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c32 = arith.constant 32 : index + %c1024 = arith.constant 1024 : index + + %src0_view = pto.make_tensor_view %src0_ptr, + shape = [%c1, %c1, %c1, %c32, %c32], + strides = [%c1024, %c1024, %c1024, %c32, %c1] + : !pto.tensor_view<1x1x1x32x32xf16> + + %src1_view = pto.make_tensor_view %src1_ptr, + shape = [%c1, %c1, %c1, %c1, %c32], + strides = [%c32, %c32, %c32, %c32, %c1] + : !pto.tensor_view<1x1x1x1x32xf16> + + %dst_view = pto.make_tensor_view %dst_ptr, + shape = [%c1, %c1, %c1, %c32, %c32], + strides = [%c1024, %c1024, %c1024, %c32, %c1] + : !pto.tensor_view<1x1x1x32x32xf16> + + %src0_part = pto.partition_view %src0_view, + offsets = [%c0, %c0, %c0, %c0, %c0], + sizes = [%c1, %c1, %c1, %c32, %c32] + : !pto.tensor_view<1x1x1x32x32xf16> -> !pto.partition_tensor_view<1x1x1x32x32xf16> + + %src1_part = pto.partition_view %src1_view, + offsets = [%c0, %c0, %c0, %c0, %c0], + sizes = [%c1, %c1, %c1, %c1, %c32] + : !pto.tensor_view<1x1x1x1x32xf16> -> !pto.partition_tensor_view<1x1x1x1x32xf16> + + %dst_part = pto.partition_view %dst_view, + offsets = [%c0, %c0, %c0, %c0, %c0], + sizes = [%c1, %c1, %c1, %c32, %c32] + : !pto.tensor_view<1x1x1x32x32xf16> -> !pto.partition_tensor_view<1x1x1x32x32xf16> + + %src0_tile = pto.alloc_tile + : !pto.tile_buf + + %src1_tile = 
pto.alloc_tile + : !pto.tile_buf + + %dst_tile = pto.alloc_tile + : !pto.tile_buf + + pto.tload ins(%src0_part : !pto.partition_tensor_view<1x1x1x32x32xf16>) + outs(%src0_tile : !pto.tile_buf) + + pto.tload ins(%src1_part : !pto.partition_tensor_view<1x1x1x1x32xf16>) + outs(%src1_tile : !pto.tile_buf) + + pto.tcolexpandexpdif ins(%src0_tile, %src1_tile : !pto.tile_buf, + !pto.tile_buf) + outs(%dst_tile : !pto.tile_buf) + + pto.tstore ins(%dst_tile : !pto.tile_buf) + outs(%dst_part : !pto.partition_tensor_view<1x1x1x32x32xf16>) + return + } + + // Case 4: fp16_16_128_1_128 (float16, src0=(16,128), src1=(1,128), dst=(16,128)) has no kernel defined in this file. +} diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tcolexpandmax/CMakeLists.txt b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tcolexpandmax/CMakeLists.txt new file mode 100644 index 000000000..5611449fb --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tcolexpandmax/CMakeLists.txt @@ -0,0 +1,16 @@ +# Copyright (c) 2026 Huawei Technologies Co., Ltd. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. +# Copyright (c) 2026 Huawei Technologies Co., Ltd. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. 
+# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. + +pto_tilelang_vec_st(tcolexpandmax) diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tcolexpandmax/cases.py b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tcolexpandmax/cases.py new file mode 100644 index 000000000..e4e63bb8d --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tcolexpandmax/cases.py @@ -0,0 +1,98 @@ +#!/usr/bin/python3 +# Copyright (c) 2026 Huawei Technologies Co., Ltd. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. + +# coding=utf-8 + +"""Single source of truth for tcolexpandmax ST test cases. +Matches the PTO-ISA testcase definitions under tests/npu/a5/src/st/testcase/tcolexpandmax/ in the pto-isa source tree. + +TCOLEXPANDMAX: compute elementwise maximum of src0 and tiled src1. 
+ - src0_shape: (src0_row, cols) - first input tile + - src1_shape: (1, cols) - second input tile (single row, broadcasted) + - dst_shape: (dst_row, cols) - output tile + - shape: (dst_row, cols) - alias of dst_shape, for compare.py compatibility + - valid_shape: (valid_row, valid_col) - effective computation region + - reps: number of times to tile src1 (equals src0_row) + +Golden: np.maximum(src0, np.tile(src1, (reps, 1))[:, :dst_col]) + +Case naming: {dtype}_{src0_row}_{src0_col}_{src1_row}_{dst_col} +""" + +import numpy as np + +CASES = [ + { + "name": "fp32_32_32_1_32", + "dtype": np.float32, + "src0_shape": (32, 32), + "src1_shape": (1, 32), + "shape": (32, 32), + "valid_shape": (32, 32), + "reps": 32, + "eps": 1e-6, + }, + { + "name": "fp32_16_128_1_128", + "dtype": np.float32, + "src0_shape": (16, 128), + "src1_shape": (1, 128), + "shape": (16, 128), + "valid_shape": (16, 128), + "reps": 16, + "eps": 1e-6, + }, + { + "name": "fp16_4_256_1_256", + "dtype": np.float16, + "src0_shape": (4, 256), + "src1_shape": (1, 256), + "shape": (4, 256), + "valid_shape": (4, 256), + "reps": 4, + "eps": 1e-3, + }, + { + "name": "fp16_10_64_1_64", + "dtype": np.float16, + "src0_shape": (10, 64), + "src1_shape": (1, 64), + "shape": (10, 64), + "valid_shape": (10, 64), + "reps": 10, + "eps": 1e-3, + }, + { + "name": "int32_16_32_1_32", + "dtype": np.int32, + "src0_shape": (16, 32), + "src1_shape": (1, 32), + "shape": (16, 32), + "valid_shape": (16, 32), + "reps": 16, + "eps": 0, + }, + { + "name": "int16_16_64_1_64", + "dtype": np.int16, + "src0_shape": (16, 64), + "src1_shape": (1, 64), + "shape": (16, 64), + "valid_shape": (16, 64), + "reps": 16, + "eps": 0, + }, +] + +_SMOKE_CASE_NAMES = ['fp16_10_64_1_64', 'int32_16_32_1_32'] +_SMOKE_CASE_NAME_SET = set(_SMOKE_CASE_NAMES) +_missing = [name for name in _SMOKE_CASE_NAMES if name not in {case["name"] for case in CASES}] +if _missing: + raise RuntimeError("unknown smoke case(s): " + ", ".join(_missing)) +CASES = [case for case 
in CASES if case["name"] in _SMOKE_CASE_NAME_SET] diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tcolexpandmax/compare.py b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tcolexpandmax/compare.py new file mode 100644 index 000000000..b8ddb1131 --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tcolexpandmax/compare.py @@ -0,0 +1,56 @@ +# Copyright (c) 2026 Huawei Technologies Co., Ltd. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. +#!/usr/bin/python3 +# Copyright (c) 2026 Huawei Technologies Co., Ltd. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. 
+ +# coding=utf-8 + +import os +import sys +import numpy as np + +from cases import CASES +from st_common import result_cmp, style_fail, style_pass, validate_cases + + +def main(): + validate_cases(CASES) + case_filter = sys.argv[1] if len(sys.argv) > 1 else None + + all_passed = True + for case in CASES: + if case_filter is not None and case["name"] != case_filter: + continue + + case_dir = case["name"] + shape = case["shape"] + vr, vc = case["valid_shape"] + + golden = np.fromfile(os.path.join(case_dir, "golden.bin"), dtype=case["dtype"]).reshape(shape) + output = np.fromfile(os.path.join(case_dir, "output.bin"), dtype=case["dtype"]).reshape(shape) + + ok = result_cmp(golden[:vr, :vc], output[:vr, :vc], case["eps"]) + if ok: + print(style_pass(f"[INFO] {case['name']}: compare passed")) + else: + print(style_fail(f"[ERROR] {case['name']}: compare failed")) + all_passed = False + + if not all_passed: + sys.exit(2) + print(style_pass("[INFO] all cases passed")) + + +if __name__ == "__main__": + main() diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tcolexpandmax/gen_data.py b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tcolexpandmax/gen_data.py new file mode 100644 index 000000000..7b5dbff39 --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tcolexpandmax/gen_data.py @@ -0,0 +1,41 @@ +# Copyright (c) 2026 Huawei Technologies Co., Ltd. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. 
+#!/usr/bin/python3 +# Copyright (c) 2026 Huawei Technologies Co., Ltd. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file in the compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. + +# coding=utf-8 + +import numpy as np +from cases import CASES +from st_common import validate_cases, setup_case_rng, save_case_data + +validate_cases(CASES) + +for case in CASES: + setup_case_rng(case) + + dtype = case["dtype"] + src0_shape = case["src0_shape"] + dst_shape = case["shape"] + src1_shape = case["src1_shape"] + valid_shape = case["valid_shape"] + reps = case["reps"] + + src0 = np.random.randint(1, 10, size=src0_shape).astype(dtype) + src1 = np.random.randint(1, 10, size=src1_shape).astype(dtype) + + golden = np.maximum(src0, np.tile(src1, (reps, 1))[:, :dst_shape[1]]) + + save_case_data(case["name"], {"input0": src0, "input1": src1, "golden": golden}) + print(f"[INFO] gen_data: {case['name']} src0={src0_shape} src1={src1_shape} dst={dst_shape} valid={valid_shape} dtype={dtype.__name__}") diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tcolexpandmax/launch.cpp b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tcolexpandmax/launch.cpp new file mode 100644 index 000000000..c1901917d --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tcolexpandmax/launch.cpp @@ -0,0 +1,27 @@ +// Copyright (c) 2026 Huawei Technologies Co., Ltd. 
+// This program is free software, you can redistribute it and/or modify it under the terms and conditions of +// CANN Open Software License Agreement Version 2.0 (the "License"). +// Please refer to the License for details. You may not use this file except in compliance with the License. +// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +// See LICENSE in the root of the software repository for the full text of the License. + +#include + +#ifndef AICORE +#define AICORE [aicore] +#endif + +// Kernels launched from this file: fp16_10_64_1_64 and int32_16_32_1_32 + +extern "C" __global__ AICORE void TCOLEXPANDMAX_fp16_10_64_1_64(__gm__ uint16_t *src0, __gm__ uint16_t *src1, __gm__ uint16_t *dst); +extern "C" __global__ AICORE void TCOLEXPANDMAX_int32_16_32_1_32(__gm__ int32_t *src0, __gm__ int32_t *src1, __gm__ int32_t *dst); + +void LaunchTCOLEXPANDMAX_fp16_10_64_1_64(uint16_t *src0, uint16_t *src1, uint16_t *dst, void *stream) { + TCOLEXPANDMAX_fp16_10_64_1_64<<<1, nullptr, stream>>>((__gm__ uint16_t *)src0, (__gm__ uint16_t *)src1, (__gm__ uint16_t *)dst); +} + + +void LaunchTCOLEXPANDMAX_int32_16_32_1_32(int32_t *src0, int32_t *src1, int32_t *dst, void *stream) { + TCOLEXPANDMAX_int32_16_32_1_32<<<1, nullptr, stream>>>((__gm__ int32_t *)src0, (__gm__ int32_t *)src1, (__gm__ int32_t *)dst); +} diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tcolexpandmax/main.cpp b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tcolexpandmax/main.cpp new file mode 100644 index 000000000..003901927 --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tcolexpandmax/main.cpp @@ -0,0 +1,161 @@ +// Copyright (c) 2026 Huawei Technologies Co., Ltd. +// This program is free software, you can redistribute it and/or modify it under the terms and conditions of +// CANN Open Software License Agreement Version 2.0 (the "License"). 
+// Please refer to the License for details. You may not use this file except in compliance with the License. +// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +// See LICENSE in the root of the software repository for the full text of the License. +// Copyright (c) 2026 Huawei Technologies Co., Ltd. +// This program is free software, you can redistribute it and/or modify it under the terms and conditions of +// CANN Open Software License Agreement Version 2.0 (the "License"). +// Please refer to the License for details. You may not use this file except in compliance with the License. +// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +// See LICENSE in the root of the software repository for the full text of the License. 
+ +// Host driver for TileLang tcolexpandmax ST +// Test cases match PTO-ISA: /home/zhoushaofan/code/pto-isa/tests/npu/a5/src/st/testcase/tcolexpandmax/ +// TCOLEXPANDMAX: elementwise maximum of src0 and tiled src1 + +#include "acl/acl.h" +#include "test_common.h" +#include +#include +#include +#include +#include +#include + +using namespace PtoTestCommon; + +// Kernel launch wrappers (defined in launch.cpp) +void LaunchTCOLEXPANDMAX_fp32_32_32_1_32(float *src0, float *src1, float *dst, void *stream); +void LaunchTCOLEXPANDMAX_fp16_10_64_1_64(uint16_t *src0, uint16_t *src1, uint16_t *dst, void *stream); +void LaunchTCOLEXPANDMAX_int32_16_32_1_32(int32_t *src0, int32_t *src1, int32_t *dst, void *stream); +void LaunchTCOLEXPANDMAX_int16_16_64_1_64(int16_t *src0, int16_t *src1, int16_t *dst, void *stream); + +using LaunchFn = void (*)(void *, void *, void *, void *); + +struct TestCase { + const char *name; + LaunchFn launch; + size_t src0Rows; + size_t src0Cols; + size_t src1Rows; + size_t src1Cols; + size_t dstRows; + size_t dstCols; + size_t validRows; + size_t validCols; + size_t elemSize; +}; + +static const TestCase kCases[] = { +{"fp16_10_64_1_64", (LaunchFn)LaunchTCOLEXPANDMAX_fp16_10_64_1_64, 10, 64, 1, 64, 10, 64, 10, 64, sizeof(uint16_t)}, +{"int32_16_32_1_32", (LaunchFn)LaunchTCOLEXPANDMAX_int32_16_32_1_32, 16, 32, 1, 32, 16, 32, 16, 32, sizeof(int32_t)}, +}; +static constexpr size_t kNumCases = sizeof(kCases) / sizeof(kCases[0]); + +static int RunCase(const TestCase &tc, int deviceId, aclrtStream stream) { + int rc = 0; + const size_t src0ElemCount = tc.src0Rows * tc.src0Cols; + const size_t src1ElemCount = tc.src1Rows * tc.src1Cols; + const size_t dstElemCount = tc.dstRows * tc.dstCols; + const size_t src0FileSize = src0ElemCount * tc.elemSize; + const size_t src1FileSize = src1ElemCount * tc.elemSize; + const size_t dstFileSize = dstElemCount * tc.elemSize; + + std::printf("[INFO] === case: %s (src0=%zux%zu, src1=%zux%zu -> dst=%zux%zu, valid=%zux%zu) 
===\n", + tc.name, tc.src0Rows, tc.src0Cols, tc.src1Rows, tc.src1Cols, tc.dstRows, tc.dstCols, tc.validRows, tc.validCols); + + std::string caseDir = std::string("./") + tc.name; + size_t actualSrc0FileSize = src0FileSize; + size_t actualSrc1FileSize = src1FileSize; + + void *src0Host = nullptr, *src1Host = nullptr, *dstHost = nullptr; + void *src0Device = nullptr, *src1Device = nullptr, *dstDevice = nullptr; + + aclrtMallocHost((void **)(&src0Host), src0FileSize); + aclrtMallocHost((void **)(&src1Host), src1FileSize); + aclrtMallocHost((void **)(&dstHost), dstFileSize); + + aclrtMalloc((void **)&src0Device, src0FileSize, ACL_MEM_MALLOC_HUGE_FIRST); + aclrtMalloc((void **)&src1Device, src1FileSize, ACL_MEM_MALLOC_HUGE_FIRST); + aclrtMalloc((void **)&dstDevice, dstFileSize, ACL_MEM_MALLOC_HUGE_FIRST); + + if (!ReadFile((caseDir + "/input0.bin").c_str(), actualSrc0FileSize, src0Host, src0FileSize)) { + std::fprintf(stderr, "[ERROR] failed to read %s/input0.bin\n", caseDir.c_str()); + rc = 1; + } + + if (rc == 0 && !ReadFile((caseDir + "/input1.bin").c_str(), actualSrc1FileSize, src1Host, src1FileSize)) { + std::fprintf(stderr, "[ERROR] failed to read %s/input1.bin\n", caseDir.c_str()); + rc = 1; + } + + if (rc == 0) { + aclrtMemcpy(src0Device, src0FileSize, src0Host, src0FileSize, ACL_MEMCPY_HOST_TO_DEVICE); + aclrtMemcpy(src1Device, src1FileSize, src1Host, src1FileSize, ACL_MEMCPY_HOST_TO_DEVICE); + + tc.launch(src0Device, src1Device, dstDevice, stream); + + aclrtSynchronizeStream(stream); + aclrtMemcpy(dstHost, dstFileSize, dstDevice, dstFileSize, ACL_MEMCPY_DEVICE_TO_HOST); + } + + if (rc == 0 && !WriteFile((caseDir + "/output.bin").c_str(), dstHost, dstFileSize)) { + std::fprintf(stderr, "[ERROR] failed to write %s/output.bin\n", caseDir.c_str()); + rc = 1; + } + + if (src0Device != nullptr) + aclrtFree(src0Device); + if (src1Device != nullptr) + aclrtFree(src1Device); + if (dstDevice != nullptr) + aclrtFree(dstDevice); + if (src0Host != nullptr) + 
aclrtFreeHost(src0Host); + if (src1Host != nullptr) + aclrtFreeHost(src1Host); + if (dstHost != nullptr) + aclrtFreeHost(dstHost); + + if (rc == 0) + std::printf("[INFO] case %s done\n", tc.name); + return rc; +} + +int main(int argc, char *argv[]) { + const char *caseFilter = (argc > 1) ? argv[1] : nullptr; + + int rc = 0; + int deviceId = 0; + aclrtStream stream = nullptr; + + aclInit(nullptr); + if (const char *envDevice = std::getenv("ACL_DEVICE_ID")) { + deviceId = std::atoi(envDevice); + } + aclrtSetDevice(deviceId); + aclrtCreateStream(&stream); + + for (size_t i = 0; i < kNumCases; ++i) { + if (caseFilter != nullptr && std::strcmp(kCases[i].name, caseFilter) != 0) { + continue; + } + int ret = RunCase(kCases[i], deviceId, stream); + if (ret != 0) { + std::fprintf(stderr, "[ERROR] case %s failed\n", kCases[i].name); + rc = 1; + break; + } + } + + if (stream != nullptr) + aclrtDestroyStream(stream); + aclrtResetDevice(deviceId); + aclFinalize(); + + return rc; +} diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tcolexpandmax/tcolexpandmax.pto b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tcolexpandmax/tcolexpandmax.pto new file mode 100644 index 000000000..49b6fcf59 --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tcolexpandmax/tcolexpandmax.pto @@ -0,0 +1,140 @@ +// Copyright (c) 2026 Huawei Technologies Co., Ltd. +// This program is free software, you can redistribute it and/or modify it under the terms and conditions of +// CANN Open Software License Agreement Version 2.0 (the "License"). +// Please refer to the License for details. You may not use this file in the compliance with the License. +// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +// See LICENSE in the root of the software repository for the full text of the License. 
+ +// TileLang ST kernels for pto.tcolexpandmax: elementwise maximum of src0 and tiled src1. +// Matches PTO-ISA testcase parameters. +// Key: tile_buf cols = full tensor width, v_col = valid portion + +module attributes {pto.target_arch = "a5", pto.kernel_kind = #pto.kernel_kind} { + // Case 4: fp16_10_64_1_64 (float16, src0=(10,64), src1=(1,64), dst=(10,64)) + func.func @TCOLEXPANDMAX_fp16_10_64_1_64(%src0_ptr: !pto.ptr, %src1_ptr: !pto.ptr, %dst_ptr: !pto.ptr) attributes {pto.kernel} { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c10 = arith.constant 10 : index + %c64 = arith.constant 64 : index + %c640 = arith.constant 640 : index + + %src0_view = pto.make_tensor_view %src0_ptr, + shape = [%c1, %c1, %c1, %c10, %c64], + strides = [%c640, %c640, %c640, %c64, %c1] + : !pto.tensor_view<1x1x1x10x64xf16> + + %src1_view = pto.make_tensor_view %src1_ptr, + shape = [%c1, %c1, %c1, %c1, %c64], + strides = [%c64, %c64, %c64, %c64, %c1] + : !pto.tensor_view<1x1x1x1x64xf16> + + %dst_view = pto.make_tensor_view %dst_ptr, + shape = [%c1, %c1, %c1, %c10, %c64], + strides = [%c640, %c640, %c640, %c64, %c1] + : !pto.tensor_view<1x1x1x10x64xf16> + + %src0_part = pto.partition_view %src0_view, + offsets = [%c0, %c0, %c0, %c0, %c0], + sizes = [%c1, %c1, %c1, %c10, %c64] + : !pto.tensor_view<1x1x1x10x64xf16> -> !pto.partition_tensor_view<1x1x1x10x64xf16> + + %src1_part = pto.partition_view %src1_view, + offsets = [%c0, %c0, %c0, %c0, %c0], + sizes = [%c1, %c1, %c1, %c1, %c64] + : !pto.tensor_view<1x1x1x1x64xf16> -> !pto.partition_tensor_view<1x1x1x1x64xf16> + + %dst_part = pto.partition_view %dst_view, + offsets = [%c0, %c0, %c0, %c0, %c0], + sizes = [%c1, %c1, %c1, %c10, %c64] + : !pto.tensor_view<1x1x1x10x64xf16> -> !pto.partition_tensor_view<1x1x1x10x64xf16> + + %src0 = pto.alloc_tile + : !pto.tile_buf + + %src1 = pto.alloc_tile + : !pto.tile_buf + + %dst = pto.alloc_tile + : !pto.tile_buf + + pto.tload ins(%src0_part : 
!pto.partition_tensor_view<1x1x1x10x64xf16>) + outs(%src0 : !pto.tile_buf) + + pto.tload ins(%src1_part : !pto.partition_tensor_view<1x1x1x1x64xf16>) + outs(%src1 : !pto.tile_buf) + + pto.tcolexpandmax ins(%src0, %src1 : !pto.tile_buf, + !pto.tile_buf) + outs(%dst : !pto.tile_buf) + +pto.tstore ins(%dst : !pto.tile_buf) + outs(%dst_part : !pto.partition_tensor_view<1x1x1x10x64xf16>) + return + } + + // Case 5: int32_16_32_1_32 (int32, src0=(16,32), src1=(1,32), dst=(16,32)) + + func.func @TCOLEXPANDMAX_int32_16_32_1_32(%src0_ptr: !pto.ptr, %src1_ptr: !pto.ptr, %dst_ptr: !pto.ptr) attributes {pto.kernel} { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c16 = arith.constant 16 : index + %c32 = arith.constant 32 : index + %c512 = arith.constant 512 : index + + %src0_view = pto.make_tensor_view %src0_ptr, + shape = [%c1, %c1, %c1, %c16, %c32], + strides = [%c512, %c512, %c512, %c32, %c1] + : !pto.tensor_view<1x1x1x16x32xi32> + + %src1_view = pto.make_tensor_view %src1_ptr, + shape = [%c1, %c1, %c1, %c1, %c32], + strides = [%c32, %c32, %c32, %c32, %c1] + : !pto.tensor_view<1x1x1x1x32xi32> + + %dst_view = pto.make_tensor_view %dst_ptr, + shape = [%c1, %c1, %c1, %c16, %c32], + strides = [%c512, %c512, %c512, %c32, %c1] + : !pto.tensor_view<1x1x1x16x32xi32> + + %src0_part = pto.partition_view %src0_view, + offsets = [%c0, %c0, %c0, %c0, %c0], + sizes = [%c1, %c1, %c1, %c16, %c32] + : !pto.tensor_view<1x1x1x16x32xi32> -> !pto.partition_tensor_view<1x1x1x16x32xi32> + + %src1_part = pto.partition_view %src1_view, + offsets = [%c0, %c0, %c0, %c0, %c0], + sizes = [%c1, %c1, %c1, %c1, %c32] + : !pto.tensor_view<1x1x1x1x32xi32> -> !pto.partition_tensor_view<1x1x1x1x32xi32> + + %dst_part = pto.partition_view %dst_view, + offsets = [%c0, %c0, %c0, %c0, %c0], + sizes = [%c1, %c1, %c1, %c16, %c32] + : !pto.tensor_view<1x1x1x16x32xi32> -> !pto.partition_tensor_view<1x1x1x16x32xi32> + + %src0 = pto.alloc_tile + : !pto.tile_buf + + %src1 = pto.alloc_tile + : 
!pto.tile_buf + + %dst = pto.alloc_tile + : !pto.tile_buf + + pto.tload ins(%src0_part : !pto.partition_tensor_view<1x1x1x16x32xi32>) + outs(%src0 : !pto.tile_buf) + + pto.tload ins(%src1_part : !pto.partition_tensor_view<1x1x1x1x32xi32>) + outs(%src1 : !pto.tile_buf) + + pto.tcolexpandmax ins(%src0, %src1 : !pto.tile_buf, + !pto.tile_buf) + outs(%dst : !pto.tile_buf) + + pto.tstore ins(%dst : !pto.tile_buf) + outs(%dst_part : !pto.partition_tensor_view<1x1x1x16x32xi32>) + return + } + + // Case 6: int16_16_64_1_64 (int16, src0=(16,64), src1=(1,64), dst=(16,64)) +} diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tcolexpandmin/CMakeLists.txt b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tcolexpandmin/CMakeLists.txt new file mode 100644 index 000000000..c9daf2288 --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tcolexpandmin/CMakeLists.txt @@ -0,0 +1,16 @@ +# Copyright (c) 2026 Huawei Technologies Co., Ltd. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. +# Copyright (c) 2026 Huawei Technologies Co., Ltd. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file in the compliance with the License. 
+# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. + +pto_tilelang_vec_st(tcolexpandmin) diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tcolexpandmin/cases.py b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tcolexpandmin/cases.py new file mode 100644 index 000000000..230e4ba2f --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tcolexpandmin/cases.py @@ -0,0 +1,104 @@ +# Copyright (c) 2026 Huawei Technologies Co., Ltd. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. +#!/usr/bin/python3 +# Copyright (c) 2026 Huawei Technologies Co., Ltd. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file in the compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. 
+ +# coding=utf-8 + +"""Single source of truth for tcolexpandmin ST test cases. + +TCOLEXPANDMIN: compute elementwise minimum of src0 and tiled src1. + - src0_shape: (src0_row, cols) - first input tile + - src1_shape: (1, cols) - second input tile (single row, broadcasted) + - dst_shape: (dst_row, cols) - output tile + - shape: (dst_row, cols) - alias of dst_shape, for compare.py compatibility + - valid_shape: (valid_row, valid_col) - effective computation region + - reps: number of times to tile src1 (equals src0_row) + +Golden: np.minimum(src0, np.tile(src1, (reps, 1))[:, :dst_col]) + +Case naming: {dtype}_{src0_row}_{src0_col}_{src1_row}_{dst_col} +""" + +import numpy as np + +CASES = [ + { + "name": "fp32_32_32_1_32", + "dtype": np.float32, + "src0_shape": (32, 32), + "src1_shape": (1, 32), + "shape": (32, 32), + "valid_shape": (32, 32), + "reps": 32, + "eps": 1e-6, + }, + { + "name": "fp32_16_128_1_128", + "dtype": np.float32, + "src0_shape": (16, 128), + "src1_shape": (1, 128), + "shape": (16, 128), + "valid_shape": (16, 128), + "reps": 16, + "eps": 1e-6, + }, + { + "name": "fp16_4_256_1_256", + "dtype": np.float16, + "src0_shape": (4, 256), + "src1_shape": (1, 256), + "shape": (4, 256), + "valid_shape": (4, 256), + "reps": 4, + "eps": 1e-3, + }, + { + "name": "fp16_10_64_1_64", + "dtype": np.float16, + "src0_shape": (10, 64), + "src1_shape": (1, 64), + "shape": (10, 64), + "valid_shape": (10, 64), + "reps": 10, + "eps": 1e-3, + }, + { + "name": "int32_16_32_1_32", + "dtype": np.int32, + "src0_shape": (16, 32), + "src1_shape": (1, 32), + "shape": (16, 32), + "valid_shape": (16, 32), + "reps": 16, + "eps": 0, + }, + { + "name": "int16_16_64_1_64", + "dtype": np.int16, + "src0_shape": (16, 64), + "src1_shape": (1, 64), + "shape": (16, 64), + "valid_shape": (16, 64), + "reps": 16, + "eps": 0, + }, +] + +_SMOKE_CASE_NAMES = ['fp16_10_64_1_64', 'int32_16_32_1_32'] +_SMOKE_CASE_NAME_SET = set(_SMOKE_CASE_NAMES) +_missing = [name for name in _SMOKE_CASE_NAMES if 
name not in {case["name"] for case in CASES}] +if _missing: + raise RuntimeError("unknown smoke case(s): " + ", ".join(_missing)) +CASES = [case for case in CASES if case["name"] in _SMOKE_CASE_NAME_SET] diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tcolexpandmin/compare.py b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tcolexpandmin/compare.py new file mode 100644 index 000000000..b8ddb1131 --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tcolexpandmin/compare.py @@ -0,0 +1,56 @@ +# Copyright (c) 2026 Huawei Technologies Co., Ltd. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. +#!/usr/bin/python3 +# Copyright (c) 2026 Huawei Technologies Co., Ltd. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You can not use this file in the compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. 
+ +# coding=utf-8 + +import os +import sys +import numpy as np + +from cases import CASES +from st_common import result_cmp, style_fail, style_pass, validate_cases + + +def main(): + validate_cases(CASES) + case_filter = sys.argv[1] if len(sys.argv) > 1 else None + + all_passed = True + for case in CASES: + if case_filter is not None and case["name"] != case_filter: + continue + + case_dir = case["name"] + shape = case["shape"] + vr, vc = case["valid_shape"] + + golden = np.fromfile(os.path.join(case_dir, "golden.bin"), dtype=case["dtype"]).reshape(shape) + output = np.fromfile(os.path.join(case_dir, "output.bin"), dtype=case["dtype"]).reshape(shape) + + ok = result_cmp(golden[:vr, :vc], output[:vr, :vc], case["eps"]) + if ok: + print(style_pass(f"[INFO] {case['name']}: compare passed")) + else: + print(style_fail(f"[ERROR] {case['name']}: compare failed")) + all_passed = False + + if not all_passed: + sys.exit(2) + print(style_pass("[INFO] all cases passed")) + + +if __name__ == "__main__": + main() diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tcolexpandmin/gen_data.py b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tcolexpandmin/gen_data.py new file mode 100644 index 000000000..a50ad17a7 --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tcolexpandmin/gen_data.py @@ -0,0 +1,42 @@ +# Copyright (c) 2026 Huawei Technologies Co., Ltd. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. 
+#!/usr/bin/python3 +# Copyright (c) 2026 Huawei Technologies Co., Ltd. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. + +# coding=utf-8 + +import numpy as np +from cases import CASES +from st_common import validate_cases, setup_case_rng, save_case_data + +validate_cases(CASES) + +for case in CASES: + setup_case_rng(case) + + dtype = case["dtype"] + src0_shape = case["src0_shape"] + dst_shape = case["shape"] + src1_shape = case["src1_shape"] + valid_shape = case["valid_shape"] + reps = case["reps"]  # NOTE(review): requires a "reps" entry in every case dict; confirm cases.py for this testcase defines it + + src0 = np.random.randint(1, 10, size=src0_shape).astype(dtype) + src1 = np.random.randint(1, 10, size=src1_shape).astype(dtype) + + golden = np.minimum(src0, np.tile(src1, (reps, 1))[:, :dst_shape[1]]) + golden = golden.astype(dtype) + + save_case_data(case["name"], {"input0": src0, "input1": src1, "golden": golden}) + print(f"[INFO] gen_data: {case['name']} src0={src0_shape} src1={src1_shape} dst={dst_shape} valid={valid_shape} dtype={dtype.__name__}") diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tcolexpandmin/launch.cpp b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tcolexpandmin/launch.cpp new file mode 100644 index 000000000..52f01325c --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tcolexpandmin/launch.cpp @@ -0,0 +1,27 @@ +// Copyright (c) 2026 Huawei Technologies Co., Ltd.
+// This program is free software, you can redistribute it and/or modify it under the terms and conditions of +// CANN Open Software License Agreement Version 2.0 (the "License"). +// Please refer to the License for details. You may not use this file except in compliance with the License. +// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +// See LICENSE in the root of the software repository for the full text of the License. + +#include + +#ifndef AICORE +#define AICORE [aicore] +#endif + +// Kernel entry points and host launch wrappers for the fp16_10_64_1_64 and int32_16_32_1_32 cases. + +extern "C" __global__ AICORE void TCOLEXPANDMIN_fp16_10_64_1_64(__gm__ uint16_t *src0, __gm__ uint16_t *src1, __gm__ uint16_t *dst); +extern "C" __global__ AICORE void TCOLEXPANDMIN_int32_16_32_1_32(__gm__ int32_t *src0, __gm__ int32_t *src1, __gm__ int32_t *dst); + +void LaunchTCOLEXPANDMIN_fp16_10_64_1_64(uint16_t *src0, uint16_t *src1, uint16_t *dst, void *stream) { + TCOLEXPANDMIN_fp16_10_64_1_64<<<1, nullptr, stream>>>((__gm__ uint16_t *)src0, (__gm__ uint16_t *)src1, (__gm__ uint16_t *)dst); +} + + +void LaunchTCOLEXPANDMIN_int32_16_32_1_32(int32_t *src0, int32_t *src1, int32_t *dst, void *stream) { + TCOLEXPANDMIN_int32_16_32_1_32<<<1, nullptr, stream>>>((__gm__ int32_t *)src0, (__gm__ int32_t *)src1, (__gm__ int32_t *)dst); +} diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tcolexpandmin/main.cpp b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tcolexpandmin/main.cpp new file mode 100644 index 000000000..70692510e --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tcolexpandmin/main.cpp @@ -0,0 +1,161 @@ +// Copyright (c) 2026 Huawei Technologies Co., Ltd. +// This program is free software, you can redistribute it and/or modify it under the terms and conditions of +// CANN Open Software License Agreement Version 2.0 (the "License").
+// Please refer to the License for details. You may not use this file except in compliance with the License. +// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +// See LICENSE in the root of the software repository for the full text of the License. +// Copyright (c) 2026 Huawei Technologies Co., Ltd. +// This program is free software, you can redistribute it and/or modify it under the terms and conditions of +// CANN Open Software License Agreement Version 2.0 (the "License"). +// Please refer to the License for details. You may not use this file except in compliance with the License. +// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +// See LICENSE in the root of the software repository for the full text of the License.
+ +// Host driver for TileLang tcolexpandmin ST +// Test cases match PTO-ISA +// TCOLEXPANDMIN: compute elementwise minimum of src0 and tiled src1 + +#include "acl/acl.h" +#include "test_common.h" +#include +#include +#include +#include +#include +#include + +using namespace PtoTestCommon; + +// Kernel launch wrappers (launch.cpp defines only the fp16_10_64_1_64 and int32_16_32_1_32 wrappers; the fp32/int16 declarations are unused here) +void LaunchTCOLEXPANDMIN_fp32_32_32_1_32(float *src0, float *src1, float *dst, void *stream); +void LaunchTCOLEXPANDMIN_fp16_10_64_1_64(uint16_t *src0, uint16_t *src1, uint16_t *dst, void *stream); +void LaunchTCOLEXPANDMIN_int32_16_32_1_32(int32_t *src0, int32_t *src1, int32_t *dst, void *stream); +void LaunchTCOLEXPANDMIN_int16_16_64_1_64(int16_t *src0, int16_t *src1, int16_t *dst, void *stream); + +using LaunchFn = void (*)(void *, void *, void *, void *); + +struct TestCase { + const char *name; + LaunchFn launch; + size_t src0Rows; + size_t src0Cols; + size_t src1Rows; + size_t src1Cols; + size_t dstRows; + size_t dstCols; + size_t validRows; + size_t validCols; + size_t elemSize; +}; + +static const TestCase kCases[] = { +{"fp16_10_64_1_64", (LaunchFn)LaunchTCOLEXPANDMIN_fp16_10_64_1_64, 10, 64, 1, 64, 10, 64, 10, 64, sizeof(uint16_t)}, +{"int32_16_32_1_32", (LaunchFn)LaunchTCOLEXPANDMIN_int32_16_32_1_32, 16, 32, 1, 32, 16, 32, 16, 32, sizeof(int32_t)}, +}; +static constexpr size_t kNumCases = sizeof(kCases) / sizeof(kCases[0]); + +static int RunCase(const TestCase &tc, int deviceId, aclrtStream stream) { + int rc = 0; + const size_t src0ElemCount = tc.src0Rows * tc.src0Cols; + const size_t src1ElemCount = tc.src1Rows * tc.src1Cols; + const size_t dstElemCount = tc.dstRows * tc.dstCols; + const size_t src0FileSize = src0ElemCount * tc.elemSize; + const size_t src1FileSize = src1ElemCount * tc.elemSize; + const size_t dstFileSize = dstElemCount * tc.elemSize; + + std::printf("[INFO] === case: %s (src0=%zux%zu, src1=%zux%zu -> dst=%zux%zu, valid=%zux%zu) ===\n", + tc.name, tc.src0Rows, tc.src0Cols, tc.src1Rows,
tc.src1Cols, + tc.dstRows, tc.dstCols, tc.validRows, tc.validCols); + + std::string caseDir = std::string("./") + tc.name; + size_t actualSrc0FileSize = src0FileSize; + size_t actualSrc1FileSize = src1FileSize; + + void *src0Host = nullptr, *src1Host = nullptr, *dstHost = nullptr; + void *src0Device = nullptr, *src1Device = nullptr, *dstDevice = nullptr; + + aclrtMallocHost((void **)(&src0Host), src0FileSize); + aclrtMallocHost((void **)(&src1Host), src1FileSize); + aclrtMallocHost((void **)(&dstHost), dstFileSize); + + aclrtMalloc((void **)&src0Device, src0FileSize, ACL_MEM_MALLOC_HUGE_FIRST); + aclrtMalloc((void **)&src1Device, src1FileSize, ACL_MEM_MALLOC_HUGE_FIRST); + aclrtMalloc((void **)&dstDevice, dstFileSize, ACL_MEM_MALLOC_HUGE_FIRST); + + if (!ReadFile((caseDir + "/input0.bin").c_str(), actualSrc0FileSize, src0Host, src0FileSize)) { + std::fprintf(stderr, "[ERROR] failed to read %s/input0.bin\n", caseDir.c_str()); + rc = 1; + } + if (rc == 0 && !ReadFile((caseDir + "/input1.bin").c_str(), actualSrc1FileSize, src1Host, src1FileSize)) { + std::fprintf(stderr, "[ERROR] failed to read %s/input1.bin\n", caseDir.c_str()); + rc = 1; + } + + if (rc == 0) { + aclrtMemcpy(src0Device, src0FileSize, src0Host, src0FileSize, ACL_MEMCPY_HOST_TO_DEVICE); + aclrtMemcpy(src1Device, src1FileSize, src1Host, src1FileSize, ACL_MEMCPY_HOST_TO_DEVICE); + + tc.launch(src0Device, src1Device, dstDevice, stream); + + aclrtSynchronizeStream(stream); + aclrtMemcpy(dstHost, dstFileSize, dstDevice, dstFileSize, ACL_MEMCPY_DEVICE_TO_HOST); + } + + if (rc == 0 && !WriteFile((caseDir + "/output.bin").c_str(), dstHost, dstFileSize)) { + std::fprintf(stderr, "[ERROR] failed to write %s/output.bin\n", caseDir.c_str()); + rc = 1; + } + + if (src0Device != nullptr) + aclrtFree(src0Device); + if (src1Device != nullptr) + aclrtFree(src1Device); + if (dstDevice != nullptr) + aclrtFree(dstDevice); + if (src0Host != nullptr) + aclrtFreeHost(src0Host); + if (src1Host != nullptr) + 
aclrtFreeHost(src1Host); + if (dstHost != nullptr) + aclrtFreeHost(dstHost); + + if (rc == 0) + std::printf("[INFO] case %s done\n", tc.name); + return rc; +} + +int main(int argc, char *argv[]) { + const char *caseFilter = (argc > 1) ? argv[1] : nullptr; + + int rc = 0; + int deviceId = 0; + aclrtStream stream = nullptr; + + aclInit(nullptr); + if (const char *envDevice = std::getenv("ACL_DEVICE_ID")) { + deviceId = std::atoi(envDevice); + } + aclrtSetDevice(deviceId); + aclrtCreateStream(&stream); + + for (size_t i = 0; i < kNumCases; ++i) { + if (caseFilter != nullptr && std::strcmp(kCases[i].name, caseFilter) != 0) { + continue; + } + int ret = RunCase(kCases[i], deviceId, stream); + if (ret != 0) { + std::fprintf(stderr, "[ERROR] case %s failed\n", kCases[i].name); + rc = 1; + break; + } + } + + if (stream != nullptr) + aclrtDestroyStream(stream); + aclrtResetDevice(deviceId); + aclFinalize(); + + return rc; +} diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tcolexpandmin/tcolexpandmin.pto b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tcolexpandmin/tcolexpandmin.pto new file mode 100644 index 000000000..8ebe69e53 --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tcolexpandmin/tcolexpandmin.pto @@ -0,0 +1,139 @@ +// Copyright (c) 2026 Huawei Technologies Co., Ltd. +// This program is free software, you can redistribute it and/or modify it under the terms and conditions of +// CANN Open Software License Agreement Version 2.0 (the "License"). +// Please refer to the License for details. You may not use this file except in compliance with the License. +// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +// See LICENSE in the root of the software repository for the full text of the License.
+ +// TileLang ST kernels for pto.tcolexpandmin: elementwise minimum of src0 and tiled src1. +// Matches PTO-ISA testcase parameters. + +module attributes {pto.target_arch = "a5", pto.kernel_kind = #pto.kernel_kind} { + // Case: fp16_10_64_1_64 (float16, src0=(10,64), src1=(1,64), dst=(10,64)) + func.func @TCOLEXPANDMIN_fp16_10_64_1_64(%src0_ptr: !pto.ptr, %src1_ptr: !pto.ptr, %dst_ptr: !pto.ptr) attributes {pto.kernel} { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c10 = arith.constant 10 : index + %c64 = arith.constant 64 : index + %c640 = arith.constant 640 : index + + %src0_view = pto.make_tensor_view %src0_ptr, + shape = [%c1, %c1, %c1, %c10, %c64], + strides = [%c640, %c640, %c640, %c64, %c1] + : !pto.tensor_view<1x1x1x10x64xf16> + + %src1_view = pto.make_tensor_view %src1_ptr, + shape = [%c1, %c1, %c1, %c1, %c64], + strides = [%c64, %c64, %c64, %c64, %c1] + : !pto.tensor_view<1x1x1x1x64xf16> + + %dst_view = pto.make_tensor_view %dst_ptr, + shape = [%c1, %c1, %c1, %c10, %c64], + strides = [%c640, %c640, %c640, %c64, %c1] + : !pto.tensor_view<1x1x1x10x64xf16> + + %src0_part = pto.partition_view %src0_view, + offsets = [%c0, %c0, %c0, %c0, %c0], + sizes = [%c1, %c1, %c1, %c10, %c64] + : !pto.tensor_view<1x1x1x10x64xf16> -> !pto.partition_tensor_view<1x1x1x10x64xf16> + + %src1_part = pto.partition_view %src1_view, + offsets = [%c0, %c0, %c0, %c0, %c0], + sizes = [%c1, %c1, %c1, %c1, %c64] + : !pto.tensor_view<1x1x1x1x64xf16> -> !pto.partition_tensor_view<1x1x1x1x64xf16> + + %dst_part = pto.partition_view %dst_view, + offsets = [%c0, %c0, %c0, %c0, %c0], + sizes = [%c1, %c1, %c1, %c10, %c64] + : !pto.tensor_view<1x1x1x10x64xf16> -> !pto.partition_tensor_view<1x1x1x10x64xf16> + + %src0_tile = pto.alloc_tile + : !pto.tile_buf + + %src1_tile = pto.alloc_tile + : !pto.tile_buf + + %dst_tile = pto.alloc_tile + : !pto.tile_buf + + pto.tload ins(%src0_part : !pto.partition_tensor_view<1x1x1x10x64xf16>) + outs(%src0_tile : !pto.tile_buf) +
+ pto.tload ins(%src1_part : !pto.partition_tensor_view<1x1x1x1x64xf16>) + outs(%src1_tile : !pto.tile_buf) + + pto.tcolexpandmin ins(%src0_tile, %src1_tile : !pto.tile_buf, + !pto.tile_buf) + outs(%dst_tile : !pto.tile_buf) + +pto.tstore ins(%dst_tile : !pto.tile_buf) + outs(%dst_part : !pto.partition_tensor_view<1x1x1x10x64xf16>) + return + } + + // Case 5: int32_16_32_1_32 (int32, src0=(16,32), src1=(1,32), dst=(16,32)) + + func.func @TCOLEXPANDMIN_int32_16_32_1_32(%src0_ptr: !pto.ptr, %src1_ptr: !pto.ptr, %dst_ptr: !pto.ptr) attributes {pto.kernel} { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c16 = arith.constant 16 : index + %c32 = arith.constant 32 : index + %c512 = arith.constant 512 : index + + %src0_view = pto.make_tensor_view %src0_ptr, + shape = [%c1, %c1, %c1, %c16, %c32], + strides = [%c512, %c512, %c512, %c32, %c1] + : !pto.tensor_view<1x1x1x16x32xi32> + + %src1_view = pto.make_tensor_view %src1_ptr, + shape = [%c1, %c1, %c1, %c1, %c32], + strides = [%c32, %c32, %c32, %c32, %c1] + : !pto.tensor_view<1x1x1x1x32xi32> + + %dst_view = pto.make_tensor_view %dst_ptr, + shape = [%c1, %c1, %c1, %c16, %c32], + strides = [%c512, %c512, %c512, %c32, %c1] + : !pto.tensor_view<1x1x1x16x32xi32> + + %src0_part = pto.partition_view %src0_view, + offsets = [%c0, %c0, %c0, %c0, %c0], + sizes = [%c1, %c1, %c1, %c16, %c32] + : !pto.tensor_view<1x1x1x16x32xi32> -> !pto.partition_tensor_view<1x1x1x16x32xi32> + + %src1_part = pto.partition_view %src1_view, + offsets = [%c0, %c0, %c0, %c0, %c0], + sizes = [%c1, %c1, %c1, %c1, %c32] + : !pto.tensor_view<1x1x1x1x32xi32> -> !pto.partition_tensor_view<1x1x1x1x32xi32> + + %dst_part = pto.partition_view %dst_view, + offsets = [%c0, %c0, %c0, %c0, %c0], + sizes = [%c1, %c1, %c1, %c16, %c32] + : !pto.tensor_view<1x1x1x16x32xi32> -> !pto.partition_tensor_view<1x1x1x16x32xi32> + + %src0_tile = pto.alloc_tile + : !pto.tile_buf + + %src1_tile = pto.alloc_tile + : !pto.tile_buf + + %dst_tile = pto.alloc_tile + : 
!pto.tile_buf + + pto.tload ins(%src0_part : !pto.partition_tensor_view<1x1x1x16x32xi32>) + outs(%src0_tile : !pto.tile_buf) + + pto.tload ins(%src1_part : !pto.partition_tensor_view<1x1x1x1x32xi32>) + outs(%src1_tile : !pto.tile_buf) + + pto.tcolexpandmin ins(%src0_tile, %src1_tile : !pto.tile_buf, + !pto.tile_buf) + outs(%dst_tile : !pto.tile_buf) + + pto.tstore ins(%dst_tile : !pto.tile_buf) + outs(%dst_part : !pto.partition_tensor_view<1x1x1x16x32xi32>) + return + } + + // Case 6: int16_16_64_1_64 (int16, src0=(16,64), src1=(1,64), dst=(16,64)) +} diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tcolexpandmul/CMakeLists.txt b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tcolexpandmul/CMakeLists.txt new file mode 100644 index 000000000..577dc3da8 --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tcolexpandmul/CMakeLists.txt @@ -0,0 +1,16 @@ +# Copyright (c) 2026 Huawei Technologies Co., Ltd. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. +# Copyright (c) 2026 Huawei Technologies Co., Ltd. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You can not use this file except in compliance with the License. 
+# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. + +pto_tilelang_vec_st(tcolexpandmul) diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tcolexpandmul/cases.py b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tcolexpandmul/cases.py new file mode 100644 index 000000000..97e8fa6c2 --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tcolexpandmul/cases.py @@ -0,0 +1,91 @@ +# Copyright (c) 2026 Huawei Technologies Co., Ltd. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. +#!/usr/bin/python3 +# Copyright (c) 2026 Huawei Technologies Co., Ltd. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You can not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. 
+ +# coding=utf-8 + +"""Single source of truth for tcolexpandmul ST test cases. + +TCOLEXPANDMUL: expand src1 then multiply with src0. + - src0_shape: (dst_row, dst_col) - already expanded + - src1_shape: (src1_row, src1_col) - to be expanded (usually src1_row=1) + - dst_shape: (dst_row, dst_col) +""" + +import numpy as np + +CASES = [ + { + "name": "fp32_32_32_1_32", + "dtype": np.float32, + "src0_shape": (32, 32), + "src1_shape": (1, 32), + "shape": (32, 32), + "valid_shape": (32, 32), + "eps": 1e-3, + }, + { + "name": "fp32_16_128_1_128", + "dtype": np.float32, + "src0_shape": (16, 128), + "src1_shape": (1, 128), + "shape": (16, 128), + "valid_shape": (16, 128), + "eps": 1e-3, + }, + { + "name": "fp16_4_256_1_256", + "dtype": np.float16, + "src0_shape": (4, 256), + "src1_shape": (1, 256), + "shape": (4, 256), + "valid_shape": (4, 256), + "eps": 1e-3, + }, + { + "name": "fp16_10_64_1_64", + "dtype": np.float16, + "src0_shape": (10, 64), + "src1_shape": (1, 64), + "shape": (10, 64), + "valid_shape": (10, 64), + "eps": 1e-3, + }, + { + "name": "int32_16_32_1_32", + "dtype": np.int32, + "src0_shape": (16, 32), + "src1_shape": (1, 32), + "shape": (16, 32), + "valid_shape": (16, 32), + "eps": 0, + }, + { + "name": "int16_16_64_1_64", + "dtype": np.int16, + "src0_shape": (16, 64), + "src1_shape": (1, 64), + "shape": (16, 64), + "valid_shape": (16, 64), + "eps": 0, + }, +] + +_SMOKE_CASE_NAMES = ['fp16_10_64_1_64', 'int32_16_32_1_32'] +_SMOKE_CASE_NAME_SET = set(_SMOKE_CASE_NAMES) +_missing = [name for name in _SMOKE_CASE_NAMES if name not in {case["name"] for case in CASES}] +if _missing: + raise RuntimeError("unknown smoke case(s): " + ", ".join(_missing)) +CASES = [case for case in CASES if case["name"] in _SMOKE_CASE_NAME_SET] diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tcolexpandmul/compare.py b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tcolexpandmul/compare.py new file mode 100644 index 000000000..b8ddb1131 --- /dev/null +++ 
b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tcolexpandmul/compare.py @@ -0,0 +1,56 @@ +# Copyright (c) 2026 Huawei Technologies Co., Ltd. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. +#!/usr/bin/python3 +# Copyright (c) 2026 Huawei Technologies Co., Ltd. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License.
+ +# coding=utf-8 + +import os +import sys +import numpy as np + +from cases import CASES +from st_common import result_cmp, style_fail, style_pass, validate_cases + + +def main(): + validate_cases(CASES) + case_filter = sys.argv[1] if len(sys.argv) > 1 else None + + all_passed = True + for case in CASES: + if case_filter is not None and case["name"] != case_filter: + continue + + case_dir = case["name"] + shape = case["shape"] + vr, vc = case["valid_shape"] + + golden = np.fromfile(os.path.join(case_dir, "golden.bin"), dtype=case["dtype"]).reshape(shape) + output = np.fromfile(os.path.join(case_dir, "output.bin"), dtype=case["dtype"]).reshape(shape) + + ok = result_cmp(golden[:vr, :vc], output[:vr, :vc], case["eps"]) + if ok: + print(style_pass(f"[INFO] {case['name']}: compare passed")) + else: + print(style_fail(f"[ERROR] {case['name']}: compare failed")) + all_passed = False + + if not all_passed: + sys.exit(2) + print(style_pass("[INFO] all cases passed")) + + +if __name__ == "__main__": + main() diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tcolexpandmul/gen_data.py b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tcolexpandmul/gen_data.py new file mode 100644 index 000000000..a212c0aac --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tcolexpandmul/gen_data.py @@ -0,0 +1,42 @@ +# Copyright (c) 2026 Huawei Technologies Co., Ltd. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. 
+#!/usr/bin/python3 +# Copyright (c) 2026 Huawei Technologies Co., Ltd. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You can not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. + +# coding=utf-8 + +import numpy as np +from cases import CASES +from st_common import validate_cases, setup_case_rng, save_case_data + +validate_cases(CASES) + +for case in CASES: + setup_case_rng(case) + + dtype = case["dtype"] + src0_shape = case["src0_shape"] + dst_shape = case["shape"] + src1_shape = case["src1_shape"] + valid_shape = case["valid_shape"] + + src0 = np.random.randint(1, 10, size=src0_shape).astype(dtype) + src1 = np.random.randint(1, 10, size=src1_shape).astype(dtype) + + dst_row, dst_col = dst_shape + reps = dst_row + golden = src0 * np.tile(src1, (reps, 1))[:, :dst_col] + + save_case_data(case["name"], {"input0": src0, "input1": src1, "golden": golden}) + print(f"[INFO] gen_data: {case['name']} src0={src0_shape} src1={src1_shape} dst={dst_shape} dtype={dtype.__name__}") diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tcolexpandmul/launch.cpp b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tcolexpandmul/launch.cpp new file mode 100644 index 000000000..2d7a1cff7 --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tcolexpandmul/launch.cpp @@ -0,0 +1,28 @@ +// Copyright (c) 2026 Huawei Technologies Co., Ltd. 
+// This program is free software, you can redistribute it and/or modify it under the terms and conditions of +// CANN Open Software License Agreement Version 2.0 (the "License"). +// Please refer to the License for details. You may not use this file except in compliance with the License. +// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +// See LICENSE in the root of the software repository for the full text of the License. + +#include + +#ifndef AICORE +#define AICORE [aicore] +#endif + +// Kernel entry points and host launch wrappers for the fp16_10_64_1_64 and int32_16_32_1_32 cases. + +extern "C" __global__ AICORE void TCOLEXPANDMUL_fp16_10_64_1_64(__gm__ uint16_t *src0, __gm__ uint16_t *src1, __gm__ uint16_t *dst); +extern "C" __global__ AICORE void TCOLEXPANDMUL_int32_16_32_1_32(__gm__ int32_t *src0, __gm__ int32_t *src1, __gm__ int32_t *dst); + +void LaunchTCOLEXPANDMUL_int32_16_32_1_32(int32_t *src0, int32_t *src1, int32_t *dst, void *stream) { + TCOLEXPANDMUL_int32_16_32_1_32<<<1, nullptr, stream>>>((__gm__ int32_t *)src0, (__gm__ int32_t *)src1, (__gm__ int32_t *)dst); +} + + + +void LaunchTCOLEXPANDMUL_fp16_10_64_1_64(uint16_t *src0, uint16_t *src1, uint16_t *dst, void *stream) { + TCOLEXPANDMUL_fp16_10_64_1_64<<<1, nullptr, stream>>>((__gm__ uint16_t *)src0, (__gm__ uint16_t *)src1, (__gm__ uint16_t *)dst); +} diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tcolexpandmul/main.cpp b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tcolexpandmul/main.cpp new file mode 100644 index 000000000..7c92940bd --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tcolexpandmul/main.cpp @@ -0,0 +1,159 @@ +// Copyright (c) 2026 Huawei Technologies Co., Ltd. +// This program is free software, you can redistribute it and/or modify it under the terms and conditions of +// CANN Open Software License Agreement Version 2.0 (the "License").
+// Please refer to the License for details. You may not use this file except in compliance with the License. +// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +// See LICENSE in the root of the software repository for the full text of the License. +// Copyright (c) 2026 Huawei Technologies Co., Ltd. +// This program is free software, you can redistribute it and/or modify it under the terms and conditions of +// CANN Open Software License Agreement Version 2.0 (the "License"). +// Please refer to the License for details. You can not use this file except in compliance with the License. +// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +// See LICENSE in the root of the software repository for the full text of the License. 
+ +// Host driver for TileLang tcolexpandmul ST +// TCOLEXPANDMUL: expand src1 then multiply with src0 + +#include "acl/acl.h" +#include "test_common.h" +#include +#include +#include +#include +#include +#include + +using namespace PtoTestCommon; + +// Kernel launch wrappers (launch.cpp defines only the fp16_10_64_1_64 and int32_16_32_1_32 wrappers; the fp32/int16 declarations are unused here) +void LaunchTCOLEXPANDMUL_fp32_32_32_1_32(float *src0, float *src1, float *dst, void *stream); +void LaunchTCOLEXPANDMUL_fp16_10_64_1_64(uint16_t *src0, uint16_t *src1, uint16_t *dst, void *stream); +void LaunchTCOLEXPANDMUL_int32_16_32_1_32(int32_t *src0, int32_t *src1, int32_t *dst, void *stream); +void LaunchTCOLEXPANDMUL_int16_16_64_1_64(int16_t *src0, int16_t *src1, int16_t *dst, void *stream); + +using LaunchFn = void (*)(void *, void *, void *, void *); + +struct TestCase { + const char *name; + LaunchFn launch; + size_t src0Rows; + size_t src0Cols; + size_t src1Rows; + size_t src1Cols; + size_t dstRows; + size_t dstCols; + size_t validRows; + size_t validCols; + size_t elemSize; +}; + +static const TestCase kCases[] = { +{"fp16_10_64_1_64", (LaunchFn)LaunchTCOLEXPANDMUL_fp16_10_64_1_64, 10, 64, 1, 64, 10, 64, 10, 64, sizeof(uint16_t)}, +{"int32_16_32_1_32", (LaunchFn)LaunchTCOLEXPANDMUL_int32_16_32_1_32, 16, 32, 1, 32, 16, 32, 16, 32, sizeof(int32_t)}, +}; +static constexpr size_t kNumCases = sizeof(kCases) / sizeof(kCases[0]); + +static int RunCase(const TestCase &tc, int deviceId, aclrtStream stream) { + int rc = 0; + const size_t src0ElemCount = tc.src0Rows * tc.src0Cols; + const size_t src1ElemCount = tc.src1Rows * tc.src1Cols; + const size_t dstElemCount = tc.dstRows * tc.dstCols; + const size_t src0FileSize = src0ElemCount * tc.elemSize; + const size_t src1FileSize = src1ElemCount * tc.elemSize; + const size_t dstFileSize = dstElemCount * tc.elemSize; + + std::printf("[INFO] === case: %s (src0=%zux%zu, src1=%zux%zu -> dst=%zux%zu) ===\n", + tc.name, tc.src0Rows, tc.src0Cols, tc.src1Rows, tc.src1Cols, tc.dstRows, tc.dstCols); + + std::string caseDir =
std::string("./") + tc.name; + size_t actualSrc0FileSize = src0FileSize; + size_t actualSrc1FileSize = src1FileSize; + + void *src0Host = nullptr, *src1Host = nullptr, *dstHost = nullptr; + void *src0Device = nullptr, *src1Device = nullptr, *dstDevice = nullptr; + + aclrtMallocHost((void **)(&src0Host), src0FileSize); + aclrtMallocHost((void **)(&src1Host), src1FileSize); + aclrtMallocHost((void **)(&dstHost), dstFileSize); + + aclrtMalloc((void **)&src0Device, src0FileSize, ACL_MEM_MALLOC_HUGE_FIRST); + aclrtMalloc((void **)&src1Device, src1FileSize, ACL_MEM_MALLOC_HUGE_FIRST); + aclrtMalloc((void **)&dstDevice, dstFileSize, ACL_MEM_MALLOC_HUGE_FIRST); + + if (!ReadFile((caseDir + "/input0.bin").c_str(), actualSrc0FileSize, src0Host, src0FileSize)) { + std::fprintf(stderr, "[ERROR] failed to read %s/input0.bin\n", caseDir.c_str()); + rc = 1; + } + if (rc == 0 && !ReadFile((caseDir + "/input1.bin").c_str(), actualSrc1FileSize, src1Host, src1FileSize)) { + std::fprintf(stderr, "[ERROR] failed to read %s/input1.bin\n", caseDir.c_str()); + rc = 1; + } + + if (rc == 0) { + aclrtMemcpy(src0Device, src0FileSize, src0Host, src0FileSize, ACL_MEMCPY_HOST_TO_DEVICE); + aclrtMemcpy(src1Device, src1FileSize, src1Host, src1FileSize, ACL_MEMCPY_HOST_TO_DEVICE); + + tc.launch(src0Device, src1Device, dstDevice, stream); + + aclrtSynchronizeStream(stream); + aclrtMemcpy(dstHost, dstFileSize, dstDevice, dstFileSize, ACL_MEMCPY_DEVICE_TO_HOST); + } + + if (rc == 0 && !WriteFile((caseDir + "/output.bin").c_str(), dstHost, dstFileSize)) { + std::fprintf(stderr, "[ERROR] failed to write %s/output.bin\n", caseDir.c_str()); + rc = 1; + } + + if (src0Device != nullptr) + aclrtFree(src0Device); + if (src1Device != nullptr) + aclrtFree(src1Device); + if (dstDevice != nullptr) + aclrtFree(dstDevice); + if (src0Host != nullptr) + aclrtFreeHost(src0Host); + if (src1Host != nullptr) + aclrtFreeHost(src1Host); + if (dstHost != nullptr) + aclrtFreeHost(dstHost); + + if (rc == 0) + 
std::printf("[INFO] case %s done\n", tc.name); + return rc; +} + +int main(int argc, char *argv[]) { + const char *caseFilter = (argc > 1) ? argv[1] : nullptr; + + int rc = 0; + int deviceId = 0; + aclrtStream stream = nullptr; + + aclInit(nullptr); + if (const char *envDevice = std::getenv("ACL_DEVICE_ID")) { + deviceId = std::atoi(envDevice); + } + aclrtSetDevice(deviceId); + aclrtCreateStream(&stream); + + for (size_t i = 0; i < kNumCases; ++i) { + if (caseFilter != nullptr && std::strcmp(kCases[i].name, caseFilter) != 0) { + continue; + } + int ret = RunCase(kCases[i], deviceId, stream); + if (ret != 0) { + std::fprintf(stderr, "[ERROR] case %s failed\n", kCases[i].name); + rc = 1; + break; + } + } + + if (stream != nullptr) + aclrtDestroyStream(stream); + aclrtResetDevice(deviceId); + aclFinalize(); + + return rc; +} diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tcolexpandmul/tcolexpandmul.pto b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tcolexpandmul/tcolexpandmul.pto new file mode 100644 index 000000000..38ece44ff --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tcolexpandmul/tcolexpandmul.pto @@ -0,0 +1,138 @@ +// Copyright (c) 2026 Huawei Technologies Co., Ltd. +// This program is free software, you can redistribute it and/or modify it under the terms and conditions of +// CANN Open Software License Agreement Version 2.0 (the "License"). +// Please refer to the License for details. You can not use this file except in compliance with the License. +// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +// See LICENSE in the root of the software repository for the full text of the License. + +// TileLang ST kernels for pto.tcolexpandmul: expand src1 then multiply with src0. 
+ +module attributes {pto.target_arch = "a5", pto.kernel_kind = #pto.kernel_kind} { + // Case 1: fp32_16_128_1_128 (float32, src0=(16,128), src1=(1,128), dst=(16,128)) + func.func @TCOLEXPANDMUL_fp16_10_64_1_64(%src0_ptr: !pto.ptr, %src1_ptr: !pto.ptr, %dst_ptr: !pto.ptr) attributes {pto.kernel} { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c10 = arith.constant 10 : index + %c64 = arith.constant 64 : index + %c640 = arith.constant 640 : index + + %src0_view = pto.make_tensor_view %src0_ptr, + shape = [%c1, %c1, %c1, %c10, %c64], + strides = [%c640, %c640, %c640, %c64, %c1] + : !pto.tensor_view<1x1x1x10x64xf16> + + %src1_view = pto.make_tensor_view %src1_ptr, + shape = [%c1, %c1, %c1, %c1, %c64], + strides = [%c64, %c64, %c64, %c64, %c1] + : !pto.tensor_view<1x1x1x1x64xf16> + + %dst_view = pto.make_tensor_view %dst_ptr, + shape = [%c1, %c1, %c1, %c10, %c64], + strides = [%c640, %c640, %c640, %c64, %c1] + : !pto.tensor_view<1x1x1x10x64xf16> + + %src0_part = pto.partition_view %src0_view, + offsets = [%c0, %c0, %c0, %c0, %c0], + sizes = [%c1, %c1, %c1, %c10, %c64] + : !pto.tensor_view<1x1x1x10x64xf16> -> !pto.partition_tensor_view<1x1x1x10x64xf16> + + %src1_part = pto.partition_view %src1_view, + offsets = [%c0, %c0, %c0, %c0, %c0], + sizes = [%c1, %c1, %c1, %c1, %c64] + : !pto.tensor_view<1x1x1x1x64xf16> -> !pto.partition_tensor_view<1x1x1x1x64xf16> + + %dst_part = pto.partition_view %dst_view, + offsets = [%c0, %c0, %c0, %c0, %c0], + sizes = [%c1, %c1, %c1, %c10, %c64] + : !pto.tensor_view<1x1x1x10x64xf16> -> !pto.partition_tensor_view<1x1x1x10x64xf16> + + %src0 = pto.alloc_tile + : !pto.tile_buf + + %src1 = pto.alloc_tile + : !pto.tile_buf + + %dst = pto.alloc_tile + : !pto.tile_buf + + pto.tload ins(%src0_part : !pto.partition_tensor_view<1x1x1x10x64xf16>) + outs(%src0 : !pto.tile_buf) + + pto.tload ins(%src1_part : !pto.partition_tensor_view<1x1x1x1x64xf16>) + outs(%src1 : !pto.tile_buf) + + pto.tcolexpandmul ins(%src0, %src1 : 
!pto.tile_buf, + !pto.tile_buf) + outs(%dst : !pto.tile_buf) + +pto.tstore ins(%dst : !pto.tile_buf) + outs(%dst_part : !pto.partition_tensor_view<1x1x1x10x64xf16>) + return + } + + // Case 5: int32_16_32_1_32 (int32, src0=(16,32), src1=(1,32), dst=(16,32)) + + func.func @TCOLEXPANDMUL_int32_16_32_1_32(%src0_ptr: !pto.ptr, %src1_ptr: !pto.ptr, %dst_ptr: !pto.ptr) attributes {pto.kernel} { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c16 = arith.constant 16 : index + %c32 = arith.constant 32 : index + %c512 = arith.constant 512 : index + + %src0_view = pto.make_tensor_view %src0_ptr, + shape = [%c1, %c1, %c1, %c16, %c32], + strides = [%c512, %c512, %c512, %c32, %c1] + : !pto.tensor_view<1x1x1x16x32xi32> + + %src1_view = pto.make_tensor_view %src1_ptr, + shape = [%c1, %c1, %c1, %c1, %c32], + strides = [%c32, %c32, %c32, %c32, %c1] + : !pto.tensor_view<1x1x1x1x32xi32> + + %dst_view = pto.make_tensor_view %dst_ptr, + shape = [%c1, %c1, %c1, %c16, %c32], + strides = [%c512, %c512, %c512, %c32, %c1] + : !pto.tensor_view<1x1x1x16x32xi32> + + %src0_part = pto.partition_view %src0_view, + offsets = [%c0, %c0, %c0, %c0, %c0], + sizes = [%c1, %c1, %c1, %c16, %c32] + : !pto.tensor_view<1x1x1x16x32xi32> -> !pto.partition_tensor_view<1x1x1x16x32xi32> + + %src1_part = pto.partition_view %src1_view, + offsets = [%c0, %c0, %c0, %c0, %c0], + sizes = [%c1, %c1, %c1, %c1, %c32] + : !pto.tensor_view<1x1x1x1x32xi32> -> !pto.partition_tensor_view<1x1x1x1x32xi32> + + %dst_part = pto.partition_view %dst_view, + offsets = [%c0, %c0, %c0, %c0, %c0], + sizes = [%c1, %c1, %c1, %c16, %c32] + : !pto.tensor_view<1x1x1x16x32xi32> -> !pto.partition_tensor_view<1x1x1x16x32xi32> + + %src0 = pto.alloc_tile + : !pto.tile_buf + + %src1 = pto.alloc_tile + : !pto.tile_buf + + %dst = pto.alloc_tile + : !pto.tile_buf + + pto.tload ins(%src0_part : !pto.partition_tensor_view<1x1x1x16x32xi32>) + outs(%src0 : !pto.tile_buf) + + pto.tload ins(%src1_part : 
!pto.partition_tensor_view<1x1x1x1x32xi32>) + outs(%src1 : !pto.tile_buf) + + pto.tcolexpandmul ins(%src0, %src1 : !pto.tile_buf, + !pto.tile_buf) + outs(%dst : !pto.tile_buf) + + pto.tstore ins(%dst : !pto.tile_buf) + outs(%dst_part : !pto.partition_tensor_view<1x1x1x16x32xi32>) + return + } + + // Case 6: int16_16_64_1_64 (int16, src0=(16,64), src1=(1,64), dst=(16,64)) +} diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tcolexpandsub/CMakeLists.txt b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tcolexpandsub/CMakeLists.txt new file mode 100644 index 000000000..e5a57ce90 --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tcolexpandsub/CMakeLists.txt @@ -0,0 +1,16 @@ +# Copyright (c) 2026 Huawei Technologies Co., Ltd. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. +# Copyright (c) 2026 Huawei Technologies Co., Ltd. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file in the compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. 
+ +pto_tilelang_vec_st(tcolexpandsub) diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tcolexpandsub/cases.py b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tcolexpandsub/cases.py new file mode 100644 index 000000000..e6010cb37 --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tcolexpandsub/cases.py @@ -0,0 +1,91 @@ +#!/usr/bin/python3 +# Copyright (c) 2026 Huawei Technologies Co., Ltd. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. + +# coding=utf-8 + +"""Single source of truth for tcolexpandsub ST test cases. +Matches the PTO-ISA testcase definitions in pto-isa/tests/npu/a5/src/st/testcase/tcolexpandsub/ + +TCOLEXPANDSUB: subtract the expanded src1 from src0 (the first row of src1 is broadcast over the rows).
+ - src0_shape: (src0_row, cols) - first input tile + - src1_shape: (src1_row, cols) - second input tile (only first row used for broadcast) + - dst_shape: (dst_row, cols) - result output + - shape: (dst_row, cols) - alias of dst_shape, for compare.py compatibility + - valid_shape: (valid_row, valid_col) - effective computation region + +Golden: src0 - np.tile(src1, (reps, 1))[:, :dst_col] # expand then subtract + +Case naming: {dtype}_{src0_row}_{cols}_{src1_row}_{dst_col} +""" + +import numpy as np + +CASES = [ + { + "name": "fp32_6_128_1_128", + "dtype": np.float32, + "src0_shape": (6, 128), + "src1_shape": (1, 128), + "shape": (6, 128), + "valid_shape": (6, 128), + "eps": 1e-6, + }, + { + "name": "fp32_18_32_1_32", + "dtype": np.float32, + "src0_shape": (18, 32), + "src1_shape": (1, 32), + "shape": (18, 32), + "valid_shape": (18, 32), + "eps": 1e-6, + }, + { + "name": "fp16_10_256_1_256", + "dtype": np.float16, + "src0_shape": (10, 256), + "src1_shape": (1, 256), + "shape": (10, 256), + "valid_shape": (10, 256), + "eps": 1e-3, + }, + { + "name": "fp16_12_64_1_64", + "dtype": np.float16, + "src0_shape": (12, 64), + "src1_shape": (1, 64), + "shape": (12, 64), + "valid_shape": (12, 64), + "eps": 1e-3, + }, + { + "name": "int32_16_32_1_32", + "dtype": np.int32, + "src0_shape": (16, 32), + "src1_shape": (1, 32), + "shape": (16, 32), + "valid_shape": (16, 32), + "eps": 0, + }, + { + "name": "int16_16_64_1_64", + "dtype": np.int16, + "src0_shape": (16, 64), + "src1_shape": (1, 64), + "shape": (16, 64), + "valid_shape": (16, 64), + "eps": 0, + }, +] + +_SMOKE_CASE_NAMES = ['fp32_18_32_1_32', 'int32_16_32_1_32'] +_SMOKE_CASE_NAME_SET = set(_SMOKE_CASE_NAMES) +_missing = [name for name in _SMOKE_CASE_NAMES if name not in {case["name"] for case in CASES}] +if _missing: + raise RuntimeError("unknown smoke case(s): " + ", ".join(_missing)) +CASES = [case for case in CASES if case["name"] in _SMOKE_CASE_NAME_SET] diff --git 
a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tcolexpandsub/compare.py b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tcolexpandsub/compare.py new file mode 100644 index 000000000..b8ddb1131 --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tcolexpandsub/compare.py @@ -0,0 +1,56 @@ +# Copyright (c) 2026 Huawei Technologies Co., Ltd. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. +#!/usr/bin/python3 +# Copyright (c) 2026 Huawei Technologies Co., Ltd. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License.
+ +# coding=utf-8 + +import os +import sys +import numpy as np + +from cases import CASES +from st_common import result_cmp, style_fail, style_pass, validate_cases + + +def main(): + validate_cases(CASES) + case_filter = sys.argv[1] if len(sys.argv) > 1 else None + + all_passed = True + for case in CASES: + if case_filter is not None and case["name"] != case_filter: + continue + + case_dir = case["name"] + shape = case["shape"] + vr, vc = case["valid_shape"] + + golden = np.fromfile(os.path.join(case_dir, "golden.bin"), dtype=case["dtype"]).reshape(shape) + output = np.fromfile(os.path.join(case_dir, "output.bin"), dtype=case["dtype"]).reshape(shape) + + ok = result_cmp(golden[:vr, :vc], output[:vr, :vc], case["eps"]) + if ok: + print(style_pass(f"[INFO] {case['name']}: compare passed")) + else: + print(style_fail(f"[ERROR] {case['name']}: compare failed")) + all_passed = False + + if not all_passed: + sys.exit(2) + print(style_pass("[INFO] all cases passed")) + + +if __name__ == "__main__": + main() diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tcolexpandsub/gen_data.py b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tcolexpandsub/gen_data.py new file mode 100644 index 000000000..a7960c311 --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tcolexpandsub/gen_data.py @@ -0,0 +1,35 @@ +#!/usr/bin/python3 +# Copyright (c) 2026 Huawei Technologies Co., Ltd. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. 
+ +# coding=utf-8 + +import numpy as np +from cases import CASES +from st_common import validate_cases, setup_case_rng, save_case_data + +validate_cases(CASES) + +for case in CASES: + setup_case_rng(case) + + dtype = case["dtype"] + src0_shape = case["src0_shape"] + dst_shape = case["shape"] + src1_shape = case["src1_shape"] + valid_shape = case["valid_shape"] + + src0 = np.random.randint(1, 10, size=src0_shape).astype(dtype) + src1 = np.random.randint(1, 10, size=src1_shape).astype(dtype) + + valid_row, valid_col = valid_shape + reps = valid_row + golden = src0 - np.tile(src1, (reps, 1))[:, :valid_col] + + save_case_data(case["name"], {"input0": src0, "input1": src1, "golden": golden}) + print(f"[INFO] gen_data: {case['name']} src0={src0_shape} src1={src1_shape} dst={dst_shape} valid={valid_shape} dtype={dtype.__name__}") diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tcolexpandsub/launch.cpp b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tcolexpandsub/launch.cpp new file mode 100644 index 000000000..d55d06881 --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tcolexpandsub/launch.cpp @@ -0,0 +1,28 @@ +// Copyright (c) 2026 Huawei Technologies Co., Ltd. +// This program is free software, you can redistribute it and/or modify it under the terms and conditions of +// CANN Open Software License Agreement Version 2.0 (the "License"). +// Please refer to the License for details. You may not use this file except in compliance with the License. +// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +// See LICENSE in the root of the software repository for the full text of the License. 
+ +#include + +#ifndef AICORE +#define AICORE [aicore] +#endif + +// Smoke cases: fp32_18_32_1_32 and int32_16_32_1_32 + +extern "C" __global__ AICORE void TCOLEXPANDSUB_fp32_18_32_1_32(__gm__ float *src0, __gm__ float *src1, __gm__ float *dst); +extern "C" __global__ AICORE void TCOLEXPANDSUB_int32_16_32_1_32(__gm__ int32_t *src0, __gm__ int32_t *src1, __gm__ int32_t *dst); + +void LaunchTCOLEXPANDSUB_fp32_18_32_1_32(float *src0, float *src1, float *dst, void *stream) { + TCOLEXPANDSUB_fp32_18_32_1_32<<<1, nullptr, stream>>>((__gm__ float *)src0, (__gm__ float *)src1, (__gm__ float *)dst); +} + + + +void LaunchTCOLEXPANDSUB_int32_16_32_1_32(int32_t *src0, int32_t *src1, int32_t *dst, void *stream) { + TCOLEXPANDSUB_int32_16_32_1_32<<<1, nullptr, stream>>>((__gm__ int32_t *)src0, (__gm__ int32_t *)src1, (__gm__ int32_t *)dst); +} diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tcolexpandsub/main.cpp b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tcolexpandsub/main.cpp new file mode 100644 index 000000000..40c15d7b6 --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tcolexpandsub/main.cpp @@ -0,0 +1,161 @@ +// Copyright (c) 2026 Huawei Technologies Co., Ltd. +// This program is free software, you can redistribute it and/or modify it under the terms and conditions of +// CANN Open Software License Agreement Version 2.0 (the "License"). +// Please refer to the License for details. You may not use this file except in compliance with the License. +// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +// See LICENSE in the root of the software repository for the full text of the License. +// Copyright (c) 2026 Huawei Technologies Co., Ltd.
+// This program is free software, you can redistribute it and/or modify it under the terms and conditions of +// CANN Open Software License Agreement Version 2.0 (the "License"). +// Please refer to the License for details. You may not use this file except in compliance with the License. +// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +// See LICENSE in the root of the software repository for the full text of the License. + +// Host driver for TileLang tcolexpandsub ST +// Test cases match PTO-ISA: pto-isa/tests/npu/a5/src/st/testcase/tcolexpandsub/ +// TCOLEXPANDSUB: subtract the expanded src1 from src0 (broadcast src1 first row) + +#include "acl/acl.h" +#include "test_common.h" +#include +#include +#include +#include +#include +#include + +using namespace PtoTestCommon; + +// Kernel launch wrappers (defined in launch.cpp) +void LaunchTCOLEXPANDSUB_fp32_18_32_1_32(float *src0, float *src1, float *dst, void *stream); +void LaunchTCOLEXPANDSUB_fp16_12_64_1_64(uint16_t *src0, uint16_t *src1, uint16_t *dst, void *stream); +void LaunchTCOLEXPANDSUB_int32_16_32_1_32(int32_t *src0, int32_t *src1, int32_t *dst, void *stream); +void LaunchTCOLEXPANDSUB_int16_16_64_1_64(int16_t *src0, int16_t *src1, int16_t *dst, void *stream); + +using LaunchFn = void (*)(void *, void *, void *, void *); + +struct TestCase { + const char *name; + LaunchFn launch; + size_t src0Rows; + size_t src0Cols; + size_t src1Rows; + size_t src1Cols; + size_t dstRows; + size_t dstCols; + size_t validRows; + size_t validCols; + size_t elemSize; +}; + +static const TestCase kCases[] = { +{"fp32_18_32_1_32", (LaunchFn)LaunchTCOLEXPANDSUB_fp32_18_32_1_32, 18, 32, 1, 32,18, 32,18, 32, sizeof(float)}, +{"int32_16_32_1_32", (LaunchFn)LaunchTCOLEXPANDSUB_int32_16_32_1_32, 16, 32, 1, 32, 16, 32, 16, 32, sizeof(int32_t)}, +}; +static
constexpr size_t kNumCases = sizeof(kCases) / sizeof(kCases[0]); + +static int RunCase(const TestCase &tc, int deviceId, aclrtStream stream) { + int rc = 0; + const size_t src0ElemCount = tc.src0Rows * tc.src0Cols; + const size_t src1ElemCount = tc.src1Rows * tc.src1Cols; + const size_t dstElemCount = tc.dstRows * tc.dstCols; + const size_t src0FileSize = src0ElemCount * tc.elemSize; + const size_t src1FileSize = src1ElemCount * tc.elemSize; + const size_t dstFileSize = dstElemCount * tc.elemSize; + + std::printf("[INFO] === case: %s (src0=%zux%zu, src1=%zux%zu -> dst=%zux%zu, valid=%zux%zu) ===\n", + tc.name, tc.src0Rows, tc.src0Cols, tc.src1Rows, tc.src1Cols, tc.dstRows, tc.dstCols, tc.validRows, tc.validCols); + + std::string caseDir = std::string("./") + tc.name; + size_t actualSrc0FileSize = src0FileSize; + size_t actualSrc1FileSize = src1FileSize; + + void *src0Host = nullptr, *src1Host = nullptr, *dstHost = nullptr; + void *src0Device = nullptr, *src1Device = nullptr, *dstDevice = nullptr; + + aclrtMallocHost((void **)(&src0Host), src0FileSize); + aclrtMallocHost((void **)(&src1Host), src1FileSize); + aclrtMallocHost((void **)(&dstHost), dstFileSize); + + aclrtMalloc((void **)&src0Device, src0FileSize, ACL_MEM_MALLOC_HUGE_FIRST); + aclrtMalloc((void **)&src1Device, src1FileSize, ACL_MEM_MALLOC_HUGE_FIRST); + aclrtMalloc((void **)&dstDevice, dstFileSize, ACL_MEM_MALLOC_HUGE_FIRST); + + if (!ReadFile((caseDir + "/input0.bin").c_str(), actualSrc0FileSize, src0Host, src0FileSize)) { + std::fprintf(stderr, "[ERROR] failed to read %s/input0.bin\n", caseDir.c_str()); + rc = 1; + } + + if (rc == 0 && !ReadFile((caseDir + "/input1.bin").c_str(), actualSrc1FileSize, src1Host, src1FileSize)) { + std::fprintf(stderr, "[ERROR] failed to read %s/input1.bin\n", caseDir.c_str()); + rc = 1; + } + + if (rc == 0) { + aclrtMemcpy(src0Device, src0FileSize, src0Host, src0FileSize, ACL_MEMCPY_HOST_TO_DEVICE); + aclrtMemcpy(src1Device, src1FileSize, src1Host, src1FileSize, 
ACL_MEMCPY_HOST_TO_DEVICE); + + tc.launch(src0Device, src1Device, dstDevice, stream); + + aclrtSynchronizeStream(stream); + aclrtMemcpy(dstHost, dstFileSize, dstDevice, dstFileSize, ACL_MEMCPY_DEVICE_TO_HOST); + } + + if (rc == 0 && !WriteFile((caseDir + "/output.bin").c_str(), dstHost, dstFileSize)) { + std::fprintf(stderr, "[ERROR] failed to write %s/output.bin\n", caseDir.c_str()); + rc = 1; + } + + if (src0Device != nullptr) + aclrtFree(src0Device); + if (src1Device != nullptr) + aclrtFree(src1Device); + if (dstDevice != nullptr) + aclrtFree(dstDevice); + if (src0Host != nullptr) + aclrtFreeHost(src0Host); + if (src1Host != nullptr) + aclrtFreeHost(src1Host); + if (dstHost != nullptr) + aclrtFreeHost(dstHost); + + if (rc == 0) + std::printf("[INFO] case %s done\n", tc.name); + return rc; +} + +int main(int argc, char *argv[]) { + const char *caseFilter = (argc > 1) ? argv[1] : nullptr; + + int rc = 0; + int deviceId = 0; + aclrtStream stream = nullptr; + + aclInit(nullptr); + if (const char *envDevice = std::getenv("ACL_DEVICE_ID")) { + deviceId = std::atoi(envDevice); + } + aclrtSetDevice(deviceId); + aclrtCreateStream(&stream); + + for (size_t i = 0; i < kNumCases; ++i) { + if (caseFilter != nullptr && std::strcmp(kCases[i].name, caseFilter) != 0) { + continue; + } + int ret = RunCase(kCases[i], deviceId, stream); + if (ret != 0) { + std::fprintf(stderr, "[ERROR] case %s failed\n", kCases[i].name); + rc = 1; + break; + } + } + + if (stream != nullptr) + aclrtDestroyStream(stream); + aclrtResetDevice(deviceId); + aclFinalize(); + + return rc; +} diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tcolexpandsub/tcolexpandsub.pto b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tcolexpandsub/tcolexpandsub.pto new file mode 100644 index 000000000..76c706b01 --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tcolexpandsub/tcolexpandsub.pto @@ -0,0 +1,140 @@ +// Copyright (c) 2026 Huawei Technologies Co., Ltd. 
+// This program is free software, you can redistribute it and/or modify it under the terms and conditions of +// CANN Open Software License Agreement Version 2.0 (the "License"). +// Please refer to the License for details. You may not use this file except in compliance with the License. +// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +// See LICENSE in the root of the software repository for the full text of the License. + +// TileLang ST kernels for pto.tcolexpandsub: subtract the expanded src1 from src0. +// Matches PTO-ISA testcase parameters. +// Golden: src0 - np.tile(src1, (reps, 1))[:, :dst_col] + +module attributes {pto.target_arch = "a5", pto.kernel_kind = #pto.kernel_kind} { + // Case: fp32_18_32_1_32 (float32, src0=(18,32), src1=(1,32), dst=(18,32)) + func.func @TCOLEXPANDSUB_fp32_18_32_1_32(%src0_ptr: !pto.ptr, %src1_ptr: !pto.ptr, %dst_ptr: !pto.ptr) attributes {pto.kernel} { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c18 = arith.constant 18 : index + %c32 = arith.constant 32 : index + %c576 = arith.constant 576 : index + + %src0_view = pto.make_tensor_view %src0_ptr, + shape = [%c1, %c1, %c1, %c18, %c32], + strides = [%c576, %c576, %c576, %c32, %c1] + : !pto.tensor_view<1x1x1x18x32xf32> + + %src1_view = pto.make_tensor_view %src1_ptr, + shape = [%c1, %c1, %c1, %c1, %c32], + strides = [%c32, %c32, %c32, %c32, %c1] + : !pto.tensor_view<1x1x1x1x32xf32> + + %dst_view = pto.make_tensor_view %dst_ptr, + shape = [%c1, %c1, %c1, %c18, %c32], + strides = [%c576, %c576, %c576, %c32, %c1] + : !pto.tensor_view<1x1x1x18x32xf32> + + %src0_part = pto.partition_view %src0_view, + offsets = [%c0, %c0, %c0, %c0, %c0], + sizes = [%c1, %c1, %c1, %c18, %c32] + : !pto.tensor_view<1x1x1x18x32xf32> -> !pto.partition_tensor_view<1x1x1x18x32xf32> + + %src1_part = pto.partition_view
%src1_view, + offsets = [%c0, %c0, %c0, %c0, %c0], + sizes = [%c1, %c1, %c1, %c1, %c32] + : !pto.tensor_view<1x1x1x1x32xf32> -> !pto.partition_tensor_view<1x1x1x1x32xf32> + + %dst_part = pto.partition_view %dst_view, + offsets = [%c0, %c0, %c0, %c0, %c0], + sizes = [%c1, %c1, %c1, %c18, %c32] + : !pto.tensor_view<1x1x1x18x32xf32> -> !pto.partition_tensor_view<1x1x1x18x32xf32> + + %src0 = pto.alloc_tile + : !pto.tile_buf + + %src1 = pto.alloc_tile + : !pto.tile_buf + + %dst = pto.alloc_tile + : !pto.tile_buf + + pto.tload ins(%src0_part : !pto.partition_tensor_view<1x1x1x18x32xf32>) + outs(%src0 : !pto.tile_buf) + + pto.tload ins(%src1_part : !pto.partition_tensor_view<1x1x1x1x32xf32>) + outs(%src1 : !pto.tile_buf) + + pto.tcolexpandsub ins(%src0, %src1 : !pto.tile_buf, + !pto.tile_buf) + outs(%dst : !pto.tile_buf) + + pto.tstore ins(%dst : !pto.tile_buf) + outs(%dst_part : !pto.partition_tensor_view<1x1x1x18x32xf32>) + return + } + + // Case 3: fp16_10_256_1_256 (float16, src0=(10,256), src1=(1,256), dst=(10,256)) + + func.func @TCOLEXPANDSUB_int32_16_32_1_32(%src0_ptr: !pto.ptr, %src1_ptr: !pto.ptr, %dst_ptr: !pto.ptr) attributes {pto.kernel} { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c16 = arith.constant 16 : index + %c32 = arith.constant 32 : index + %c512 = arith.constant 512 : index + + %src0_view = pto.make_tensor_view %src0_ptr, + shape = [%c1, %c1, %c1, %c16, %c32], + strides = [%c512, %c512, %c512, %c32, %c1] + : !pto.tensor_view<1x1x1x16x32xi32> + + %src1_view = pto.make_tensor_view %src1_ptr, + shape = [%c1, %c1, %c1, %c1, %c32], + strides = [%c32, %c32, %c32, %c32, %c1] + : !pto.tensor_view<1x1x1x1x32xi32> + + %dst_view = pto.make_tensor_view %dst_ptr, + shape = [%c1, %c1, %c1, %c16, %c32], + strides = [%c512, %c512, %c512, %c32, %c1] + : !pto.tensor_view<1x1x1x16x32xi32> + + %src0_part = pto.partition_view %src0_view, + offsets = [%c0, %c0, %c0, %c0, %c0], + sizes = [%c1, %c1, %c1, %c16, %c32] + : 
!pto.tensor_view<1x1x1x16x32xi32> -> !pto.partition_tensor_view<1x1x1x16x32xi32> + + %src1_part = pto.partition_view %src1_view, + offsets = [%c0, %c0, %c0, %c0, %c0], + sizes = [%c1, %c1, %c1, %c1, %c32] + : !pto.tensor_view<1x1x1x1x32xi32> -> !pto.partition_tensor_view<1x1x1x1x32xi32> + + %dst_part = pto.partition_view %dst_view, + offsets = [%c0, %c0, %c0, %c0, %c0], + sizes = [%c1, %c1, %c1, %c16, %c32] + : !pto.tensor_view<1x1x1x16x32xi32> -> !pto.partition_tensor_view<1x1x1x16x32xi32> + + %src0 = pto.alloc_tile + : !pto.tile_buf + + %src1 = pto.alloc_tile + : !pto.tile_buf + + %dst = pto.alloc_tile + : !pto.tile_buf + + pto.tload ins(%src0_part : !pto.partition_tensor_view<1x1x1x16x32xi32>) + outs(%src0 : !pto.tile_buf) + + pto.tload ins(%src1_part : !pto.partition_tensor_view<1x1x1x1x32xi32>) + outs(%src1 : !pto.tile_buf) + + pto.tcolexpandsub ins(%src0, %src1 : !pto.tile_buf, + !pto.tile_buf) + outs(%dst : !pto.tile_buf) + + pto.tstore ins(%dst : !pto.tile_buf) + outs(%dst_part : !pto.partition_tensor_view<1x1x1x16x32xi32>) + return + } + + // Case 6: int16_16_64_1_64 (int16, src0=(16,64), src1=(1,64), dst=(16,64)) +} diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tcolmax/CMakeLists.txt b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tcolmax/CMakeLists.txt new file mode 100644 index 000000000..0da26a8f7 --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tcolmax/CMakeLists.txt @@ -0,0 +1,9 @@ +# Copyright (c) 2026 Huawei Technologies Co., Ltd. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. 
+# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. + +pto_tilelang_vec_st(tcolmax) diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tcolmax/cases.py b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tcolmax/cases.py new file mode 100644 index 000000000..c7fa27385 --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tcolmax/cases.py @@ -0,0 +1,252 @@ +#!/usr/bin/python3 +# Copyright (c) 2026 Huawei Technologies Co., Ltd. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. + +# coding=utf-8 + +"""Single source of truth for tcolmax ST test cases. + +Each case defines: + - name: case identifier, used as subdirectory name and by main.cpp kCases[]. + - dtype: numpy dtype (e.g. np.float32). + - shape: (rows, cols) — allocated tile dimensions for input. + - valid_shape: (valid_rows, valid_cols) — effective computation region for input. + - dst_shape: (1, cols) — allocated tile dimensions for output. + - dst_valid_shape: (1, valid_cols) — effective computation region for output. + - eps: tolerance for numpy.allclose (atol and rtol). + +gen_data.py and compare.py both import this list to avoid redundant definitions. 
+""" + +import numpy as np + +CASES = [ + { + "name": "f32_1x256", + "dtype": np.float32, + "shape": (1, 256), + "valid_shape": (1, 255), + "dst_shape": (1, 256), + "dst_valid_shape": (1, 255), + "eps": 1e-6, + }, + { + "name": "f32_16x128", + "dtype": np.float32, + "shape": (16, 128), + "valid_shape": (16, 127), + "dst_shape": (1, 128), + "dst_valid_shape": (1, 127), + "eps": 1e-6, + }, + { + "name": "f32_16x256", + "dtype": np.float32, + "shape": (16, 256), + "valid_shape": (15, 255), + "dst_shape": (1, 256), + "dst_valid_shape": (1, 255), + "eps": 1e-6, + }, + { + "name": "f16_1x256", + "dtype": np.float16, + "shape": (1, 256), + "valid_shape": (1, 255), + "dst_shape": (1, 256), + "dst_valid_shape": (1, 255), + "eps": 1e-3, + }, + { + "name": "f16_16x128", + "dtype": np.float16, + "shape": (16, 128), + "valid_shape": (16, 127), + "dst_shape": (1, 128), + "dst_valid_shape": (1, 127), + "eps": 1e-3, + }, + { + "name": "f16_16x256", + "dtype": np.float16, + "shape": (16, 256), + "valid_shape": (15, 255), + "dst_shape": (1, 256), + "dst_valid_shape": (1, 255), + "eps": 1e-3, + }, + { + "name": "i8_1x256", + "dtype": np.int8, + "shape": (1, 256), + "valid_shape": (1, 255), + "dst_shape": (1, 256), + "dst_valid_shape": (1, 255), + "eps": 0, + }, + { + "name": "i8_16x128", + "dtype": np.int8, + "shape": (16, 128), + "valid_shape": (16, 127), + "dst_shape": (1, 128), + "dst_valid_shape": (1, 127), + "eps": 0, + }, + { + "name": "i8_16x256", + "dtype": np.int8, + "shape": (16, 256), + "valid_shape": (15, 255), + "dst_shape": (1, 256), + "dst_valid_shape": (1, 255), + "eps": 0, + }, + { + "name": "i16_1x256", + "dtype": np.int16, + "shape": (1, 256), + "valid_shape": (1, 255), + "dst_shape": (1, 256), + "dst_valid_shape": (1, 255), + "eps": 0, + }, + { + "name": "i16_16x128", + "dtype": np.int16, + "shape": (16, 128), + "valid_shape": (16, 127), + "dst_shape": (1, 128), + "dst_valid_shape": (1, 127), + "eps": 0, + }, + { + "name": "i16_16x256", + "dtype": np.int16, + 
"shape": (16, 256), + "valid_shape": (15, 255), + "dst_shape": (1, 256), + "dst_valid_shape": (1, 255), + "eps": 0, + }, + { + "name": "i32_1x256", + "dtype": np.int32, + "shape": (1, 256), + "valid_shape": (1, 255), + "dst_shape": (1, 256), + "dst_valid_shape": (1, 255), + "eps": 0, + }, + { + "name": "i32_16x128", + "dtype": np.int32, + "shape": (16, 128), + "valid_shape": (16, 127), + "dst_shape": (1, 128), + "dst_valid_shape": (1, 127), + "eps": 0, + }, + { + "name": "i32_16x256", + "dtype": np.int32, + "shape": (16, 256), + "valid_shape": (15, 255), + "dst_shape": (1, 256), + "dst_valid_shape": (1, 255), + "eps": 0, + }, + { + "name": "ui8_1x256", + "dtype": np.uint8, + "shape": (1, 256), + "valid_shape": (1, 255), + "dst_shape": (1, 256), + "dst_valid_shape": (1, 255), + "eps": 0, + }, + { + "name": "ui8_16x128", + "dtype": np.uint8, + "shape": (16, 128), + "valid_shape": (16, 127), + "dst_shape": (1, 128), + "dst_valid_shape": (1, 127), + "eps": 0, + }, + { + "name": "ui8_16x256", + "dtype": np.uint8, + "shape": (16, 256), + "valid_shape": (15, 255), + "dst_shape": (1, 256), + "dst_valid_shape": (1, 255), + "eps": 0, + }, + { + "name": "ui16_1x256", + "dtype": np.uint16, + "shape": (1, 256), + "valid_shape": (1, 255), + "dst_shape": (1, 256), + "dst_valid_shape": (1, 255), + "eps": 0, + }, + { + "name": "ui16_16x128", + "dtype": np.uint16, + "shape": (16, 128), + "valid_shape": (16, 127), + "dst_shape": (1, 128), + "dst_valid_shape": (1, 127), + "eps": 0, + }, + { + "name": "ui16_16x256", + "dtype": np.uint16, + "shape": (16, 256), + "valid_shape": (15, 255), + "dst_shape": (1, 256), + "dst_valid_shape": (1, 255), + "eps": 0, + }, + { + "name": "ui32_1x256", + "dtype": np.uint32, + "shape": (1, 256), + "valid_shape": (1, 255), + "dst_shape": (1, 256), + "dst_valid_shape": (1, 255), + "eps": 0, + }, + { + "name": "ui32_16x128", + "dtype": np.uint32, + "shape": (16, 128), + "valid_shape": (16, 127), + "dst_shape": (1, 128), + "dst_valid_shape": (1, 127), + 
"eps": 0, + }, + { + "name": "ui32_16x256", + "dtype": np.uint32, + "shape": (16, 256), + "valid_shape": (15, 255), + "dst_shape": (1, 256), + "dst_valid_shape": (1, 255), + "eps": 0, + }, +] + +_SMOKE_CASE_NAMES = ['f32_1x256', 'f16_1x256'] +_SMOKE_CASE_NAME_SET = set(_SMOKE_CASE_NAMES) +_missing = [name for name in _SMOKE_CASE_NAMES if name not in {case["name"] for case in CASES}] +if _missing: + raise RuntimeError("unknown smoke case(s): " + ", ".join(_missing)) +CASES = [case for case in CASES if case["name"] in _SMOKE_CASE_NAME_SET] diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tcolmax/compare.py b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tcolmax/compare.py new file mode 100644 index 000000000..040ae0bb8 --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tcolmax/compare.py @@ -0,0 +1,50 @@ +#!/usr/bin/python3 +# Copyright (c) 2026 Huawei Technologies Co., Ltd. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. 
+ +# coding=utf-8 + +import os +import sys +import numpy as np + +from cases import CASES +from st_common import result_cmp, style_fail, style_pass, validate_cases + + +def main(): + validate_cases(CASES) + case_filter = sys.argv[1] if len(sys.argv) > 1 else None + + all_passed = True + for case in CASES: + if case_filter is not None and case["name"] != case_filter: + continue + + case_dir = case["name"] + dst_shape = case["dst_shape"] + dst_valid_shape = case["dst_valid_shape"] + vr, vc = dst_valid_shape + + golden = np.fromfile(os.path.join(case_dir, "golden.bin"), dtype=case["dtype"]).reshape(dst_shape) + output = np.fromfile(os.path.join(case_dir, "output.bin"), dtype=case["dtype"]).reshape(dst_shape) + + ok = result_cmp(golden[:vr, :vc], output[:vr, :vc], case["eps"]) + if ok: + print(style_pass(f"[INFO] {case['name']}: compare passed")) + else: + print(style_fail(f"[ERROR] {case['name']}: compare failed")) + all_passed = False + + if not all_passed: + sys.exit(2) + print(style_pass("[INFO] all cases passed")) + + +if __name__ == "__main__": + main() diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tcolmax/gen_data.py b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tcolmax/gen_data.py new file mode 100644 index 000000000..3fd1c9cbc --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tcolmax/gen_data.py @@ -0,0 +1,35 @@ +#!/usr/bin/python3 +# Copyright (c) 2026 Huawei Technologies Co., Ltd. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. 
+# See LICENSE in the root of the software repository for the full text of the License. + +# coding=utf-8 + +import numpy as np +from cases import CASES +from st_common import validate_cases, setup_case_rng, save_case_data + +validate_cases(CASES) + +for case in CASES: + setup_case_rng(case) + + dtype = case["dtype"] + shape = case["shape"] + valid_shape = case["valid_shape"] + dst_shape = case["dst_shape"] + dst_valid_shape = case["dst_valid_shape"] + + input1 = np.random.randint(1, 10, size=shape).astype(dtype) + + vr, vc = valid_shape + golden = np.zeros(dst_shape, dtype=dtype) + golden_result = np.max(input1[:vr, :vc], axis=0, keepdims=True).astype(dtype) + golden[:1, :vc] = golden_result + + save_case_data(case["name"], {"input": input1, "golden": golden}) + print(f"[INFO] gen_data: {case['name']} shape={shape} valid_shape={valid_shape} dst_shape={dst_shape} dtype={dtype.__name__}") diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tcolmax/launch.cpp b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tcolmax/launch.cpp new file mode 100644 index 000000000..4570c1710 --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tcolmax/launch.cpp @@ -0,0 +1,28 @@ +// Copyright (c) 2026 Huawei Technologies Co., Ltd. +// This program is free software, you can redistribute it and/or modify it under the terms and conditions of +// CANN Open Software License Agreement Version 2.0 (the "License"). +// Please refer to the License for details. You may not use this file except in compliance with the License. +// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +// See LICENSE in the root of the software repository for the full text of the License. 
+ +#include + +#ifndef AICORE +#define AICORE [aicore] +#endif + +// Case 0: f32 1x256 (input: 1x256, output: 1x256) + +extern "C" __global__ AICORE void TCOLMAX_f32_1x256(__gm__ float *dst, __gm__ float *src); +extern "C" __global__ AICORE void TCOLMAX_f16_1x256(__gm__ half *dst, __gm__ half *src); + +void LaunchTCOLMAX_f32_1x256(float *dst, float *src, void *stream) { + TCOLMAX_f32_1x256<<<1, nullptr, stream>>>((__gm__ float *)dst, (__gm__ float *)src); +} + + + +void LaunchTCOLMAX_f16_1x256(void *dst, void *src, void *stream) { + TCOLMAX_f16_1x256<<<1, nullptr, stream>>>((__gm__ half *)dst, (__gm__ half *)src); +} diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tcolmax/main.cpp b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tcolmax/main.cpp new file mode 100644 index 000000000..95765be9d --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tcolmax/main.cpp @@ -0,0 +1,156 @@ +// Copyright (c) 2026 Huawei Technologies Co., Ltd. +// This program is free software, you can redistribute it and/or modify it under the terms and conditions of +// CANN Open Software License Agreement Version 2.0 (the "License"). +// Please refer to the License for details. You may not use this file except in compliance with the License. +// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +// See LICENSE in the root of the software repository for the full text of the License. + +// Host driver for TileLang tcolmax ST — case-table driven. +// Each case launches a different kernel variant, reads/writes from per-case subdirectory. +// Numerical comparison is done externally by compare.py. 
+ +#include "acl/acl.h" +#include "test_common.h" +#include +#include +#include +#include +#include +#include + +using namespace PtoTestCommon; + +// Kernel launch wrappers (defined in launch.cpp) +void LaunchTCOLMAX_f32_1x256(float *dst, float *src, void *stream); +void LaunchTCOLMAX_f32_16x128(float *dst, float *src, void *stream); +void LaunchTCOLMAX_f16_1x256(void *dst, void *src, void *stream); +void LaunchTCOLMAX_f16_16x256(void *dst, void *src, void *stream); +void LaunchTCOLMAX_i8_16x128(void *dst, void *src, void *stream); +void LaunchTCOLMAX_i16_1x256(void *dst, void *src, void *stream); +void LaunchTCOLMAX_i16_16x256(void *dst, void *src, void *stream); +void LaunchTCOLMAX_i32_16x128(void *dst, void *src, void *stream); +void LaunchTCOLMAX_ui8_1x256(void *dst, void *src, void *stream); +void LaunchTCOLMAX_ui8_16x256(void *dst, void *src, void *stream); +void LaunchTCOLMAX_ui16_16x128(void *dst, void *src, void *stream); +void LaunchTCOLMAX_ui32_1x256(void *dst, void *src, void *stream); +void LaunchTCOLMAX_ui32_16x256(void *dst, void *src, void *stream); + +using LaunchFnFloat = void (*)(float *, float *, void *); +using LaunchFnVoid = void (*)(void *, void *, void *); + +struct TestCase { + const char *name; + void *launch; + size_t srcRows; + size_t srcCols; + size_t srcValidRows; + size_t srcValidCols; + size_t dstRows; + size_t dstCols; + size_t dstValidCols; + size_t elemSize; + bool isFp16; +}; + +static const TestCase kCases[] = { +{"f32_1x256", (void*)LaunchTCOLMAX_f32_1x256, 1, 256, 1, 255, 1, 256, 255, sizeof(float), false}, +{"f16_1x256", (void*)LaunchTCOLMAX_f16_1x256, 1, 256, 1, 255, 1, 256, 255, 2, true}, +}; +static constexpr size_t kNumCases = sizeof(kCases) / sizeof(kCases[0]); + +static int RunCase(const TestCase &tc, int deviceId, aclrtStream stream) { + int rc = 0; + const size_t srcElemCount = tc.srcRows * tc.srcCols; + const size_t srcFileSize = srcElemCount * tc.elemSize; + const size_t dstElemCount = tc.dstRows * tc.dstCols; + 
const size_t dstFileSize = dstElemCount * tc.elemSize; + + std::printf("[INFO] === case: %s (src=%zux%zu, dst=%zux%zu, fp16=%d) ===\n", + tc.name, tc.srcRows, tc.srcCols, tc.dstRows, tc.dstCols, tc.isFp16); + + std::string caseDir = std::string("./") + tc.name; + size_t srcFileSizeVar = srcFileSize; + size_t dstFileSizeVar = dstFileSize; + + void *srcHost = nullptr, *dstHost = nullptr; + void *srcDevice = nullptr, *dstDevice = nullptr; + + aclrtMallocHost(&srcHost, srcFileSize); + aclrtMallocHost(&dstHost, dstFileSize); + + aclrtMalloc(&srcDevice, srcFileSize, ACL_MEM_MALLOC_HUGE_FIRST); + aclrtMalloc(&dstDevice, dstFileSize, ACL_MEM_MALLOC_HUGE_FIRST); + + if (!ReadFile((caseDir + "/input.bin").c_str(), srcFileSizeVar, srcHost, srcFileSize)) { + std::fprintf(stderr, "[ERROR] failed to read %s/input.bin\n", caseDir.c_str()); + rc = 1; + } + + if (rc == 0) { + aclrtMemcpy(srcDevice, srcFileSize, srcHost, srcFileSize, ACL_MEMCPY_HOST_TO_DEVICE); + + if (tc.isFp16) { + LaunchFnVoid launch = (LaunchFnVoid)tc.launch; + launch(dstDevice, srcDevice, stream); + } else { + LaunchFnFloat launch = (LaunchFnFloat)tc.launch; + launch((float*)dstDevice, (float*)srcDevice, stream); + } + + aclrtSynchronizeStream(stream); + aclrtMemcpy(dstHost, dstFileSize, dstDevice, dstFileSize, ACL_MEMCPY_DEVICE_TO_HOST); + } + + if (rc == 0 && !WriteFile((caseDir + "/output.bin").c_str(), dstHost, dstFileSizeVar)) { + std::fprintf(stderr, "[ERROR] failed to write %s/output.bin\n", caseDir.c_str()); + rc = 1; + } + + if (srcDevice != nullptr) + aclrtFree(srcDevice); + if (dstDevice != nullptr) + aclrtFree(dstDevice); + if (srcHost != nullptr) + aclrtFreeHost(srcHost); + if (dstHost != nullptr) + aclrtFreeHost(dstHost); + + if (rc == 0) + std::printf("[INFO] case %s done\n", tc.name); + return rc; +} + +int main(int argc, char *argv[]) { + const char *caseFilter = (argc > 1) ? 
argv[1] : nullptr; + + int rc = 0; + int deviceId = 0; + aclrtStream stream = nullptr; + + aclInit(nullptr); + if (const char *envDevice = std::getenv("ACL_DEVICE_ID")) { + deviceId = std::atoi(envDevice); + } + aclrtSetDevice(deviceId); + aclrtCreateStream(&stream); + + for (size_t i = 0; i < kNumCases; ++i) { + if (caseFilter != nullptr && std::strcmp(kCases[i].name, caseFilter) != 0) { + continue; + } + int ret = RunCase(kCases[i], deviceId, stream); + if (ret != 0) { + std::fprintf(stderr, "[ERROR] case %s failed\n", kCases[i].name); + rc = 1; + break; + } + } + + if (stream != nullptr) + aclrtDestroyStream(stream); + aclrtResetDevice(deviceId); + aclFinalize(); + + return rc; +} diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tcolmax/tcolmax.pto b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tcolmax/tcolmax.pto new file mode 100644 index 000000000..bcfcc647a --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tcolmax/tcolmax.pto @@ -0,0 +1,99 @@ +// Copyright (c) 2026 Huawei Technologies Co., Ltd. +// This program is free software, you can redistribute it and/or modify it under the terms and conditions of +// CANN Open Software License Agreement Version 2.0 (the "License"). +// Please refer to the License for details. You may not use this file except in compliance with the License. +// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +// See LICENSE in the root of the software repository for the full text of the License. + +// TileLang ST kernels for pto.tcolmax: tload(src) + tcolmax(dst, src) + tstore(dst). +// Multiple cases with different shapes in a single module. +// Compiled by ptoas --enable-insert-sync --enable-tile-op-expand --pto-backend=vpto +// to produce a fatobj object. 
+ +module attributes {pto.target_arch = "a5", pto.kernel_kind = #pto.kernel_kind} { + // Case 0: f32 1x256 (input: 1x256, output: 1x256) + func.func @TCOLMAX_f32_1x256(%dst_ptr: !pto.ptr, %src_ptr: !pto.ptr) attributes {pto.kernel} { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c256 = arith.constant 256 : index + %c255 = arith.constant 255 : index + + %src_view = pto.make_tensor_view %src_ptr, + shape = [%c1, %c1, %c1, %c1, %c256], + strides = [%c256, %c256, %c256, %c256, %c1] + : !pto.tensor_view<1x1x1x1x256xf32> + %dst_view = pto.make_tensor_view %dst_ptr, + shape = [%c1, %c1, %c1, %c1, %c256], + strides = [%c256, %c256, %c256, %c256, %c1] + : !pto.tensor_view<1x1x1x1x256xf32> + + %src_part = pto.partition_view %src_view, + offsets = [%c0, %c0, %c0, %c0, %c0], + sizes = [%c1, %c1, %c1, %c1, %c255] + : !pto.tensor_view<1x1x1x1x256xf32> -> !pto.partition_tensor_view<1x1x1x1x255xf32> + %dst_part = pto.partition_view %dst_view, + offsets = [%c0, %c0, %c0, %c0, %c0], + sizes = [%c1, %c1, %c1, %c1, %c255] + : !pto.tensor_view<1x1x1x1x256xf32> -> !pto.partition_tensor_view<1x1x1x1x255xf32> + + %src = pto.alloc_tile + : !pto.tile_buf + %dst = pto.alloc_tile + : !pto.tile_buf + + pto.tload ins(%src_part : !pto.partition_tensor_view<1x1x1x1x255xf32>) + outs(%src : !pto.tile_buf) + + pto.tcolmax ins(%src : !pto.tile_buf) + outs(%dst : !pto.tile_buf) + + pto.tstore ins(%dst : !pto.tile_buf) + outs(%dst_part : !pto.partition_tensor_view<1x1x1x1x255xf32>) + return + } + +// Case 1: f32 16x128 (input: 16x128, output: 1x128) + + func.func @TCOLMAX_f16_1x256(%dst_ptr: !pto.ptr, %src_ptr: !pto.ptr) attributes {pto.kernel} { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c256 = arith.constant 256 : index + %c255 = arith.constant 255 : index + + %src_view = pto.make_tensor_view %src_ptr, + shape = [%c1, %c1, %c1, %c1, %c256], + strides = [%c256, %c256, %c256, %c256, %c1] + : !pto.tensor_view<1x1x1x1x256xf16> + %dst_view = 
pto.make_tensor_view %dst_ptr, + shape = [%c1, %c1, %c1, %c1, %c256], + strides = [%c256, %c256, %c256, %c256, %c1] + : !pto.tensor_view<1x1x1x1x256xf16> + + %src_part = pto.partition_view %src_view, + offsets = [%c0, %c0, %c0, %c0, %c0], + sizes = [%c1, %c1, %c1, %c1, %c255] + : !pto.tensor_view<1x1x1x1x256xf16> -> !pto.partition_tensor_view<1x1x1x1x255xf16> + %dst_part = pto.partition_view %dst_view, + offsets = [%c0, %c0, %c0, %c0, %c0], + sizes = [%c1, %c1, %c1, %c1, %c255] + : !pto.tensor_view<1x1x1x1x256xf16> -> !pto.partition_tensor_view<1x1x1x1x255xf16> + + %src = pto.alloc_tile + : !pto.tile_buf + %dst = pto.alloc_tile + : !pto.tile_buf + + pto.tload ins(%src_part : !pto.partition_tensor_view<1x1x1x1x255xf16>) + outs(%src : !pto.tile_buf) + + pto.tcolmax ins(%src : !pto.tile_buf) + outs(%dst : !pto.tile_buf) + + pto.tstore ins(%dst : !pto.tile_buf) + outs(%dst_part : !pto.partition_tensor_view<1x1x1x1x255xf16>) + return + } + + // Case 4: f16 16x128 (input: 16x128, output: 1x128) +} diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tcolmin/CMakeLists.txt b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tcolmin/CMakeLists.txt new file mode 100644 index 000000000..26f50aa26 --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tcolmin/CMakeLists.txt @@ -0,0 +1,9 @@ +# Copyright (c) 2026 Huawei Technologies Co., Ltd. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. 
+ +pto_tilelang_vec_st(tcolmin) diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tcolmin/cases.py b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tcolmin/cases.py new file mode 100644 index 000000000..8ae355d53 --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tcolmin/cases.py @@ -0,0 +1,252 @@ +#!/usr/bin/python3 +# Copyright (c) 2026 Huawei Technologies Co., Ltd. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. + +# coding=utf-8 + +"""Single source of truth for tcolmin ST test cases. + +Each case defines: + - name: case identifier, used as subdirectory name and by main.cpp kCases[]. + - dtype: numpy dtype (e.g. np.float32). + - shape: (rows, cols) — allocated tile dimensions for input. + - valid_shape: (valid_rows, valid_cols) — effective computation region for input. + - dst_shape: (1, cols) — allocated tile dimensions for output. + - dst_valid_shape: (1, valid_cols) — effective computation region for output. + - eps: tolerance for numpy.allclose (atol and rtol). + +gen_data.py and compare.py both import this list to avoid redundant definitions. 
+""" + +import numpy as np + +CASES = [ + { + "name": "f32_1x256", + "dtype": np.float32, + "shape": (1, 256), + "valid_shape": (1, 255), + "dst_shape": (1, 256), + "dst_valid_shape": (1, 255), + "eps": 1e-6, + }, + { + "name": "f32_16x128", + "dtype": np.float32, + "shape": (16, 128), + "valid_shape": (16, 127), + "dst_shape": (1, 128), + "dst_valid_shape": (1, 127), + "eps": 1e-6, + }, + { + "name": "f32_16x256", + "dtype": np.float32, + "shape": (16, 256), + "valid_shape": (15, 255), + "dst_shape": (1, 256), + "dst_valid_shape": (1, 255), + "eps": 1e-6, + }, + { + "name": "f16_1x256", + "dtype": np.float16, + "shape": (1, 256), + "valid_shape": (1, 255), + "dst_shape": (1, 256), + "dst_valid_shape": (1, 255), + "eps": 1e-3, + }, + { + "name": "f16_16x128", + "dtype": np.float16, + "shape": (16, 128), + "valid_shape": (16, 127), + "dst_shape": (1, 128), + "dst_valid_shape": (1, 127), + "eps": 1e-3, + }, + { + "name": "f16_16x256", + "dtype": np.float16, + "shape": (16, 256), + "valid_shape": (15, 255), + "dst_shape": (1, 256), + "dst_valid_shape": (1, 255), + "eps": 1e-3, + }, + { + "name": "i8_1x256", + "dtype": np.int8, + "shape": (1, 256), + "valid_shape": (1, 255), + "dst_shape": (1, 256), + "dst_valid_shape": (1, 255), + "eps": 0, + }, + { + "name": "i8_16x128", + "dtype": np.int8, + "shape": (16, 128), + "valid_shape": (16, 127), + "dst_shape": (1, 128), + "dst_valid_shape": (1, 127), + "eps": 0, + }, + { + "name": "i8_16x256", + "dtype": np.int8, + "shape": (16, 256), + "valid_shape": (15, 255), + "dst_shape": (1, 256), + "dst_valid_shape": (1, 255), + "eps": 0, + }, + { + "name": "i16_1x256", + "dtype": np.int16, + "shape": (1, 256), + "valid_shape": (1, 255), + "dst_shape": (1, 256), + "dst_valid_shape": (1, 255), + "eps": 0, + }, + { + "name": "i16_16x128", + "dtype": np.int16, + "shape": (16, 128), + "valid_shape": (16, 127), + "dst_shape": (1, 128), + "dst_valid_shape": (1, 127), + "eps": 0, + }, + { + "name": "i16_16x256", + "dtype": np.int16, + 
"shape": (16, 256), + "valid_shape": (15, 255), + "dst_shape": (1, 256), + "dst_valid_shape": (1, 255), + "eps": 0, + }, + { + "name": "i32_1x256", + "dtype": np.int32, + "shape": (1, 256), + "valid_shape": (1, 255), + "dst_shape": (1, 256), + "dst_valid_shape": (1, 255), + "eps": 0, + }, + { + "name": "i32_16x128", + "dtype": np.int32, + "shape": (16, 128), + "valid_shape": (16, 127), + "dst_shape": (1, 128), + "dst_valid_shape": (1, 127), + "eps": 0, + }, + { + "name": "i32_16x256", + "dtype": np.int32, + "shape": (16, 256), + "valid_shape": (15, 255), + "dst_shape": (1, 256), + "dst_valid_shape": (1, 255), + "eps": 0, + }, + { + "name": "ui8_1x256", + "dtype": np.uint8, + "shape": (1, 256), + "valid_shape": (1, 255), + "dst_shape": (1, 256), + "dst_valid_shape": (1, 255), + "eps": 0, + }, + { + "name": "ui8_16x128", + "dtype": np.uint8, + "shape": (16, 128), + "valid_shape": (16, 127), + "dst_shape": (1, 128), + "dst_valid_shape": (1, 127), + "eps": 0, + }, + { + "name": "ui8_16x256", + "dtype": np.uint8, + "shape": (16, 256), + "valid_shape": (15, 255), + "dst_shape": (1, 256), + "dst_valid_shape": (1, 255), + "eps": 0, + }, + { + "name": "ui16_1x256", + "dtype": np.uint16, + "shape": (1, 256), + "valid_shape": (1, 255), + "dst_shape": (1, 256), + "dst_valid_shape": (1, 255), + "eps": 0, + }, + { + "name": "ui16_16x128", + "dtype": np.uint16, + "shape": (16, 128), + "valid_shape": (16, 127), + "dst_shape": (1, 128), + "dst_valid_shape": (1, 127), + "eps": 0, + }, + { + "name": "ui16_16x256", + "dtype": np.uint16, + "shape": (16, 256), + "valid_shape": (15, 255), + "dst_shape": (1, 256), + "dst_valid_shape": (1, 255), + "eps": 0, + }, + { + "name": "ui32_1x256", + "dtype": np.uint32, + "shape": (1, 256), + "valid_shape": (1, 255), + "dst_shape": (1, 256), + "dst_valid_shape": (1, 255), + "eps": 0, + }, + { + "name": "ui32_16x128", + "dtype": np.uint32, + "shape": (16, 128), + "valid_shape": (16, 127), + "dst_shape": (1, 128), + "dst_valid_shape": (1, 127), + 
"eps": 0, + }, + { + "name": "ui32_16x256", + "dtype": np.uint32, + "shape": (16, 256), + "valid_shape": (15, 255), + "dst_shape": (1, 256), + "dst_valid_shape": (1, 255), + "eps": 0, + }, +] + +_SMOKE_CASE_NAMES = ['f32_1x256', 'f16_1x256'] +_SMOKE_CASE_NAME_SET = set(_SMOKE_CASE_NAMES) +_missing = [name for name in _SMOKE_CASE_NAMES if name not in {case["name"] for case in CASES}] +if _missing: + raise RuntimeError("unknown smoke case(s): " + ", ".join(_missing)) +CASES = [case for case in CASES if case["name"] in _SMOKE_CASE_NAME_SET] diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tcolmin/compare.py b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tcolmin/compare.py new file mode 100644 index 000000000..040ae0bb8 --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tcolmin/compare.py @@ -0,0 +1,50 @@ +#!/usr/bin/python3 +# Copyright (c) 2026 Huawei Technologies Co., Ltd. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. 
+ +# coding=utf-8 + +import os +import sys +import numpy as np + +from cases import CASES +from st_common import result_cmp, style_fail, style_pass, validate_cases + + +def main(): + validate_cases(CASES) + case_filter = sys.argv[1] if len(sys.argv) > 1 else None + + all_passed = True + for case in CASES: + if case_filter is not None and case["name"] != case_filter: + continue + + case_dir = case["name"] + dst_shape = case["dst_shape"] + dst_valid_shape = case["dst_valid_shape"] + vr, vc = dst_valid_shape + + golden = np.fromfile(os.path.join(case_dir, "golden.bin"), dtype=case["dtype"]).reshape(dst_shape) + output = np.fromfile(os.path.join(case_dir, "output.bin"), dtype=case["dtype"]).reshape(dst_shape) + + ok = result_cmp(golden[:vr, :vc], output[:vr, :vc], case["eps"]) + if ok: + print(style_pass(f"[INFO] {case['name']}: compare passed")) + else: + print(style_fail(f"[ERROR] {case['name']}: compare failed")) + all_passed = False + + if not all_passed: + sys.exit(2) + print(style_pass("[INFO] all cases passed")) + + +if __name__ == "__main__": + main() diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tcolmin/gen_data.py b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tcolmin/gen_data.py new file mode 100644 index 000000000..16b9a7a4b --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tcolmin/gen_data.py @@ -0,0 +1,35 @@ +#!/usr/bin/python3 +# Copyright (c) 2026 Huawei Technologies Co., Ltd. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. 
+# See LICENSE in the root of the software repository for the full text of the License. + +# coding=utf-8 + +import numpy as np +from cases import CASES +from st_common import validate_cases, setup_case_rng, save_case_data + +validate_cases(CASES) + +for case in CASES: + setup_case_rng(case) + + dtype = case["dtype"] + shape = case["shape"] + valid_shape = case["valid_shape"] + dst_shape = case["dst_shape"] + dst_valid_shape = case["dst_valid_shape"] + + input1 = np.random.randint(1, 10, size=shape).astype(dtype) + + vr, vc = valid_shape + golden = np.zeros(dst_shape, dtype=dtype) + golden_result = np.min(input1[:vr, :vc], axis=0, keepdims=True).astype(dtype) + golden[:1, :vc] = golden_result + + save_case_data(case["name"], {"input": input1, "golden": golden}) + print(f"[INFO] gen_data: {case['name']} shape={shape} valid_shape={valid_shape} dst_shape={dst_shape} dtype={dtype.__name__}") diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tcolmin/launch.cpp b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tcolmin/launch.cpp new file mode 100644 index 000000000..e671a8ac6 --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tcolmin/launch.cpp @@ -0,0 +1,28 @@ +// Copyright (c) 2026 Huawei Technologies Co., Ltd. +// This program is free software, you can redistribute it and/or modify it under the terms and conditions of +// CANN Open Software License Agreement Version 2.0 (the "License"). +// Please refer to the License for details. You may not use this file except in compliance with the License. +// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +// See LICENSE in the root of the software repository for the full text of the License. 
+ +#include + +#ifndef AICORE +#define AICORE [aicore] +#endif + +// Case 0: f32 1x256 (input: 1x256, output: 1x256) + +extern "C" __global__ AICORE void TCOLMIN_f32_1x256(__gm__ float *dst, __gm__ float *src); +extern "C" __global__ AICORE void TCOLMIN_f16_1x256(__gm__ half *dst, __gm__ half *src); + +void LaunchTCOLMIN_f32_1x256(float *dst, float *src, void *stream) { + TCOLMIN_f32_1x256<<<1, nullptr, stream>>>((__gm__ float *)dst, (__gm__ float *)src); +} + + + +void LaunchTCOLMIN_f16_1x256(void *dst, void *src, void *stream) { + TCOLMIN_f16_1x256<<<1, nullptr, stream>>>((__gm__ half *)dst, (__gm__ half *)src); +} diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tcolmin/main.cpp b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tcolmin/main.cpp new file mode 100644 index 000000000..15f336405 --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tcolmin/main.cpp @@ -0,0 +1,156 @@ +// Copyright (c) 2026 Huawei Technologies Co., Ltd. +// This program is free software, you can redistribute it and/or modify it under the terms and conditions of +// CANN Open Software License Agreement Version 2.0 (the "License"). +// Please refer to the License for details. You may not use this file except in compliance with the License. +// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +// See LICENSE in the root of the software repository for the full text of the License. + +// Host driver for TileLang tcolmin ST — case-table driven. +// Each case launches a different kernel variant, reads/writes from per-case subdirectory. +// Numerical comparison is done externally by compare.py. 
+ +#include "acl/acl.h" +#include "test_common.h" +#include +#include +#include +#include +#include +#include + +using namespace PtoTestCommon; + +// Kernel launch wrappers (defined in launch.cpp) +void LaunchTCOLMIN_f32_1x256(float *dst, float *src, void *stream); +void LaunchTCOLMIN_f32_16x128(float *dst, float *src, void *stream); +void LaunchTCOLMIN_f16_1x256(void *dst, void *src, void *stream); +void LaunchTCOLMIN_f16_16x256(void *dst, void *src, void *stream); +void LaunchTCOLMIN_i8_16x128(void *dst, void *src, void *stream); +void LaunchTCOLMIN_i16_1x256(void *dst, void *src, void *stream); +void LaunchTCOLMIN_i16_16x256(void *dst, void *src, void *stream); +void LaunchTCOLMIN_i32_16x128(void *dst, void *src, void *stream); +void LaunchTCOLMIN_ui8_1x256(void *dst, void *src, void *stream); +void LaunchTCOLMIN_ui8_16x256(void *dst, void *src, void *stream); +void LaunchTCOLMIN_ui16_16x128(void *dst, void *src, void *stream); +void LaunchTCOLMIN_ui32_1x256(void *dst, void *src, void *stream); +void LaunchTCOLMIN_ui32_16x256(void *dst, void *src, void *stream); + +using LaunchFnFloat = void (*)(float *, float *, void *); +using LaunchFnVoid = void (*)(void *, void *, void *); + +struct TestCase { + const char *name; + void *launch; + size_t srcRows; + size_t srcCols; + size_t srcValidRows; + size_t srcValidCols; + size_t dstRows; + size_t dstCols; + size_t dstValidCols; + size_t elemSize; + bool isFp16; +}; + +static const TestCase kCases[] = { +{"f32_1x256", (void*)LaunchTCOLMIN_f32_1x256, 1, 256, 1, 255, 1, 256, 255, sizeof(float), false}, +{"f16_1x256", (void*)LaunchTCOLMIN_f16_1x256, 1, 256, 1, 255, 1, 256, 255, 2, true}, +}; +static constexpr size_t kNumCases = sizeof(kCases) / sizeof(kCases[0]); + +static int RunCase(const TestCase &tc, int deviceId, aclrtStream stream) { + int rc = 0; + const size_t srcElemCount = tc.srcRows * tc.srcCols; + const size_t srcFileSize = srcElemCount * tc.elemSize; + const size_t dstElemCount = tc.dstRows * tc.dstCols; + 
const size_t dstFileSize = dstElemCount * tc.elemSize; + + std::printf("[INFO] === case: %s (src=%zux%zu, dst=%zux%zu, fp16=%d) ===\n", + tc.name, tc.srcRows, tc.srcCols, tc.dstRows, tc.dstCols, tc.isFp16); + + std::string caseDir = std::string("./") + tc.name; + size_t srcFileSizeVar = srcFileSize; + size_t dstFileSizeVar = dstFileSize; + + void *srcHost = nullptr, *dstHost = nullptr; + void *srcDevice = nullptr, *dstDevice = nullptr; + + aclrtMallocHost(&srcHost, srcFileSize); + aclrtMallocHost(&dstHost, dstFileSize); + + aclrtMalloc(&srcDevice, srcFileSize, ACL_MEM_MALLOC_HUGE_FIRST); + aclrtMalloc(&dstDevice, dstFileSize, ACL_MEM_MALLOC_HUGE_FIRST); + + if (!ReadFile((caseDir + "/input.bin").c_str(), srcFileSizeVar, srcHost, srcFileSize)) { + std::fprintf(stderr, "[ERROR] failed to read %s/input.bin\n", caseDir.c_str()); + rc = 1; + } + + if (rc == 0) { + aclrtMemcpy(srcDevice, srcFileSize, srcHost, srcFileSize, ACL_MEMCPY_HOST_TO_DEVICE); + + if (tc.isFp16) { + LaunchFnVoid launch = (LaunchFnVoid)tc.launch; + launch(dstDevice, srcDevice, stream); + } else { + LaunchFnFloat launch = (LaunchFnFloat)tc.launch; + launch((float*)dstDevice, (float*)srcDevice, stream); + } + + aclrtSynchronizeStream(stream); + aclrtMemcpy(dstHost, dstFileSize, dstDevice, dstFileSize, ACL_MEMCPY_DEVICE_TO_HOST); + } + + if (rc == 0 && !WriteFile((caseDir + "/output.bin").c_str(), dstHost, dstFileSizeVar)) { + std::fprintf(stderr, "[ERROR] failed to write %s/output.bin\n", caseDir.c_str()); + rc = 1; + } + + if (srcDevice != nullptr) + aclrtFree(srcDevice); + if (dstDevice != nullptr) + aclrtFree(dstDevice); + if (srcHost != nullptr) + aclrtFreeHost(srcHost); + if (dstHost != nullptr) + aclrtFreeHost(dstHost); + + if (rc == 0) + std::printf("[INFO] case %s done\n", tc.name); + return rc; +} + +int main(int argc, char *argv[]) { + const char *caseFilter = (argc > 1) ? 
argv[1] : nullptr; + + int rc = 0; + int deviceId = 0; + aclrtStream stream = nullptr; + + aclInit(nullptr); + if (const char *envDevice = std::getenv("ACL_DEVICE_ID")) { + deviceId = std::atoi(envDevice); + } + aclrtSetDevice(deviceId); + aclrtCreateStream(&stream); + + for (size_t i = 0; i < kNumCases; ++i) { + if (caseFilter != nullptr && std::strcmp(kCases[i].name, caseFilter) != 0) { + continue; + } + int ret = RunCase(kCases[i], deviceId, stream); + if (ret != 0) { + std::fprintf(stderr, "[ERROR] case %s failed\n", kCases[i].name); + rc = 1; + break; + } + } + + if (stream != nullptr) + aclrtDestroyStream(stream); + aclrtResetDevice(deviceId); + aclFinalize(); + + return rc; +} diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tcolmin/tcolmin.pto b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tcolmin/tcolmin.pto new file mode 100644 index 000000000..191e3dcd5 --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tcolmin/tcolmin.pto @@ -0,0 +1,99 @@ +// Copyright (c) 2026 Huawei Technologies Co., Ltd. +// This program is free software, you can redistribute it and/or modify it under the terms and conditions of +// CANN Open Software License Agreement Version 2.0 (the "License"). +// Please refer to the License for details. You may not use this file except in compliance with the License. +// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +// See LICENSE in the root of the software repository for the full text of the License. + +// TileLang ST kernels for pto.tcolmin: tload(src) + tcolmin(dst, src) + tstore(dst). +// Multiple cases with different shapes in a single module. +// Compiled by ptoas --enable-insert-sync --enable-tile-op-expand --pto-backend=vpto +// to produce a fatobj object. 
+ +module attributes {pto.target_arch = "a5", pto.kernel_kind = #pto.kernel_kind} { + // Case 0: f32 1x256 (input: 1x256, output: 1x256) + func.func @TCOLMIN_f32_1x256(%dst_ptr: !pto.ptr, %src_ptr: !pto.ptr) attributes {pto.kernel} { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c256 = arith.constant 256 : index + %c255 = arith.constant 255 : index + + %src_view = pto.make_tensor_view %src_ptr, + shape = [%c1, %c1, %c1, %c1, %c256], + strides = [%c256, %c256, %c256, %c256, %c1] + : !pto.tensor_view<1x1x1x1x256xf32> + %dst_view = pto.make_tensor_view %dst_ptr, + shape = [%c1, %c1, %c1, %c1, %c256], + strides = [%c256, %c256, %c256, %c256, %c1] + : !pto.tensor_view<1x1x1x1x256xf32> + + %src_part = pto.partition_view %src_view, + offsets = [%c0, %c0, %c0, %c0, %c0], + sizes = [%c1, %c1, %c1, %c1, %c255] + : !pto.tensor_view<1x1x1x1x256xf32> -> !pto.partition_tensor_view<1x1x1x1x255xf32> + %dst_part = pto.partition_view %dst_view, + offsets = [%c0, %c0, %c0, %c0, %c0], + sizes = [%c1, %c1, %c1, %c1, %c255] + : !pto.tensor_view<1x1x1x1x256xf32> -> !pto.partition_tensor_view<1x1x1x1x255xf32> + + %src = pto.alloc_tile + : !pto.tile_buf + %dst = pto.alloc_tile + : !pto.tile_buf + + pto.tload ins(%src_part : !pto.partition_tensor_view<1x1x1x1x255xf32>) + outs(%src : !pto.tile_buf) + + pto.tcolmin ins(%src : !pto.tile_buf) + outs(%dst : !pto.tile_buf) + + pto.tstore ins(%dst : !pto.tile_buf) + outs(%dst_part : !pto.partition_tensor_view<1x1x1x1x255xf32>) + return + } + + // Case 1: f16 1x256 (input: 1x256, output: 1x256) + + func.func @TCOLMIN_f16_1x256(%dst_ptr: !pto.ptr, %src_ptr: !pto.ptr) attributes {pto.kernel} { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c256 = arith.constant 256 : index + %c255 = arith.constant 255 : index + + %src_view = pto.make_tensor_view %src_ptr, + shape = [%c1, %c1, %c1, %c1, %c256], + strides = [%c256, %c256, %c256, %c256, %c1] + : !pto.tensor_view<1x1x1x1x256xf16> + %dst_view = 
pto.make_tensor_view %dst_ptr, + shape = [%c1, %c1, %c1, %c1, %c256], + strides = [%c256, %c256, %c256, %c256, %c1] + : !pto.tensor_view<1x1x1x1x256xf16> + + %src_part = pto.partition_view %src_view, + offsets = [%c0, %c0, %c0, %c0, %c0], + sizes = [%c1, %c1, %c1, %c1, %c255] + : !pto.tensor_view<1x1x1x1x256xf16> -> !pto.partition_tensor_view<1x1x1x1x255xf16> + %dst_part = pto.partition_view %dst_view, + offsets = [%c0, %c0, %c0, %c0, %c0], + sizes = [%c1, %c1, %c1, %c1, %c255] + : !pto.tensor_view<1x1x1x1x256xf16> -> !pto.partition_tensor_view<1x1x1x1x255xf16> + + %src = pto.alloc_tile + : !pto.tile_buf + %dst = pto.alloc_tile + : !pto.tile_buf + + pto.tload ins(%src_part : !pto.partition_tensor_view<1x1x1x1x255xf16>) + outs(%src : !pto.tile_buf) + + pto.tcolmin ins(%src : !pto.tile_buf) + outs(%dst : !pto.tile_buf) + + pto.tstore ins(%dst : !pto.tile_buf) + outs(%dst_part : !pto.partition_tensor_view<1x1x1x1x255xf16>) + return + } + + // Case 4: f16 16x128 (input: 16x128, output: 1x128) +} diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tcolprod/CMakeLists.txt b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tcolprod/CMakeLists.txt new file mode 100644 index 000000000..195b86b9d --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tcolprod/CMakeLists.txt @@ -0,0 +1,9 @@ +# Copyright (c) 2026 Huawei Technologies Co., Ltd. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. 
+ +pto_tilelang_vec_st(tcolprod) diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tcolprod/cases.py b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tcolprod/cases.py new file mode 100644 index 000000000..75398575f --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tcolprod/cases.py @@ -0,0 +1,171 @@ +#!/usr/bin/python3 +# Copyright (c) 2026 Huawei Technologies Co., Ltd. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. + +# coding=utf-8 + +"""Single source of truth for tcolprod ST test cases. + +Each case defines: + - name: case identifier, used as subdirectory name and by main.cpp kCases[]. + - dtype: numpy dtype (e.g. np.float32). + - shape: (rows, cols) — allocated tile dimensions for input. + - valid_shape: (valid_rows, valid_cols) — effective computation region for input. + - dst_shape: (1, cols) — allocated tile dimensions for output. + - dst_valid_shape: (1, valid_cols) — effective computation region for output. + - eps: tolerance for numpy.allclose (atol and rtol). + +gen_data.py and compare.py both import this list to avoid redundant definitions. 
+""" + +import numpy as np + +CASES = [ + { + "name": "f32_1x256", + "dtype": np.float32, + "shape": (1, 256), + "valid_shape": (1, 255), + "dst_shape": (1, 256), + "dst_valid_shape": (1, 255), + "eps": 1e-6, + }, + { + "name": "f32_16x128", + "dtype": np.float32, + "shape": (16, 128), + "valid_shape": (16, 127), + "dst_shape": (1, 128), + "dst_valid_shape": (1, 127), + "eps": 1e-6, + }, + { + "name": "f32_16x256", + "dtype": np.float32, + "shape": (16, 256), + "valid_shape": (15, 255), + "dst_shape": (1, 256), + "dst_valid_shape": (1, 255), + "eps": 1e-6, + }, + { + "name": "i16_1x256", + "dtype": np.int16, + "shape": (1, 256), + "valid_shape": (1, 255), + "dst_shape": (1, 256), + "dst_valid_shape": (1, 255), + "eps": 0, + }, + { + "name": "i16_16x128", + "dtype": np.int16, + "shape": (16, 128), + "valid_shape": (16, 127), + "dst_shape": (1, 128), + "dst_valid_shape": (1, 127), + "eps": 0, + }, + { + "name": "i16_16x256", + "dtype": np.int16, + "shape": (16, 256), + "valid_shape": (15, 255), + "dst_shape": (1, 256), + "dst_valid_shape": (1, 255), + "eps": 0, + }, + { + "name": "ui16_1x256", + "dtype": np.uint16, + "shape": (1, 256), + "valid_shape": (1, 255), + "dst_shape": (1, 256), + "dst_valid_shape": (1, 255), + "eps": 0, + }, + { + "name": "ui16_16x128", + "dtype": np.uint16, + "shape": (16, 128), + "valid_shape": (16, 127), + "dst_shape": (1, 128), + "dst_valid_shape": (1, 127), + "eps": 0, + }, + { + "name": "ui16_16x256", + "dtype": np.uint16, + "shape": (16, 256), + "valid_shape": (15, 255), + "dst_shape": (1, 256), + "dst_valid_shape": (1, 255), + "eps": 0, + }, + { + "name": "i32_1x256", + "dtype": np.int32, + "shape": (1, 256), + "valid_shape": (1, 255), + "dst_shape": (1, 256), + "dst_valid_shape": (1, 255), + "eps": 0, + }, + { + "name": "i32_16x128", + "dtype": np.int32, + "shape": (16, 128), + "valid_shape": (16, 127), + "dst_shape": (1, 128), + "dst_valid_shape": (1, 127), + "eps": 0, + }, + { + "name": "i32_16x256", + "dtype": np.int32, + 
"shape": (16, 256), + "valid_shape": (15, 255), + "dst_shape": (1, 256), + "dst_valid_shape": (1, 255), + "eps": 0, + }, + { + "name": "ui32_1x256", + "dtype": np.uint32, + "shape": (1, 256), + "valid_shape": (1, 255), + "dst_shape": (1, 256), + "dst_valid_shape": (1, 255), + "eps": 0, + }, + { + "name": "ui32_16x128", + "dtype": np.uint32, + "shape": (16, 128), + "valid_shape": (16, 127), + "dst_shape": (1, 128), + "dst_valid_shape": (1, 127), + "eps": 0, + }, + { + "name": "ui32_16x256", + "dtype": np.uint32, + "shape": (16, 256), + "valid_shape": (15, 255), + "dst_shape": (1, 256), + "dst_valid_shape": (1, 255), + "eps": 0, + }, +] + +_SMOKE_CASE_NAMES = ['f32_1x256', 'i16_1x256'] +_SMOKE_CASE_NAME_SET = set(_SMOKE_CASE_NAMES) +_missing = [name for name in _SMOKE_CASE_NAMES if name not in {case["name"] for case in CASES}] +if _missing: + raise RuntimeError("unknown smoke case(s): " + ", ".join(_missing)) +CASES = [case for case in CASES if case["name"] in _SMOKE_CASE_NAME_SET] diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tcolprod/compare.py b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tcolprod/compare.py new file mode 100644 index 000000000..040ae0bb8 --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tcolprod/compare.py @@ -0,0 +1,50 @@ +#!/usr/bin/python3 +# Copyright (c) 2026 Huawei Technologies Co., Ltd. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. 
+ +# coding=utf-8 + +import os +import sys +import numpy as np + +from cases import CASES +from st_common import result_cmp, style_fail, style_pass, validate_cases + + +def main(): + validate_cases(CASES) + case_filter = sys.argv[1] if len(sys.argv) > 1 else None + + all_passed = True + for case in CASES: + if case_filter is not None and case["name"] != case_filter: + continue + + case_dir = case["name"] + dst_shape = case["dst_shape"] + dst_valid_shape = case["dst_valid_shape"] + vr, vc = dst_valid_shape + + golden = np.fromfile(os.path.join(case_dir, "golden.bin"), dtype=case["dtype"]).reshape(dst_shape) + output = np.fromfile(os.path.join(case_dir, "output.bin"), dtype=case["dtype"]).reshape(dst_shape) + + ok = result_cmp(golden[:vr, :vc], output[:vr, :vc], case["eps"]) + if ok: + print(style_pass(f"[INFO] {case['name']}: compare passed")) + else: + print(style_fail(f"[ERROR] {case['name']}: compare failed")) + all_passed = False + + if not all_passed: + sys.exit(2) + print(style_pass("[INFO] all cases passed")) + + +if __name__ == "__main__": + main() diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tcolprod/gen_data.py b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tcolprod/gen_data.py new file mode 100644 index 000000000..ed7e4a3fa --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tcolprod/gen_data.py @@ -0,0 +1,35 @@ +#!/usr/bin/python3 +# Copyright (c) 2026 Huawei Technologies Co., Ltd. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. 
+# See LICENSE in the root of the software repository for the full text of the License. + +# coding=utf-8 + +import numpy as np +from cases import CASES +from st_common import validate_cases, setup_case_rng, save_case_data + +validate_cases(CASES) + +for case in CASES: + setup_case_rng(case) + + dtype = case["dtype"] + shape = case["shape"] + valid_shape = case["valid_shape"] + dst_shape = case["dst_shape"] + dst_valid_shape = case["dst_valid_shape"] + + input1 = np.random.randint(1, 10, size=shape).astype(dtype) + + vr, vc = valid_shape + golden = np.zeros(dst_shape, dtype=dtype) + golden_result = np.prod(input1[:vr, :vc], axis=0, keepdims=True).astype(dtype) + golden[:1, :vc] = golden_result + + save_case_data(case["name"], {"input": input1, "golden": golden}) + print(f"[INFO] gen_data: {case['name']} shape={shape} valid_shape={valid_shape} dst_shape={dst_shape} dtype={dtype.__name__}") diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tcolprod/launch.cpp b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tcolprod/launch.cpp new file mode 100644 index 000000000..6ae387d1c --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tcolprod/launch.cpp @@ -0,0 +1,28 @@ +// Copyright (c) 2026 Huawei Technologies Co., Ltd. +// This program is free software, you can redistribute it and/or modify it under the terms and conditions of +// CANN Open Software License Agreement Version 2.0 (the "License"). +// Please refer to the License for details. You may not use this file except in compliance with the License. +// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +// See LICENSE in the root of the software repository for the full text of the License. 
+ +#include + +#ifndef AICORE +#define AICORE [aicore] +#endif + +// Case 0: f32 1x256 (input: 1x256, output: 1x256) + +extern "C" __global__ AICORE void TCOLPROD_f32_1x256(__gm__ float *dst, __gm__ float *src); +extern "C" __global__ AICORE void TCOLPROD_i16_1x256(__gm__ int16_t *dst, __gm__ int16_t *src); + +void LaunchTCOLPROD_f32_1x256(float *dst, float *src, void *stream) { + TCOLPROD_f32_1x256<<<1, nullptr, stream>>>((__gm__ float *)dst, (__gm__ float *)src); +} + + + +void LaunchTCOLPROD_i16_1x256(void *dst, void *src, void *stream) { + TCOLPROD_i16_1x256<<<1, nullptr, stream>>>((__gm__ int16_t *)dst, (__gm__ int16_t *)src); +} diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tcolprod/main.cpp b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tcolprod/main.cpp new file mode 100644 index 000000000..29b935e6c --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tcolprod/main.cpp @@ -0,0 +1,150 @@ +// Copyright (c) 2026 Huawei Technologies Co., Ltd. +// This program is free software, you can redistribute it and/or modify it under the terms and conditions of +// CANN Open Software License Agreement Version 2.0 (the "License"). +// Please refer to the License for details. You may not use this file except in compliance with the License. +// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +// See LICENSE in the root of the software repository for the full text of the License. +// Host driver for TileLang tcolprod ST — case-table driven. +// Each case launches a different kernel variant, reads/writes from per-case subdirectory. +// Numerical comparison is done externally by compare.py. 
+ +#include "acl/acl.h" +#include "test_common.h" +#include +#include +#include +#include +#include +#include + +using namespace PtoTestCommon; + +// Kernel launch wrappers (defined in launch.cpp) +void LaunchTCOLPROD_f32_1x256(float *dst, float *src, void *stream); +void LaunchTCOLPROD_f32_16x128(float *dst, float *src, void *stream); +void LaunchTCOLPROD_i16_1x256(void *dst, void *src, void *stream); +void LaunchTCOLPROD_i16_16x256(void *dst, void *src, void *stream); +void LaunchTCOLPROD_ui16_16x128(void *dst, void *src, void *stream); +void LaunchTCOLPROD_i32_1x256(void *dst, void *src, void *stream); +void LaunchTCOLPROD_i32_16x256(void *dst, void *src, void *stream); +void LaunchTCOLPROD_ui32_16x128(void *dst, void *src, void *stream); + +using LaunchFnFloat = void (*)(float *, float *, void *); +using LaunchFnVoid = void (*)(void *, void *, void *); + +struct TestCase { + const char *name; + void *launch; + size_t srcRows; + size_t srcCols; + size_t srcValidRows; + size_t srcValidCols; + size_t dstRows; + size_t dstCols; + size_t dstValidCols; + size_t elemSize; + bool isFp16; +}; + +static const TestCase kCases[] = { +{"f32_1x256", (void*)LaunchTCOLPROD_f32_1x256, 1, 256, 1, 255, 1, 256, 255, sizeof(float), false}, +{"i16_1x256", (void*)LaunchTCOLPROD_i16_1x256, 1, 256, 1, 255, 1, 256, 255, 2, true}, +}; +static constexpr size_t kNumCases = sizeof(kCases) / sizeof(kCases[0]); + +static int RunCase(const TestCase &tc, int deviceId, aclrtStream stream) { + int rc = 0; + const size_t srcElemCount = tc.srcRows * tc.srcCols; + const size_t srcFileSize = srcElemCount * tc.elemSize; + const size_t dstElemCount = tc.dstRows * tc.dstCols; + const size_t dstFileSize = dstElemCount * tc.elemSize; + + std::printf("[INFO] === case: %s (src=%zux%zu, dst=%zux%zu, fp16=%d) ===\n", + tc.name, tc.srcRows, tc.srcCols, tc.dstRows, tc.dstCols, tc.isFp16); + + std::string caseDir = std::string("./") + tc.name; + size_t srcFileSizeVar = srcFileSize; + size_t dstFileSizeVar = 
dstFileSize; + + void *srcHost = nullptr, *dstHost = nullptr; + void *srcDevice = nullptr, *dstDevice = nullptr; + + aclrtMallocHost(&srcHost, srcFileSize); + aclrtMallocHost(&dstHost, dstFileSize); + + aclrtMalloc(&srcDevice, srcFileSize, ACL_MEM_MALLOC_HUGE_FIRST); + aclrtMalloc(&dstDevice, dstFileSize, ACL_MEM_MALLOC_HUGE_FIRST); + + if (!ReadFile((caseDir + "/input.bin").c_str(), srcFileSizeVar, srcHost, srcFileSize)) { + std::fprintf(stderr, "[ERROR] failed to read %s/input.bin\n", caseDir.c_str()); + rc = 1; + } + + if (rc == 0) { + aclrtMemcpy(srcDevice, srcFileSize, srcHost, srcFileSize, ACL_MEMCPY_HOST_TO_DEVICE); + + if (tc.isFp16) { + LaunchFnVoid launch = (LaunchFnVoid)tc.launch; + launch(dstDevice, srcDevice, stream); + } else { + LaunchFnFloat launch = (LaunchFnFloat)tc.launch; + launch((float*)dstDevice, (float*)srcDevice, stream); + } + + aclrtSynchronizeStream(stream); + aclrtMemcpy(dstHost, dstFileSize, dstDevice, dstFileSize, ACL_MEMCPY_DEVICE_TO_HOST); + } + + if (rc == 0 && !WriteFile((caseDir + "/output.bin").c_str(), dstHost, dstFileSizeVar)) { + std::fprintf(stderr, "[ERROR] failed to write %s/output.bin\n", caseDir.c_str()); + rc = 1; + } + + if (srcDevice != nullptr) + aclrtFree(srcDevice); + if (dstDevice != nullptr) + aclrtFree(dstDevice); + if (srcHost != nullptr) + aclrtFreeHost(srcHost); + if (dstHost != nullptr) + aclrtFreeHost(dstHost); + + if (rc == 0) + std::printf("[INFO] case %s done\n", tc.name); + return rc; +} + +int main(int argc, char *argv[]) { + const char *caseFilter = (argc > 1) ? 
argv[1] : nullptr; + + int rc = 0; + int deviceId = 0; + aclrtStream stream = nullptr; + + aclInit(nullptr); + if (const char *envDevice = std::getenv("ACL_DEVICE_ID")) { + deviceId = std::atoi(envDevice); + } + aclrtSetDevice(deviceId); + aclrtCreateStream(&stream); + + for (size_t i = 0; i < kNumCases; ++i) { + if (caseFilter != nullptr && std::strcmp(kCases[i].name, caseFilter) != 0) { + continue; + } + int ret = RunCase(kCases[i], deviceId, stream); + if (ret != 0) { + std::fprintf(stderr, "[ERROR] case %s failed\n", kCases[i].name); + rc = 1; + break; + } + } + + if (stream != nullptr) + aclrtDestroyStream(stream); + aclrtResetDevice(deviceId); + aclFinalize(); + + return rc; +} diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tcolprod/tcolprod.pto b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tcolprod/tcolprod.pto new file mode 100644 index 000000000..d3d585c47 --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tcolprod/tcolprod.pto @@ -0,0 +1,99 @@ +// Copyright (c) 2026 Huawei Technologies Co., Ltd. +// This program is free software, you can redistribute it and/or modify it under the terms and conditions of +// CANN Open Software License Agreement Version 2.0 (the "License"). +// Please refer to the License for details. You may not use this file except in compliance with the License. +// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +// See LICENSE in the root of the software repository for the full text of the License. + +// TileLang ST kernels for pto.tcolprod: tload(src) + tcolprod(dst, src) + tstore(dst). +// Multiple cases with different shapes in a single module. +// Compiled by ptoas --enable-insert-sync --enable-tile-op-expand --pto-backend=vpto +// to produce a fatobj object. 
+ +module attributes {pto.target_arch = "a5", pto.kernel_kind = #pto.kernel_kind} { + // Case 0: f32 1x256 (input: 1x256, output: 1x256) + func.func @TCOLPROD_f32_1x256(%dst_ptr: !pto.ptr, %src_ptr: !pto.ptr) attributes {pto.kernel} { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c256 = arith.constant 256 : index + %c255 = arith.constant 255 : index + + %src_view = pto.make_tensor_view %src_ptr, + shape = [%c1, %c1, %c1, %c1, %c256], + strides = [%c256, %c256, %c256, %c256, %c1] + : !pto.tensor_view<1x1x1x1x256xf32> + %dst_view = pto.make_tensor_view %dst_ptr, + shape = [%c1, %c1, %c1, %c1, %c256], + strides = [%c256, %c256, %c256, %c256, %c1] + : !pto.tensor_view<1x1x1x1x256xf32> + + %src_part = pto.partition_view %src_view, + offsets = [%c0, %c0, %c0, %c0, %c0], + sizes = [%c1, %c1, %c1, %c1, %c255] + : !pto.tensor_view<1x1x1x1x256xf32> -> !pto.partition_tensor_view<1x1x1x1x255xf32> + %dst_part = pto.partition_view %dst_view, + offsets = [%c0, %c0, %c0, %c0, %c0], + sizes = [%c1, %c1, %c1, %c1, %c255] + : !pto.tensor_view<1x1x1x1x256xf32> -> !pto.partition_tensor_view<1x1x1x1x255xf32> + + %src = pto.alloc_tile + : !pto.tile_buf + %dst = pto.alloc_tile + : !pto.tile_buf + + pto.tload ins(%src_part : !pto.partition_tensor_view<1x1x1x1x255xf32>) + outs(%src : !pto.tile_buf) + + pto.tcolprod ins(%src : !pto.tile_buf) + outs(%dst : !pto.tile_buf) + + pto.tstore ins(%dst : !pto.tile_buf) + outs(%dst_part : !pto.partition_tensor_view<1x1x1x1x255xf32>) + return + } + + // Case 1: i16 1x256 (input: 1x256, output: 1x256) + + func.func @TCOLPROD_i16_1x256(%dst_ptr: !pto.ptr, %src_ptr: !pto.ptr) attributes {pto.kernel} { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c256 = arith.constant 256 : index + %c255 = arith.constant 255 : index + + %src_view = pto.make_tensor_view %src_ptr, + shape = [%c1, %c1, %c1, %c1, %c256], + strides = [%c256, %c256, %c256, %c256, %c1] + : !pto.tensor_view<1x1x1x1x256xi16> + %dst_view = 
pto.make_tensor_view %dst_ptr, + shape = [%c1, %c1, %c1, %c1, %c256], + strides = [%c256, %c256, %c256, %c256, %c1] + : !pto.tensor_view<1x1x1x1x256xi16> + + %src_part = pto.partition_view %src_view, + offsets = [%c0, %c0, %c0, %c0, %c0], + sizes = [%c1, %c1, %c1, %c1, %c255] + : !pto.tensor_view<1x1x1x1x256xi16> -> !pto.partition_tensor_view<1x1x1x1x255xi16> + %dst_part = pto.partition_view %dst_view, + offsets = [%c0, %c0, %c0, %c0, %c0], + sizes = [%c1, %c1, %c1, %c1, %c255] + : !pto.tensor_view<1x1x1x1x256xi16> -> !pto.partition_tensor_view<1x1x1x1x255xi16> + + %src = pto.alloc_tile + : !pto.tile_buf + %dst = pto.alloc_tile + : !pto.tile_buf + + pto.tload ins(%src_part : !pto.partition_tensor_view<1x1x1x1x255xi16>) + outs(%src : !pto.tile_buf) + + pto.tcolprod ins(%src : !pto.tile_buf) + outs(%dst : !pto.tile_buf) + + pto.tstore ins(%dst : !pto.tile_buf) + outs(%dst_part : !pto.partition_tensor_view<1x1x1x1x255xi16>) + return + } + + // Case 4: i16 16x128 (input: 16x128, output: 1x128) +} diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tcolsum/CMakeLists.txt b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tcolsum/CMakeLists.txt new file mode 100644 index 000000000..74b427f3c --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tcolsum/CMakeLists.txt @@ -0,0 +1,9 @@ +# Copyright (c) 2026 Huawei Technologies Co., Ltd. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. 
+ +pto_tilelang_vec_st(tcolsum) diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tcolsum/cases.py b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tcolsum/cases.py new file mode 100644 index 000000000..746ad1d34 --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tcolsum/cases.py @@ -0,0 +1,180 @@ +#!/usr/bin/python3 +# Copyright (c) 2026 Huawei Technologies Co., Ltd. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. + +# coding=utf-8 + +"""Single source of truth for tcolsum ST test cases. + +Each case defines: + - name: case identifier, used as subdirectory name and by main.cpp kCases[]. + - dtype: numpy dtype (e.g. np.float32). + - shape: (rows, cols) — allocated tile dimensions for input. + - valid_shape: (valid_rows, valid_cols) — effective computation region for input. + - dst_shape: (1, cols) — allocated tile dimensions for output. + - dst_valid_shape: (1, valid_cols) — effective computation region for output. + - eps: tolerance for numpy.allclose (atol and rtol). + +gen_data.py and compare.py both import this list to avoid redundant definitions. 
+""" + +import numpy as np + +CASES = [ + { + "name": "f32_1x256", + "dtype": np.float32, + "shape": (1, 256), + "valid_shape": (1, 255), + "dst_shape": (1, 256), + "dst_valid_shape": (1, 255), + "eps": 1e-6, + }, + { + "name": "f32_16x128", + "dtype": np.float32, + "shape": (16, 128), + "valid_shape": (16, 127), + "dst_shape": (1, 128), + "dst_valid_shape": (1, 127), + "eps": 1e-6, + }, + { + "name": "f32_16x256", + "dtype": np.float32, + "shape": (16, 256), + "valid_shape": (15, 255), + "dst_shape": (1, 256), + "dst_valid_shape": (1, 255), + "eps": 1e-6, + }, + { + "name": "f32_64x128_1", + "dtype": np.float32, + "shape": (64, 128), + "valid_shape": (63, 127), + "dst_shape": (1, 128), + "dst_valid_shape": (1, 127), + "eps": 1e-6, + }, + { + "name": "f32_64x128_2", + "dtype": np.float32, + "shape": (64, 128), + "valid_shape": (64, 128), + "dst_shape": (1, 128), + "dst_valid_shape": (1, 128), + "eps": 1e-6, + }, + { + "name": "f32_1x512", + "dtype": np.float32, + "shape": (1, 512), + "valid_shape": (1, 511), + "dst_shape": (1, 512), + "dst_valid_shape": (1, 511), + "eps": 1e-6, + }, + { + "name": "f16_1x256", + "dtype": np.float16, + "shape": (1, 256), + "valid_shape": (1, 255), + "dst_shape": (1, 256), + "dst_valid_shape": (1, 255), + "eps": 1e-3, + }, + { + "name": "f16_16x128", + "dtype": np.float16, + "shape": (16, 128), + "valid_shape": (16, 127), + "dst_shape": (1, 128), + "dst_valid_shape": (1, 127), + "eps": 1e-3, + }, + { + "name": "f16_16x256", + "dtype": np.float16, + "shape": (16, 256), + "valid_shape": (15, 255), + "dst_shape": (1, 256), + "dst_valid_shape": (1, 255), + "eps": 1e-3, + }, + { + "name": "f16_64x128_1", + "dtype": np.float16, + "shape": (64, 128), + "valid_shape": (63, 127), + "dst_shape": (1, 128), + "dst_valid_shape": (1, 127), + "eps": 1e-3, + }, + { + "name": "f16_64x128_2", + "dtype": np.float16, + "shape": (64, 128), + "valid_shape": (64, 128), + "dst_shape": (1, 128), + "dst_valid_shape": (1, 128), + "eps": 1e-3, + }, + { + "name": 
"i8_1x256", + "dtype": np.int8, + "shape": (1, 256), + "valid_shape": (1, 255), + "dst_shape": (1, 256), + "dst_valid_shape": (1, 255), + "eps": 0, + }, + { + "name": "i8_16x128", + "dtype": np.int8, + "shape": (16, 128), + "valid_shape": (16, 127), + "dst_shape": (1, 128), + "dst_valid_shape": (1, 127), + "eps": 0, + }, + { + "name": "i8_16x256", + "dtype": np.int8, + "shape": (16, 256), + "valid_shape": (15, 255), + "dst_shape": (1, 256), + "dst_valid_shape": (1, 255), + "eps": 0, + }, + { + "name": "i8_64x128_1", + "dtype": np.int8, + "shape": (64, 128), + "valid_shape": (63, 127), + "dst_shape": (1, 128), + "dst_valid_shape": (1, 127), + "eps": 0, + }, + { + "name": "i8_64x128_2", + "dtype": np.int8, + "shape": (64, 128), + "valid_shape": (64, 128), + "dst_shape": (1, 128), + "dst_valid_shape": (1, 128), + "eps": 0, + }, +] + +_SMOKE_CASE_NAMES = ['f32_1x256', 'f16_1x256'] +_SMOKE_CASE_NAME_SET = set(_SMOKE_CASE_NAMES) +_missing = [name for name in _SMOKE_CASE_NAMES if name not in {case["name"] for case in CASES}] +if _missing: + raise RuntimeError("unknown smoke case(s): " + ", ".join(_missing)) +CASES = [case for case in CASES if case["name"] in _SMOKE_CASE_NAME_SET] diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tcolsum/compare.py b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tcolsum/compare.py new file mode 100644 index 000000000..040ae0bb8 --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tcolsum/compare.py @@ -0,0 +1,50 @@ +#!/usr/bin/python3 +# Copyright (c) 2026 Huawei Technologies Co., Ltd. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. 
def load_case_array(case_dir, filename, dtype, shape):
    """Read ``case_dir/filename`` as a ``dtype`` array reshaped to ``shape``.

    Returns the array, or ``None`` when the file cannot be read or its
    element count does not match ``shape``. The reason is written to stderr
    so the caller can record a failure for this case and carry on.
    """
    path = os.path.join(case_dir, filename)
    try:
        return np.fromfile(path, dtype=dtype).reshape(shape)
    except (OSError, ValueError) as exc:
        # OSError: missing or unreadable file; ValueError: size mismatch in reshape.
        print(f"[ERROR] cannot load {path}: {exc}", file=sys.stderr)
        return None


def main():
    """Compare ``output.bin`` against ``golden.bin`` for every selected case.

    An optional first command-line argument restricts the run to the case
    with that exact name. Only the valid destination region is compared.
    Exits with status 2 if any selected case fails, including cases whose
    data files could not be loaded.
    """
    validate_cases(CASES)
    case_filter = sys.argv[1] if len(sys.argv) > 1 else None

    all_passed = True
    for case in CASES:
        if case_filter is not None and case["name"] != case_filter:
            continue

        # Each case keeps its data in a directory named after the case.
        case_dir = case["name"]
        dst_shape = case["dst_shape"]
        vr, vc = case["dst_valid_shape"]

        golden = load_case_array(case_dir, "golden.bin", case["dtype"], dst_shape)
        output = load_case_array(case_dir, "output.bin", case["dtype"], dst_shape)

        # A load failure counts as a compare failure rather than aborting the run.
        ok = (
            golden is not None
            and output is not None
            and result_cmp(golden[:vr, :vc], output[:vr, :vc], case["eps"])
        )
        if ok:
            print(style_pass(f"[INFO] {case['name']}: compare passed"))
        else:
            print(style_fail(f"[ERROR] {case['name']}: compare failed"))
            all_passed = False

    if not all_passed:
        sys.exit(2)
    print(style_pass("[INFO] all cases passed"))


if __name__ == "__main__":
    main()
# Check the case table before any data is generated.
validate_cases(CASES)

for case in CASES:
    setup_case_rng(case)

    dtype, shape = case["dtype"], case["shape"]
    valid_shape, dst_shape = case["valid_shape"], case["dst_shape"]
    # Read but not used below; kept so a missing key still raises here.
    dst_valid_shape = case["dst_valid_shape"]

    # Source tile: integers drawn from [1, 10), stored in the case dtype.
    src_tile = np.random.randint(1, 10, size=shape).astype(dtype)

    # Golden: sum over the rows of the valid region, written into row 0 of a
    # zero-filled destination tile; columns past the valid width stay zero.
    valid_rows, valid_cols = valid_shape
    column_sums = src_tile[:valid_rows, :valid_cols].sum(axis=0, keepdims=True)
    golden = np.zeros(dst_shape, dtype=dtype)
    golden[:1, :valid_cols] = column_sums.astype(dtype)

    save_case_data(case["name"], {"input": src_tile, "golden": golden})
    print(f"[INFO] gen_data: {case['name']} shape={shape} valid_shape={valid_shape} dst_shape={dst_shape} dtype={dtype.__name__}")
+// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +// See LICENSE in the root of the software repository for the full text of the License. + +#include + +#ifndef AICORE +#define AICORE [aicore] +#endif + +// Case 0: f32 1x256 (input: 1x256, output: 1x256) + +extern "C" __global__ AICORE void TCOLSUM_f32_1x256(__gm__ float *dst, __gm__ float *src); +extern "C" __global__ AICORE void TCOLSUM_f16_1x256(__gm__ half *dst, __gm__ half *src); + +void LaunchTCOLSUM_f16_1x256(void *dst, void *src, void *stream) { + TCOLSUM_f16_1x256<<<1, nullptr, stream>>>((__gm__ half *)dst, (__gm__ half *)src); +} + + + +void LaunchTCOLSUM_f32_1x256(float *dst, float *src, void *stream) { + TCOLSUM_f32_1x256<<<1, nullptr, stream>>>((__gm__ float *)dst, (__gm__ float *)src); +} diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tcolsum/main.cpp b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tcolsum/main.cpp new file mode 100644 index 000000000..683b4f398 --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tcolsum/main.cpp @@ -0,0 +1,153 @@ +// Copyright (c) 2026 Huawei Technologies Co., Ltd. +// This program is free software, you can redistribute it and/or modify it under the terms and conditions of +// CANN Open Software License Agreement Version 2.0 (the "License"). +// Please refer to the License for details. You may not use this file except in compliance with the License. +// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +// See LICENSE in the root of the software repository for the full text of the License. + +// Host driver for TileLang tcolsum ST — case-table driven. 
+// Each case launches a different kernel variant, reads/writes from per-case subdirectory. +// Numerical comparison is done externally by compare.py. + +#include "acl/acl.h" +#include "test_common.h" +#include +#include +#include +#include +#include +#include + +using namespace PtoTestCommon; + +// Kernel launch wrappers (defined in launch.cpp) +void LaunchTCOLSUM_f32_1x256(float *dst, float *src, void *stream); +void LaunchTCOLSUM_f32_16x128(float *dst, float *src, void *stream); +void LaunchTCOLSUM_f32_64x128_1(float *dst, float *src, void *stream); +void LaunchTCOLSUM_f32_1x512(float *dst, float *src, void *stream); +void LaunchTCOLSUM_f16_1x256(void *dst, void *src, void *stream); +void LaunchTCOLSUM_f16_16x128(void *dst, void *src, void *stream); +void LaunchTCOLSUM_f16_64x128_1(void *dst, void *src, void *stream); +void LaunchTCOLSUM_i8_1x256(void *dst, void *src, void *stream); +void LaunchTCOLSUM_i8_16x256(void *dst, void *src, void *stream); +void LaunchTCOLSUM_i8_64x128_2(void *dst, void *src, void *stream); + +using LaunchFnFloat = void (*)(float *, float *, void *); +using LaunchFnVoid = void (*)(void *, void *, void *); + +struct TestCase { + const char *name; + void *launch; + size_t srcRows; + size_t srcCols; + size_t srcValidRows; + size_t srcValidCols; + size_t dstRows; + size_t dstCols; + size_t dstValidCols; + size_t elemSize; + bool isFp16; +}; + +static const TestCase kCases[] = { +{"f32_1x256", (void*)LaunchTCOLSUM_f32_1x256, 1, 256, 1, 255, 1, 256, 255, sizeof(float), false}, +{"f16_1x256", (void*)LaunchTCOLSUM_f16_1x256, 1, 256, 1, 255, 1, 256, 255, 2, true}, +}; +static constexpr size_t kNumCases = sizeof(kCases) / sizeof(kCases[0]); + +static int RunCase(const TestCase &tc, int deviceId, aclrtStream stream) { + int rc = 0; + const size_t srcElemCount = tc.srcRows * tc.srcCols; + const size_t srcFileSize = srcElemCount * tc.elemSize; + const size_t dstElemCount = tc.dstRows * tc.dstCols; + const size_t dstFileSize = dstElemCount * 
tc.elemSize; + + std::printf("[INFO] === case: %s (src=%zux%zu, dst=%zux%zu, fp16=%d) ===\n", + tc.name, tc.srcRows, tc.srcCols, tc.dstRows, tc.dstCols, tc.isFp16); + + std::string caseDir = std::string("./") + tc.name; + size_t srcFileSizeVar = srcFileSize; + size_t dstFileSizeVar = dstFileSize; + + void *srcHost = nullptr, *dstHost = nullptr; + void *srcDevice = nullptr, *dstDevice = nullptr; + + aclrtMallocHost(&srcHost, srcFileSize); + aclrtMallocHost(&dstHost, dstFileSize); + + aclrtMalloc(&srcDevice, srcFileSize, ACL_MEM_MALLOC_HUGE_FIRST); + aclrtMalloc(&dstDevice, dstFileSize, ACL_MEM_MALLOC_HUGE_FIRST); + + if (!ReadFile((caseDir + "/input.bin").c_str(), srcFileSizeVar, srcHost, srcFileSize)) { + std::fprintf(stderr, "[ERROR] failed to read %s/input.bin\n", caseDir.c_str()); + rc = 1; + } + + if (rc == 0) { + aclrtMemcpy(srcDevice, srcFileSize, srcHost, srcFileSize, ACL_MEMCPY_HOST_TO_DEVICE); + + if (tc.isFp16) { + LaunchFnVoid launch = (LaunchFnVoid)tc.launch; + launch(dstDevice, srcDevice, stream); + } else { + LaunchFnFloat launch = (LaunchFnFloat)tc.launch; + launch((float*)dstDevice, (float*)srcDevice, stream); + } + + aclrtSynchronizeStream(stream); + aclrtMemcpy(dstHost, dstFileSize, dstDevice, dstFileSize, ACL_MEMCPY_DEVICE_TO_HOST); + } + + if (rc == 0 && !WriteFile((caseDir + "/output.bin").c_str(), dstHost, dstFileSizeVar)) { + std::fprintf(stderr, "[ERROR] failed to write %s/output.bin\n", caseDir.c_str()); + rc = 1; + } + + if (srcDevice != nullptr) + aclrtFree(srcDevice); + if (dstDevice != nullptr) + aclrtFree(dstDevice); + if (srcHost != nullptr) + aclrtFreeHost(srcHost); + if (dstHost != nullptr) + aclrtFreeHost(dstHost); + + if (rc == 0) + std::printf("[INFO] case %s done\n", tc.name); + return rc; +} + +int main(int argc, char *argv[]) { + const char *caseFilter = (argc > 1) ? 
argv[1] : nullptr; + + int rc = 0; + int deviceId = 0; + aclrtStream stream = nullptr; + + aclInit(nullptr); + if (const char *envDevice = std::getenv("ACL_DEVICE_ID")) { + deviceId = std::atoi(envDevice); + } + aclrtSetDevice(deviceId); + aclrtCreateStream(&stream); + + for (size_t i = 0; i < kNumCases; ++i) { + if (caseFilter != nullptr && std::strcmp(kCases[i].name, caseFilter) != 0) { + continue; + } + int ret = RunCase(kCases[i], deviceId, stream); + if (ret != 0) { + std::fprintf(stderr, "[ERROR] case %s failed\n", kCases[i].name); + rc = 1; + break; + } + } + + if (stream != nullptr) + aclrtDestroyStream(stream); + aclrtResetDevice(deviceId); + aclFinalize(); + + return rc; +} diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tcolsum/tcolsum.pto b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tcolsum/tcolsum.pto new file mode 100644 index 000000000..e5ef3bbd5 --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tcolsum/tcolsum.pto @@ -0,0 +1,99 @@ +// Copyright (c) 2026 Huawei Technologies Co., Ltd. +// This program is free software, you can redistribute it and/or modify it under the terms and conditions of +// CANN Open Software License Agreement Version 2.0 (the "License"). +// Please refer to the License for details. You can not use this file except in compliance with the License. +// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +// See LICENSE in the root of the software repository for the full text of the License. + +// TileLang ST kernels for pto.tcolsum: tload(src) + tcolsum(dst, src) + tstore(dst). +// Multiple cases with different shapes in a single module. +// Compiled by ptoas --enable-insert-sync --enable-tile-op-expand --pto-backend=vpto +// to produce a fatobj object. 
+ +module attributes {pto.target_arch = "a5", pto.kernel_kind = #pto.kernel_kind} { + // Case 0: f32 1x256 (input: 1x256, output: 1x256) + func.func @TCOLSUM_f32_1x256(%dst_ptr: !pto.ptr, %src_ptr: !pto.ptr) attributes {pto.kernel} { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c256 = arith.constant 256 : index + %c255 = arith.constant 255 : index + + %src_view = pto.make_tensor_view %src_ptr, + shape = [%c1, %c1, %c1, %c1, %c256], + strides = [%c256, %c256, %c256, %c256, %c1] + : !pto.tensor_view<1x1x1x1x256xf32> + %dst_view = pto.make_tensor_view %dst_ptr, + shape = [%c1, %c1, %c1, %c1, %c256], + strides = [%c256, %c256, %c256, %c256, %c1] + : !pto.tensor_view<1x1x1x1x256xf32> + + %src_part = pto.partition_view %src_view, + offsets = [%c0, %c0, %c0, %c0, %c0], + sizes = [%c1, %c1, %c1, %c1, %c255] + : !pto.tensor_view<1x1x1x1x256xf32> -> !pto.partition_tensor_view<1x1x1x1x255xf32> + %dst_part = pto.partition_view %dst_view, + offsets = [%c0, %c0, %c0, %c0, %c0], + sizes = [%c1, %c1, %c1, %c1, %c255] + : !pto.tensor_view<1x1x1x1x256xf32> -> !pto.partition_tensor_view<1x1x1x1x255xf32> + + %src = pto.alloc_tile + : !pto.tile_buf + %dst = pto.alloc_tile + : !pto.tile_buf + + pto.tload ins(%src_part : !pto.partition_tensor_view<1x1x1x1x255xf32>) + outs(%src : !pto.tile_buf) + + pto.tcolsum ins(%src : !pto.tile_buf) + outs(%dst : !pto.tile_buf) + + pto.tstore ins(%dst : !pto.tile_buf) + outs(%dst_part : !pto.partition_tensor_view<1x1x1x1x255xf32>) + return + } + + // Case 1: f32 16x128 (input: 16x128, output: 1x128) + + func.func @TCOLSUM_f16_1x256(%dst_ptr: !pto.ptr, %src_ptr: !pto.ptr) attributes {pto.kernel} { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c256 = arith.constant 256 : index + %c255 = arith.constant 255 : index + + %src_view = pto.make_tensor_view %src_ptr, + shape = [%c1, %c1, %c1, %c1, %c256], + strides = [%c256, %c256, %c256, %c256, %c1] + : !pto.tensor_view<1x1x1x1x256xf16> + %dst_view = 
pto.make_tensor_view %dst_ptr, + shape = [%c1, %c1, %c1, %c1, %c256], + strides = [%c256, %c256, %c256, %c256, %c1] + : !pto.tensor_view<1x1x1x1x256xf16> + + %src_part = pto.partition_view %src_view, + offsets = [%c0, %c0, %c0, %c0, %c0], + sizes = [%c1, %c1, %c1, %c1, %c255] + : !pto.tensor_view<1x1x1x1x256xf16> -> !pto.partition_tensor_view<1x1x1x1x255xf16> + %dst_part = pto.partition_view %dst_view, + offsets = [%c0, %c0, %c0, %c0, %c0], + sizes = [%c1, %c1, %c1, %c1, %c255] + : !pto.tensor_view<1x1x1x1x256xf16> -> !pto.partition_tensor_view<1x1x1x1x255xf16> + + %src = pto.alloc_tile + : !pto.tile_buf + %dst = pto.alloc_tile + : !pto.tile_buf + + pto.tload ins(%src_part : !pto.partition_tensor_view<1x1x1x1x255xf16>) + outs(%src : !pto.tile_buf) + + pto.tcolsum ins(%src : !pto.tile_buf) + outs(%dst : !pto.tile_buf) + + pto.tstore ins(%dst : !pto.tile_buf) + outs(%dst_part : !pto.partition_tensor_view<1x1x1x1x255xf16>) + return + } + + // Case 7: f16 16x128 (input: 16x128, output: 1x128) +} diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tcvt/CMakeLists.txt b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tcvt/CMakeLists.txt new file mode 100644 index 000000000..b117e9a27 --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tcvt/CMakeLists.txt @@ -0,0 +1,9 @@ +# Copyright (c) 2026 Huawei Technologies Co., Ltd. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. 
+ +pto_tilelang_vec_st(tcvt) diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tcvt/cases.py b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tcvt/cases.py new file mode 100644 index 000000000..ec2ce8367 --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tcvt/cases.py @@ -0,0 +1,174 @@ +#!/usr/bin/python3 +# Copyright (c) 2026 Huawei Technologies Co., Ltd. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. + +# coding=utf-8 + +"""Single source of truth for tcvt ST test cases. + +`dtype` is kept for shared validation compatibility. +Actual data generation and comparison use `src_dtype` / `dst_dtype`. 
# 7 shapes (aligning with C++ INSTANTIATE_TCVT): (rows, cols, valid_rows, valid_cols)
SHAPES = [
    (1, 128, 1, 128),
    (2, 64, 2, 64),
    (4, 32, 4, 32),
    (2, 128, 2, 128),
    (4, 128, 4, 65),  # Partial tiles
    (4, 256, 4, 200),  # Partial tiles
    (1, 256, 1, 129),  # Partial tiles
]

# Short dtype tags used to build case names. "si16" is a string alias, not a
# numpy type; it is passed through the case table as-is.
_DTYPE_NAME = {
    np.float32: "f32",
    np.float16: "f16",
    bfloat16: "bf16",
    np.int8: "si8",
    np.uint8: "ui8",
    np.int16: "i16",
    "si16": "si16",
    np.uint16: "ui16",
    np.int32: "i32",
    np.uint32: "ui32",
    np.int64: "i64",
    np.uint64: "ui64",
}

# eps by destination dtype: f32=1e-6; f16/bf16=1e-3; anything else gets 0.0.
_EPS_BY_DST = {np.float32: 1e-6, np.float16: 1e-3, bfloat16: 1e-3}


def _make_cases(src_dtype, dst_dtype):
    """Generate one case per entry of SHAPES for src_dtype -> dst_dtype.

    Case names are ``<src>_to_<dst>_<shape>``. A full tile is named after its
    shape and a partial tile after its valid shape.
    """
    src_name = _DTYPE_NAME.get(src_dtype, src_dtype)
    dst_name = _DTYPE_NAME.get(dst_dtype, dst_dtype)
    eps = _EPS_BY_DST.get(dst_dtype, 0.0)

    cases = []
    for rows, cols, v_rows, v_cols in SHAPES:
        # Compare both dimensions: a tile that is partial only in rows must
        # not share a name with the full tile of the same shape.
        is_full_tile = (v_rows, v_cols) == (rows, cols)
        shape_name = f"{rows}x{cols}" if is_full_tile else f"{v_rows}x{v_cols}"
        cases.append({
            "name": f"{src_name}_to_{dst_name}_{shape_name}",
            "dtype": dst_dtype,
            "src_dtype": src_dtype,
            "dst_dtype": dst_dtype,
            "shape": (rows, cols),
            "valid_shape": (v_rows, v_cols),
            "eps": eps,
        })
    return cases


def _make_round_case(src_dtype, dst_dtype, round_mode):
    """Build a full-tile 16x64 case that pins an explicit ``round_mode``."""
    src_name = _DTYPE_NAME[src_dtype]
    dst_name = _DTYPE_NAME[dst_dtype]
    return {
        "name": f"{src_name}_to_{dst_name}_{round_mode.lower()}_16x64",
        "dtype": dst_dtype,
        "src_dtype": src_dtype,
        "dst_dtype": dst_dtype,
        "shape": (16, 64),
        "valid_shape": (16, 64),
        "round_mode": round_mode,
        "eps": _EPS_BY_DST.get(dst_dtype, 0.0),
    }


CASES = [
    # f32 -> f16 smoke case starts at 1x128.
    *_make_cases(np.float32, np.float16),
    # Explicit round-mode cases on a full 16x64 tile.
    _make_round_case(np.float32, np.int32, "RINT"),
    _make_round_case(np.float32, np.int32, "ROUND"),
    _make_round_case(np.int32, np.float32, "RINT"),
    _make_round_case(np.float32, np.float16, "RINT"),
    _make_round_case(np.float16, np.float32, "RINT"),
    # f32 -> bf16, i16, i32, i64, f32
    *_make_cases(np.float32, bfloat16),
    *_make_cases(np.float32, np.int16),
    *_make_cases(np.float32, np.int32),
    *_make_cases(np.float32, np.int64),
    *_make_cases(np.float32, np.float32),
    # f16 -> f32, i32, i16, si8, ui8
    *_make_cases(np.float16, np.float32),
    *_make_cases(np.float16, np.int32),
    *_make_cases(np.float16, np.int16),
    *_make_cases(np.float16, np.int8),
    *_make_cases(np.float16, np.uint8),
    # bf16 -> f32, f16, i32
    *_make_cases(bfloat16, np.float32),
    *_make_cases(bfloat16, np.float16),
    *_make_cases(bfloat16, np.int32),
    # ui8 -> f16, ui16
    *_make_cases(np.uint8, np.float16),
    *_make_cases(np.uint8, np.uint16),
    # si8 -> f16, si16, i32
    *_make_cases(np.int8, np.float16),
    *_make_cases(np.int8, "si16"),
    *_make_cases(np.int8, np.int32),
    # i16 -> ui8, f16, f32, ui32, i32
    *_make_cases(np.int16, np.uint8),
    *_make_cases(np.int16, np.float16),
    *_make_cases(np.int16, np.float32),
    *_make_cases(np.int16, np.uint32),
    *_make_cases(np.int16, np.int32),
    # i32 -> f32, i16, i64, ui8, ui16
    *_make_cases(np.int32, np.float32),
    *_make_cases(np.int32, np.int16),
    *_make_cases(np.int32, np.int64),
    *_make_cases(np.int32, np.uint8),
    *_make_cases(np.int32, np.uint16),
    # ui32 -> i16, ui16, ui8
    *_make_cases(np.uint32, np.int16),
    *_make_cases(np.uint32, np.uint16),
    *_make_cases(np.uint32, np.uint8),
    # i64 -> f32, i32
    *_make_cases(np.int64, np.float32),
    *_make_cases(np.int64, np.int32),
]

_SMOKE_CASE_NAMES = ['f32_to_f16_1x128', 'f16_to_f32_1x129', 'bf16_to_i32_1x128', 'ui8_to_ui16_1x128']
_SMOKE_CASE_NAME_SET = set(_SMOKE_CASE_NAMES)
# Build the set of known names once rather than once per smoke name.
_KNOWN_CASE_NAMES = {case["name"] for case in CASES}
_missing = [name for name in _SMOKE_CASE_NAMES if name not in _KNOWN_CASE_NAMES]
if _missing:
    # A smoke name that matches nothing would silently shrink the smoke run.
    raise RuntimeError("unknown smoke case(s): " + ", ".join(_missing))
# Only the smoke subset is exported; table order is preserved.
CASES = [case for case in CASES if case["name"] in _SMOKE_CASE_NAME_SET]
# String dtype aliases that may appear in the case table in place of a numpy type.
_STR_DTYPE_MAP = {"si16": np.int16}


def normalize_dtype(dtype):
    """Return the numpy dtype for a string alias; pass any other value through."""
    return _STR_DTYPE_MAP.get(dtype, dtype)


def load_case_array(case_dir, filename, dtype, shape):
    """Read ``case_dir/filename`` as a ``dtype`` array reshaped to ``shape``.

    Returns the array, or ``None`` when the file cannot be read or its
    element count does not match ``shape``. The reason is written to stderr
    so the caller can record a failure for this case and carry on.
    """
    path = os.path.join(case_dir, filename)
    try:
        return np.fromfile(path, dtype=dtype).reshape(shape)
    except (OSError, ValueError) as exc:
        # OSError: missing or unreadable file; ValueError: size mismatch in reshape.
        print(f"[ERROR] cannot load {path}: {exc}", file=sys.stderr)
        return None


def main():
    """Compare ``output.bin`` against ``golden.bin`` for every selected case.

    An optional first command-line argument restricts the run to the case
    with that exact name. Files are decoded with the normalized destination
    dtype and only the valid region is compared. Exits with status 2 if any
    selected case fails, including cases whose data files could not be loaded.
    """
    validate_cases(CASES)
    case_filter = sys.argv[1] if len(sys.argv) > 1 else None

    all_passed = True
    for case in CASES:
        if case_filter is not None and case["name"] != case_filter:
            continue

        # Each case keeps its data in a directory named after the case.
        case_dir = case["name"]
        shape = case["shape"]
        vr, vc = case["valid_shape"]
        dst_dtype = normalize_dtype(case["dst_dtype"])

        golden = load_case_array(case_dir, "golden.bin", dst_dtype, shape)
        output = load_case_array(case_dir, "output.bin", dst_dtype, shape)

        # A load failure counts as a compare failure rather than aborting the run.
        ok = (
            golden is not None
            and output is not None
            and result_cmp(golden[:vr, :vc], output[:vr, :vc], case["eps"])
        )
        if ok:
            print(style_pass(f"[INFO] {case['name']}: compare passed"))
        else:
            print(style_fail(f"[ERROR] {case['name']}: compare failed"))
            all_passed = False

    if not all_passed:
        sys.exit(2)
    print(style_pass("[INFO] all cases passed"))


if __name__ == "__main__":
    main()
+# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. + +"""Script to generate launch.cpp & main.cpp""" + +import numpy as np +import cases +from cases import bfloat16 + +_DTYPE_TO_CPP = { + np.float32: "float", + np.float16: "uint16_t", + bfloat16: "uint16_t", + np.int8: "int8_t", + np.uint8: "uint8_t", + np.int16: "int16_t", + "si16": "int16_t", + np.uint16: "uint16_t", + np.int32: "int32_t", + np.uint32: "uint32_t", + np.int64: "int64_t", + np.uint64: "uint64_t", +} + +def gen_launch(): + lines = [ + "// Copyright (c) 2026 Huawei Technologies Co., Ltd.", + "// This program is free software, you can redistribute it and/or modify it under the terms and conditions of", + '// CANN Open Software License Agreement Version 2.0 (the "License").', + "// Please refer to the License for details. 
You may not use this file except in compliance with the License.", + '// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,', + "// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.", + "// See LICENSE in the root of the software repository for the full text of the License.", + "", + "#include ", + "", + "#ifndef AICORE", + "#define AICORE [aicore]", + "#endif", + "", + ] + + extern_decls = [] + launch_funcs = [] + + for c in cases.CASES: + name = c["name"] + src_cpp = _DTYPE_TO_CPP.get(c["src_dtype"], "float") + dst_cpp = _DTYPE_TO_CPP.get(c["dst_dtype"], "float") + + extern_decls.append(f'extern "C" __global__ AICORE void TCVT_{name}(__gm__ {src_cpp} *src, __gm__ {dst_cpp} *dst);') + launch_funcs.append(f"void LaunchTCVT_{name}(void *src, void *dst, void *stream) {{") + launch_funcs.append(f" TCVT_{name}<<<1, nullptr, stream>>>((__gm__ {src_cpp} *)src, (__gm__ {dst_cpp} *)dst);") + launch_funcs.append("}") + launch_funcs.append("") + + lines.extend(extern_decls) + lines.append("") + lines.extend(launch_funcs) + + return "\n".join(lines) + +def gen_main(): + lines = [ + "// Copyright (c) 2026 Huawei Technologies Co., Ltd.", + "// This program is free software, you can redistribute it and/or modify it under the terms and conditions of", + '// CANN Open Software License Agreement Version 2.0 (the "License").', + "// Please refer to the License for details. 
You may not use this file except in compliance with the License.", + '// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,', + "// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.", + "// See LICENSE in the root of the software repository for the full text of the License.", + "", + '#include "acl/acl.h"', + '#include "test_common.h"', + "#include ", + "#include ", + "#include ", + "#include ", + "#include ", + "", + "using namespace PtoTestCommon;", + "", + ] + + decls = [] + for c in cases.CASES: + decls.append(f"void LaunchTCVT_{c['name']}(void *src, void *dst, void *stream);") + + lines.extend(decls) + lines.extend([ + "", + "using LaunchFn = void (*)(void *, void *, void *);", + "", + "struct TestCase {", + " const char *name;", + " LaunchFn launch;", + " size_t srcRows;", + " size_t srcCols;", + " size_t dstRows;", + " size_t dstCols;", + " size_t srcElemSize;", + " size_t dstElemSize;", + "};", + "", + "static const TestCase kCases[] = {", + ]) + + case_entries = [] + for c in cases.CASES: + name = c["name"] + src_cpp = _DTYPE_TO_CPP.get(c["src_dtype"], "float") + dst_cpp = _DTYPE_TO_CPP.get(c["dst_dtype"], "float") + rows, cols = c["shape"] + case_entries.append(f' {{"{name}", LaunchTCVT_{name}, {rows}, {cols}, {rows}, {cols}, sizeof({src_cpp}), sizeof({dst_cpp})}},') + + lines.extend(case_entries) + lines.extend([ + "};", + "static constexpr size_t kNumCases = sizeof(kCases) / sizeof(kCases[0]);", + "", + ]) + + # RunCase 和 main 函数保持不变 + lines.extend([ + "static int RunCase(const TestCase &tc, int deviceId, aclrtStream stream) {", + " (void)deviceId;", + " int rc = 0;", + " const size_t srcElemCount = tc.srcRows * tc.srcCols;", + " const size_t dstElemCount = tc.dstRows * tc.dstCols;", + " size_t srcFileSize = srcElemCount * tc.srcElemSize;", + " size_t dstFileSize = dstElemCount * tc.dstElemSize;", + "", + ' std::printf("[INFO] === case: %s 
(src=%zux%zu, dst=%zux%zu) ===\\n",', + " tc.name, tc.srcRows, tc.srcCols, tc.dstRows, tc.dstCols);", + "", + ' std::string caseDir = std::string("./") + tc.name;', + "", + " void *srcHost = nullptr;", + " void *dstHost = nullptr;", + " void *srcDevice = nullptr;", + " void *dstDevice = nullptr;", + "", + " aclrtMallocHost(&srcHost, srcFileSize);", + " aclrtMallocHost(&dstHost, dstFileSize);", + "", + " aclrtMalloc(&srcDevice, srcFileSize, ACL_MEM_MALLOC_HUGE_FIRST);", + " aclrtMalloc(&dstDevice, dstFileSize, ACL_MEM_MALLOC_HUGE_FIRST);", + "", + ' if (!ReadFile((caseDir + "/input.bin").c_str(), srcFileSize, srcHost, srcFileSize)) {', + ' std::fprintf(stderr, "[ERROR] failed to read %s/input.bin\\n", caseDir.c_str());', + " rc = 1;", + " }", + "", + " if (rc == 0) {", + " aclrtMemcpy(srcDevice, srcFileSize, srcHost, srcFileSize, ACL_MEMCPY_HOST_TO_DEVICE);", + " tc.launch(srcDevice, dstDevice, stream);", + " aclrtSynchronizeStream(stream);", + " aclrtMemcpy(dstHost, dstFileSize, dstDevice, dstFileSize, ACL_MEMCPY_DEVICE_TO_HOST);", + " }", + "", + ' if (rc == 0 && !WriteFile((caseDir + "/output.bin").c_str(), dstHost, dstFileSize)) {', + ' std::fprintf(stderr, "[ERROR] failed to write %s/output.bin\\n", caseDir.c_str());', + " rc = 1;", + " }", + "", + " if (srcDevice != nullptr)", + " aclrtFree(srcDevice);", + " if (dstDevice != nullptr)", + " aclrtFree(dstDevice);", + " if (srcHost != nullptr)", + " aclrtFreeHost(srcHost);", + " if (dstHost != nullptr)", + " aclrtFreeHost(dstHost);", + "", + " if (rc == 0)", + ' std::printf("[INFO] case %s done\\n", tc.name);', + " return rc;", + "}", + "", + "int main(int argc, char *argv[]) {", + " const char *caseFilter = (argc > 1) ? 
argv[1] : nullptr;", + "", + " int rc = 0;", + " int deviceId = 0;", + " aclrtStream stream = nullptr;", + "", + " aclInit(nullptr);", + ' if (const char *envDevice = std::getenv("ACL_DEVICE_ID")) {', + " deviceId = std::atoi(envDevice);", + " }", + " aclrtSetDevice(deviceId);", + " aclrtCreateStream(&stream);", + "", + " for (size_t i = 0; i < kNumCases; ++i) {", + " if (caseFilter != nullptr && std::strcmp(kCases[i].name, caseFilter) != 0) {", + " continue;", + " }", + " int ret = RunCase(kCases[i], deviceId, stream);", + " if (ret != 0) {", + ' std::fprintf(stderr, "[ERROR] case %s failed\\n", kCases[i].name);', + " rc = 1;", + " break;", + " }", + " }", + "", + " if (stream != nullptr)", + " aclrtDestroyStream(stream);", + " aclrtResetDevice(deviceId);", + " aclFinalize();", + "", + " return rc;", + "}", + "" + ]) + + return "\n".join(lines) + +if __name__ == "__main__": + from pathlib import Path + HERE = Path(__file__).parent + + with open(HERE / "launch.cpp", "w") as f: + f.write(gen_launch()) + print(f"Generated {(HERE / 'launch.cpp').as_posix()!r}") + + with open(HERE / "main.cpp", "w") as f: + f.write(gen_main()) + print(f"Generated {(HERE / 'main.cpp').as_posix()!r}") diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tcvt/gen_data.py b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tcvt/gen_data.py new file mode 100644 index 000000000..d4ef64985 --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tcvt/gen_data.py @@ -0,0 +1,180 @@ +#!/usr/bin/python3 +# Copyright (c) 2026 Huawei Technologies Co., Ltd. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. 
def is_sub_float(dtype):
    """Return True for numpy floating dtypes and for ml_dtypes.bfloat16."""
    return np.issubdtype(dtype, np.floating) or dtype == ml_dtypes.bfloat16


def is_sub_int(dtype):
    """Return True for numpy integer dtypes (signed or unsigned)."""
    return np.issubdtype(dtype, np.integer)


def _make_input_inner(src_dtype, shape):
    """Draw ``prod(shape)`` random source values as a flat array.

    The value range depends on ``src_dtype``. The result is not yet cast to
    the source dtype or reshaped; ``make_input`` does both.
    """
    total = int(np.prod(shape))
    float_types = (np.float32, np.float16, ml_dtypes.bfloat16)
    int8_like_types = (np.int8, )

    # Generate input data
    if src_dtype in float_types:
        return (np.random.random([total]) * 200 - 100)
    elif src_dtype in int8_like_types:
        return np.random.randint(-128, 128, [total])
    elif src_dtype == np.uint8:
        return np.random.randint(0, 256, [total])
    elif src_dtype == np.int16:
        return np.random.randint(-1000, 1000, [total])
    elif src_dtype == np.uint16:
        return np.random.randint(0, 10000, [total])
    elif src_dtype in (np.int32, np.int64):
        return np.random.randint(-10000, 10000, [total])
    elif src_dtype == np.uint32:
        return np.random.randint(0, 10000, [total])
    else:
        # Anything else, including string aliases, uses the signed default range.
        return np.random.randint(-10000, 10000, [total])


def make_input(src_dtype, shape):
    """Return a random input tile of ``shape`` in the normalized source dtype."""
    return _make_input_inner(src_dtype, shape).astype(normalize_dtype(src_dtype)).reshape(shape)


def round_half_away_from_zero(values):
    """Round to the nearest integer, with ties going away from zero."""
    return np.copysign(np.floor(np.abs(values) + 0.5), values)


# (src, dst) pairs for which default_saturation_off() returns True.
_SATURATION_OFF_PAIRS = (
    (np.float16, np.uint8),
    (np.float16, np.int8),
    (np.float32, np.int16),
    (np.float16, np.int16),
    (np.int64, np.int32),
    (np.int32, np.int16),
)


def default_saturation_off(src_dtype, dst_dtype):
    """Mirror the current A5 default saturation policy for supported pairs."""
    # Identity comparison on purpose: only the numpy scalar types themselves match.
    return any(src_dtype is src and dst_dtype is dst for src, dst in _SATURATION_OFF_PAIRS)


# Supported rounding modes and the function that implements each.
_ROUNDING_FUNCS = {
    "RINT": np.rint,
    "ROUND": round_half_away_from_zero,
    "FLOOR": np.floor,
    "CEIL": np.ceil,
    "TRUNC": np.trunc,
}


def apply_round_mode(values, round_mode):
    """Round ``values`` according to ``round_mode``.

    ``None`` selects "RINT".

    Raises:
        ValueError: if ``round_mode`` is not a supported mode name. A
            misspelt mode used to fall back to RINT without any warning.
    """
    if round_mode is None:
        round_mode = "RINT"
    try:
        rounder = _ROUNDING_FUNCS[round_mode]
    except KeyError:
        supported = ", ".join(sorted(_ROUNDING_FUNCS))
        raise ValueError(f"unsupported round_mode {round_mode!r}; expected one of: {supported}") from None
    return rounder(values)


def convert(values: np.ndarray, src_dtype, dst_dtype, round_mode=None):
    """Convert ``values`` from ``src_dtype`` to ``dst_dtype``.

    Float sources are rounded first when the destination is an integer type,
    and also for f32 -> f32. Integer destinations are then either wrapped
    (pairs with saturation off) or clamped to the destination range. Float
    destinations are clamped to the destination's finite range.
    """
    is_float_src = is_sub_float(src_dtype)
    is_int_dst = is_sub_int(dst_dtype)
    is_f32_to_f32 = src_dtype == np.float32 and dst_dtype == np.float32
    needs_rounding = is_float_src and (is_int_dst or is_f32_to_f32)

    if needs_rounding:
        values = apply_round_mode(values, round_mode or "RINT")

    if is_int_dst:
        # Determine if this conversion has default saturation OFF (truncation) or ON (clamping)
        if default_saturation_off(src_dtype, dst_dtype):
            # OFF (truncation): bit extraction - wrap around using modulo
            return truncate_to_int(values, dst_dtype)
        # Saturation ON: clamp to range (widen to int64/float64 to preserve sign)
        return clamp_to_range_int(values, dst_dtype)
    if is_sub_float(dst_dtype):
        return clamp_to_range_float(values, dst_dtype)
    return values.astype(dst_dtype)


def truncate_to_int(values: np.ndarray, dst_dtype):
    """Convert to ``dst_dtype`` by keeping only the low bits (wrap-around).

    NaN and infinite inputs become 0. Every other value is truncated toward
    zero to a 64-bit integer and then narrowed to ``dst_dtype``, which wraps
    modulo the destination width for any integer destination.
    """
    # Replace non-finite entries first so the integer cast below is defined.
    finite = np.where(np.isfinite(values), values, 0)
    return finite.astype(np.int64).astype(dst_dtype)


def clamp_to_range_int(values: np.ndarray, dst_dtype):
    """Clamp ``values`` to the range of integer ``dst_dtype`` and cast."""
    info = ml_dtypes.iinfo(dst_dtype)
    is_int_type = is_sub_int(values.dtype)
    # Widen before clipping so the comparison against the limits cannot overflow.
    temp_dtype = np.int64 if is_int_type else np.float64
    widened = values.astype(temp_dtype, copy=False)
    return np.clip(widened, info.min, info.max).astype(dst_dtype)


def clamp_to_range_float(values: np.ndarray, dst_dtype):
    """Clamp ``values`` to the finite range of float ``dst_dtype`` and cast."""
    info = ml_dtypes.finfo(dst_dtype)
    return np.clip(values, info.min, info.max).astype(dst_dtype)


def apply_valid_shape(values: np.ndarray, valid_shape, dst_dtype):
    """Return a ``dst_dtype`` copy of ``values`` that is zero outside the valid region."""
    vr, vc = valid_shape
    masked = np.zeros_like(values, dtype=dst_dtype)
    masked[:vr, :vc] = values[:vr, :vc]
    return masked


def generate_golden(case):
    """Build ``(input, golden)`` for one case.

    The input is a random tile in the source dtype. The golden is the
    converted tile, zeroed outside the case's valid shape.
    """
    src_dtype = case["src_dtype"]
    dst_dtype = case["dst_dtype"]
    src_dtype_norm = normalize_dtype(src_dtype)
    dst_dtype_norm = normalize_dtype(dst_dtype)
    shape = case["shape"]
    round_mode = case.get("round_mode")

    input_arr = make_input(src_dtype, shape)
    converted = convert(input_arr, src_dtype_norm, dst_dtype_norm, round_mode)
    golden = apply_valid_shape(converted, case["valid_shape"], dst_dtype_norm)

    return input_arr, golden


if __name__ == "__main__":
    np.random.seed(19)

    validate_cases(CASES)

    for case in CASES:
        setup_case_rng(case)
        input_arr, golden = generate_golden(case)

        save_case_data(case["name"], {"input": input_arr, "golden": golden})
        src_dtype = case["src_dtype"]
        dst_dtype = case["dst_dtype"]
        # Dtypes are either classes (print their name) or string aliases (print as-is).
        src_name = src_dtype.__name__ if isinstance(src_dtype, type) else src_dtype
        dst_name = dst_dtype.__name__ if isinstance(dst_dtype, type) else dst_dtype
        print(
            f"[INFO] gen_data: {case['name']} shape={case['shape']} "
            f"src_dtype={src_name} dst_dtype={dst_name} "
            f"round_mode={case.get('round_mode')}"
        )
b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tcvt/gen_tcvt_pto.py new file mode 100644 index 000000000..0ec100127 --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tcvt/gen_tcvt_pto.py @@ -0,0 +1,114 @@ +#!/usr/bin/python3 +# Copyright (c) 2026 Huawei Technologies Co., Ltd. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. + +"""Script to generate tcvt.pto""" + +import cases + +def gen_rmode_attr(rmode): + return f"#pto" + +def gen_kernel(case, idx=0): + src_dtype = cases._DTYPE_NAME.get(case["src_dtype"], case["src_dtype"]) + dst_dtype = cases._DTYPE_NAME.get(case["dst_dtype"], case["dst_dtype"]) + rows, cols = case["shape"] + v_rows, v_cols = case["valid_shape"] + + shape_suffix = f"{rows}x{cols}" if v_cols == cols else f"{v_rows}x{v_cols}" + kernel_name = f"TCVT_{src_dtype}_to_{dst_dtype}_{shape_suffix}" + + rmode = "RINT" + rmode_command = "default RINT" + if "round_mode" in case: + rmode = case['round_mode'] + kernel_name = f"TCVT_{src_dtype}_to_{dst_dtype}_{rmode.lower()}_{shape_suffix}" + if rmode != "RINT": + rmode_command = f"explicit {rmode}" + + stride = rows * cols + + tile_valid = "" if v_rows == rows and v_cols == cols else f", valid={v_rows}x{v_cols}" + tile_src = f"!pto.tile_buf" + tile_dst = f"!pto.tile_buf" + + const_vals = sorted(set([0, 1, rows, cols, v_rows, v_cols, stride])) + longest_const = len(str(const_vals[-1])) + const_defs = [f" %c{i:<{longest_const}} = arith.constant {i:<{longest_const}} : 
index" for i in const_vals] + + lines = [ + f" // Case {idx}: {src_dtype} -> {dst_dtype}, {rmode_command}", + f" func.func @{kernel_name}(%src_ptr: !pto.ptr<{src_dtype}>, %dst_ptr: !pto.ptr<{dst_dtype}>) attributes {{ pto.entry }} {{", + ] + lines.extend(const_defs) + lines.extend([ + "", + f" %src_view = pto.make_tensor_view %src_ptr,", + f" shape = [%c1, %c1, %c1, %c{rows}, %c{cols}],", + f" strides = [%c{stride}, %c{stride}, %c{stride}, %c{cols}, %c1]", + f" : !pto.tensor_view<1x1x1x{rows}x{cols}x{src_dtype}>", + f" %dst_view = pto.make_tensor_view %dst_ptr,", + f" shape = [%c1, %c1, %c1, %c{rows}, %c{cols}],", + f" strides = [%c{stride}, %c{stride}, %c{stride}, %c{cols}, %c1]", + f" : !pto.tensor_view<1x1x1x{rows}x{cols}x{dst_dtype}>", + "", + f" %src_part = pto.partition_view %src_view,", + f" offsets = [%c0, %c0, %c0, %c0, %c0],", + f" sizes = [%c1, %c1, %c1, %c{v_rows}, %c{v_cols}]", + f" : !pto.tensor_view<1x1x1x{rows}x{cols}x{src_dtype}> -> !pto.partition_tensor_view<1x1x1x{v_rows}x{v_cols}x{src_dtype}>", + f" %dst_part = pto.partition_view %dst_view,", + f" offsets = [%c0, %c0, %c0, %c0, %c0],", + f" sizes = [%c1, %c1, %c1, %c{v_rows}, %c{v_cols}]", + f" : !pto.tensor_view<1x1x1x{rows}x{cols}x{dst_dtype}> -> !pto.partition_tensor_view<1x1x1x{v_rows}x{v_cols}x{dst_dtype}>", + "", + f" %src = pto.alloc_tile", + f" : {tile_src}", + f" %dst = pto.alloc_tile", + f" : {tile_dst}", + "", + f" pto.tload ins(%src_part : !pto.partition_tensor_view<1x1x1x{v_rows}x{v_cols}x{src_dtype}>)", + f" outs(%src : {tile_src})", + "", + f" pto.tcvt ins(%src {{rmode = {gen_rmode_attr(rmode)}}} : {tile_src})" if rmode != "RINT" else f" pto.tcvt ins(%src : {tile_src})", + f" outs(%dst : {tile_dst})", + "", + f" pto.tstore ins(%dst : {tile_dst})", + f" outs(%dst_part : !pto.partition_tensor_view<1x1x1x{v_rows}x{v_cols}x{dst_dtype}>)", + f" return", + f" }}", + "" + ]) + return "\n".join(lines) + +header = """// Copyright (c) 2026 Huawei Technologies Co., Ltd. 
+// This program is free software, you can redistribute it and/or modify it under the terms and conditions of +// CANN Open Software License Agreement Version 2.0 (the "License"). +// Please refer to the License for details. You may not use this file except in compliance with the License. +// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +// See LICENSE in the root of the software repository for the full text of the License. + +// TileLang ST kernels for pto.tcvt. +// Compiled by ptoas --enable-insert-sync --enable-tile-op-expand --pto-backend=vpto +// to produce a fatobj object. +// Generated by gen_tcvt_pto.py from cases.py. + +module { +""" + +footer = "\n}\n" + +if __name__ == "__main__": + from pathlib import Path + HERE = Path(__file__).parent + + with open(HERE / "tcvt.pto", "w") as f: + f.write(header) + f.write("\n".join(gen_kernel(case, idx) for idx, case in enumerate(cases.CASES))) + f.write(footer) + print(f"Generated {(HERE / 'tcvt.pto').as_posix()!r}") diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tcvt/launch.cpp b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tcvt/launch.cpp new file mode 100644 index 000000000..a54eabdfb --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tcvt/launch.cpp @@ -0,0 +1,40 @@ +// Copyright (c) 2026 Huawei Technologies Co., Ltd. +// This program is free software, you can redistribute it and/or modify it under the terms and conditions of +// CANN Open Software License Agreement Version 2.0 (the "License"). +// Please refer to the License for details. You may not use this file except in compliance with the License. +// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. 
+// See LICENSE in the root of the software repository for the full text of the License. + +#include + +#ifndef AICORE +#define AICORE [aicore] +#endif + +extern "C" __global__ AICORE void TCVT_f32_to_f16_1x128(__gm__ float *src, __gm__ uint16_t *dst); +extern "C" __global__ AICORE void TCVT_f16_to_f32_1x129(__gm__ uint16_t *src, __gm__ float *dst); +extern "C" __global__ AICORE void TCVT_bf16_to_i32_1x128(__gm__ uint16_t *src, __gm__ int32_t *dst); +extern "C" __global__ AICORE void TCVT_ui8_to_ui16_1x128(__gm__ uint8_t *src, __gm__ uint16_t *dst); + +void LaunchTCVT_ui8_to_ui16_1x128(void *src, void *dst, void *stream) { + TCVT_ui8_to_ui16_1x128<<<1, nullptr, stream>>>((__gm__ uint8_t *)src, (__gm__ uint16_t *)dst); +} + + + +void LaunchTCVT_bf16_to_i32_1x128(void *src, void *dst, void *stream) { + TCVT_bf16_to_i32_1x128<<<1, nullptr, stream>>>((__gm__ uint16_t *)src, (__gm__ int32_t *)dst); +} + + + +void LaunchTCVT_f16_to_f32_1x129(void *src, void *dst, void *stream) { + TCVT_f16_to_f32_1x129<<<1, nullptr, stream>>>((__gm__ uint16_t *)src, (__gm__ float *)dst); +} + + + +void LaunchTCVT_f32_to_f16_1x128(void *src, void *dst, void *stream) { + TCVT_f32_to_f16_1x128<<<1, nullptr, stream>>>((__gm__ float *)src, (__gm__ uint16_t *)dst); +} diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tcvt/main.cpp b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tcvt/main.cpp new file mode 100644 index 000000000..e8b09967b --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tcvt/main.cpp @@ -0,0 +1,251 @@ +// Copyright (c) 2026 Huawei Technologies Co., Ltd. +// This program is free software, you can redistribute it and/or modify it under the terms and conditions of +// CANN Open Software License Agreement Version 2.0 (the "License"). +// Please refer to the License for details. You may not use this file except in compliance with the License. 
+// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +// See LICENSE in the root of the software repository for the full text of the License. + +#include "acl/acl.h" +#include "test_common.h" +#include +#include +#include +#include +#include + +using namespace PtoTestCommon; + +void LaunchTCVT_f32_to_i32_round_16x64(void *src, void *dst, void *stream); +void LaunchTCVT_f32_to_f16_rint_16x64(void *src, void *dst, void *stream); +void LaunchTCVT_f32_to_f16_1x128(void *src, void *dst, void *stream); +void LaunchTCVT_f32_to_f16_4x32(void *src, void *dst, void *stream); +void LaunchTCVT_f32_to_f16_4x65(void *src, void *dst, void *stream); +void LaunchTCVT_f32_to_f16_1x129(void *src, void *dst, void *stream); +void LaunchTCVT_f32_to_bf16_2x64(void *src, void *dst, void *stream); +void LaunchTCVT_f32_to_bf16_2x128(void *src, void *dst, void *stream); +void LaunchTCVT_f32_to_bf16_4x200(void *src, void *dst, void *stream); +void LaunchTCVT_f32_to_i16_1x128(void *src, void *dst, void *stream); +void LaunchTCVT_f32_to_i16_4x32(void *src, void *dst, void *stream); +void LaunchTCVT_f32_to_i16_4x65(void *src, void *dst, void *stream); +void LaunchTCVT_f32_to_i16_1x129(void *src, void *dst, void *stream); +void LaunchTCVT_f32_to_i32_2x64(void *src, void *dst, void *stream); +void LaunchTCVT_f32_to_i32_2x128(void *src, void *dst, void *stream); +void LaunchTCVT_f32_to_i32_4x200(void *src, void *dst, void *stream); +void LaunchTCVT_f32_to_i64_1x128(void *src, void *dst, void *stream); +void LaunchTCVT_f32_to_i64_4x32(void *src, void *dst, void *stream); +void LaunchTCVT_f32_to_i64_4x65(void *src, void *dst, void *stream); +void LaunchTCVT_f32_to_i64_1x129(void *src, void *dst, void *stream); +void LaunchTCVT_f32_to_f32_2x64(void *src, void *dst, void *stream); +void LaunchTCVT_f32_to_f32_2x128(void *src, void *dst, void 
*stream); +void LaunchTCVT_f32_to_f32_4x200(void *src, void *dst, void *stream); +void LaunchTCVT_f16_to_f32_1x128(void *src, void *dst, void *stream); +void LaunchTCVT_f16_to_f32_4x32(void *src, void *dst, void *stream); +void LaunchTCVT_f16_to_f32_4x65(void *src, void *dst, void *stream); +void LaunchTCVT_f16_to_f32_1x129(void *src, void *dst, void *stream); +void LaunchTCVT_f16_to_i32_2x64(void *src, void *dst, void *stream); +void LaunchTCVT_f16_to_i32_2x128(void *src, void *dst, void *stream); +void LaunchTCVT_f16_to_i32_4x200(void *src, void *dst, void *stream); +void LaunchTCVT_f16_to_i16_1x128(void *src, void *dst, void *stream); +void LaunchTCVT_f16_to_i16_4x32(void *src, void *dst, void *stream); +void LaunchTCVT_f16_to_i16_4x65(void *src, void *dst, void *stream); +void LaunchTCVT_f16_to_i16_1x129(void *src, void *dst, void *stream); +void LaunchTCVT_f16_to_si8_2x64(void *src, void *dst, void *stream); +void LaunchTCVT_f16_to_si8_2x128(void *src, void *dst, void *stream); +void LaunchTCVT_f16_to_si8_4x200(void *src, void *dst, void *stream); +void LaunchTCVT_f16_to_ui8_1x128(void *src, void *dst, void *stream); +void LaunchTCVT_f16_to_ui8_4x32(void *src, void *dst, void *stream); +void LaunchTCVT_f16_to_ui8_4x65(void *src, void *dst, void *stream); +void LaunchTCVT_f16_to_ui8_1x129(void *src, void *dst, void *stream); +void LaunchTCVT_bf16_to_f32_2x64(void *src, void *dst, void *stream); +void LaunchTCVT_bf16_to_f32_2x128(void *src, void *dst, void *stream); +void LaunchTCVT_bf16_to_f32_4x200(void *src, void *dst, void *stream); +void LaunchTCVT_bf16_to_f16_1x128(void *src, void *dst, void *stream); +void LaunchTCVT_bf16_to_f16_4x32(void *src, void *dst, void *stream); +void LaunchTCVT_bf16_to_f16_4x65(void *src, void *dst, void *stream); +void LaunchTCVT_bf16_to_f16_1x129(void *src, void *dst, void *stream); +void LaunchTCVT_bf16_to_i32_1x128(void *src, void *dst, void *stream); +void LaunchTCVT_bf16_to_i32_2x64(void *src, void *dst, void *stream); 
+void LaunchTCVT_bf16_to_i32_2x128(void *src, void *dst, void *stream); +void LaunchTCVT_bf16_to_i32_4x200(void *src, void *dst, void *stream); +void LaunchTCVT_ui8_to_f16_1x128(void *src, void *dst, void *stream); +void LaunchTCVT_ui8_to_f16_4x32(void *src, void *dst, void *stream); +void LaunchTCVT_ui8_to_f16_4x65(void *src, void *dst, void *stream); +void LaunchTCVT_ui8_to_f16_1x129(void *src, void *dst, void *stream); +void LaunchTCVT_ui8_to_ui16_1x128(void *src, void *dst, void *stream); +void LaunchTCVT_ui8_to_ui16_2x64(void *src, void *dst, void *stream); +void LaunchTCVT_ui8_to_ui16_2x128(void *src, void *dst, void *stream); +void LaunchTCVT_ui8_to_ui16_4x200(void *src, void *dst, void *stream); +void LaunchTCVT_si8_to_f16_1x128(void *src, void *dst, void *stream); +void LaunchTCVT_si8_to_f16_4x32(void *src, void *dst, void *stream); +void LaunchTCVT_si8_to_f16_4x65(void *src, void *dst, void *stream); +void LaunchTCVT_si8_to_f16_1x129(void *src, void *dst, void *stream); +void LaunchTCVT_si8_to_si16_2x64(void *src, void *dst, void *stream); +void LaunchTCVT_si8_to_si16_2x128(void *src, void *dst, void *stream); +void LaunchTCVT_si8_to_si16_4x200(void *src, void *dst, void *stream); +void LaunchTCVT_si8_to_i32_1x128(void *src, void *dst, void *stream); +void LaunchTCVT_si8_to_i32_4x32(void *src, void *dst, void *stream); +void LaunchTCVT_si8_to_i32_4x65(void *src, void *dst, void *stream); +void LaunchTCVT_si8_to_i32_1x129(void *src, void *dst, void *stream); +void LaunchTCVT_i16_to_ui8_2x64(void *src, void *dst, void *stream); +void LaunchTCVT_i16_to_ui8_2x128(void *src, void *dst, void *stream); +void LaunchTCVT_i16_to_ui8_4x200(void *src, void *dst, void *stream); +void LaunchTCVT_i16_to_f16_1x128(void *src, void *dst, void *stream); +void LaunchTCVT_i16_to_f16_4x32(void *src, void *dst, void *stream); +void LaunchTCVT_i16_to_f16_4x65(void *src, void *dst, void *stream); +void LaunchTCVT_i16_to_f16_1x129(void *src, void *dst, void *stream); +void 
LaunchTCVT_i16_to_f32_2x64(void *src, void *dst, void *stream); +void LaunchTCVT_i16_to_f32_2x128(void *src, void *dst, void *stream); +void LaunchTCVT_i16_to_f32_4x200(void *src, void *dst, void *stream); +void LaunchTCVT_i16_to_ui32_1x128(void *src, void *dst, void *stream); +void LaunchTCVT_i16_to_ui32_4x32(void *src, void *dst, void *stream); +void LaunchTCVT_i16_to_ui32_4x65(void *src, void *dst, void *stream); +void LaunchTCVT_i16_to_ui32_1x129(void *src, void *dst, void *stream); +void LaunchTCVT_i16_to_i32_2x64(void *src, void *dst, void *stream); +void LaunchTCVT_i16_to_i32_2x128(void *src, void *dst, void *stream); +void LaunchTCVT_i16_to_i32_4x200(void *src, void *dst, void *stream); +void LaunchTCVT_i32_to_f32_1x128(void *src, void *dst, void *stream); +void LaunchTCVT_i32_to_f32_4x32(void *src, void *dst, void *stream); +void LaunchTCVT_i32_to_f32_4x65(void *src, void *dst, void *stream); +void LaunchTCVT_i32_to_f32_1x129(void *src, void *dst, void *stream); +void LaunchTCVT_i32_to_i16_2x64(void *src, void *dst, void *stream); +void LaunchTCVT_i32_to_i16_2x128(void *src, void *dst, void *stream); +void LaunchTCVT_i32_to_i16_4x200(void *src, void *dst, void *stream); +void LaunchTCVT_i32_to_i64_1x128(void *src, void *dst, void *stream); +void LaunchTCVT_i32_to_i64_4x32(void *src, void *dst, void *stream); +void LaunchTCVT_i32_to_i64_4x65(void *src, void *dst, void *stream); +void LaunchTCVT_i32_to_i64_1x129(void *src, void *dst, void *stream); +void LaunchTCVT_i32_to_ui8_2x64(void *src, void *dst, void *stream); +void LaunchTCVT_i32_to_ui8_2x128(void *src, void *dst, void *stream); +void LaunchTCVT_i32_to_ui8_4x200(void *src, void *dst, void *stream); +void LaunchTCVT_i32_to_ui16_1x128(void *src, void *dst, void *stream); +void LaunchTCVT_i32_to_ui16_4x32(void *src, void *dst, void *stream); +void LaunchTCVT_i32_to_ui16_4x65(void *src, void *dst, void *stream); +void LaunchTCVT_i32_to_ui16_1x129(void *src, void *dst, void *stream); +void 
LaunchTCVT_ui32_to_i16_2x64(void *src, void *dst, void *stream); +void LaunchTCVT_ui32_to_i16_2x128(void *src, void *dst, void *stream); +void LaunchTCVT_ui32_to_i16_4x200(void *src, void *dst, void *stream); +void LaunchTCVT_ui32_to_ui16_1x128(void *src, void *dst, void *stream); +void LaunchTCVT_ui32_to_ui16_4x32(void *src, void *dst, void *stream); +void LaunchTCVT_ui32_to_ui16_4x65(void *src, void *dst, void *stream); +void LaunchTCVT_ui32_to_ui16_1x129(void *src, void *dst, void *stream); +void LaunchTCVT_ui32_to_ui8_2x64(void *src, void *dst, void *stream); +void LaunchTCVT_ui32_to_ui8_2x128(void *src, void *dst, void *stream); +void LaunchTCVT_ui32_to_ui8_4x200(void *src, void *dst, void *stream); +void LaunchTCVT_i64_to_f32_1x128(void *src, void *dst, void *stream); +void LaunchTCVT_i64_to_f32_4x32(void *src, void *dst, void *stream); +void LaunchTCVT_i64_to_f32_4x65(void *src, void *dst, void *stream); +void LaunchTCVT_i64_to_f32_1x129(void *src, void *dst, void *stream); +void LaunchTCVT_i64_to_i32_2x64(void *src, void *dst, void *stream); +void LaunchTCVT_i64_to_i32_2x128(void *src, void *dst, void *stream); +void LaunchTCVT_i64_to_i32_4x200(void *src, void *dst, void *stream); + +using LaunchFn = void (*)(void *, void *, void *); + +struct TestCase { + const char *name; + LaunchFn launch; + size_t srcRows; + size_t srcCols; + size_t dstRows; + size_t dstCols; + size_t srcElemSize; + size_t dstElemSize; +}; + +static const TestCase kCases[] = { +{"f32_to_f16_1x128", LaunchTCVT_f32_to_f16_1x128, 1, 128, 1, 128, sizeof(float), sizeof(uint16_t)}, +{"f16_to_f32_1x129", LaunchTCVT_f16_to_f32_1x129, 1, 256, 1, 256, sizeof(uint16_t), sizeof(float)}, +{"bf16_to_i32_1x128", LaunchTCVT_bf16_to_i32_1x128, 1, 128, 1, 128, sizeof(uint16_t), sizeof(int32_t)}, +{"ui8_to_ui16_1x128", LaunchTCVT_ui8_to_ui16_1x128, 1, 128, 1, 128, sizeof(uint8_t), sizeof(uint16_t)}, +}; +static constexpr size_t kNumCases = sizeof(kCases) / sizeof(kCases[0]); + +static int RunCase(const 
TestCase &tc, int deviceId, aclrtStream stream) { + (void)deviceId; + int rc = 0; + const size_t srcElemCount = tc.srcRows * tc.srcCols; + const size_t dstElemCount = tc.dstRows * tc.dstCols; + size_t srcFileSize = srcElemCount * tc.srcElemSize; + size_t dstFileSize = dstElemCount * tc.dstElemSize; + + std::printf("[INFO] === case: %s (src=%zux%zu, dst=%zux%zu) ===\n", + tc.name, tc.srcRows, tc.srcCols, tc.dstRows, tc.dstCols); + + std::string caseDir = std::string("./") + tc.name; + + void *srcHost = nullptr; + void *dstHost = nullptr; + void *srcDevice = nullptr; + void *dstDevice = nullptr; + + aclrtMallocHost(&srcHost, srcFileSize); + aclrtMallocHost(&dstHost, dstFileSize); + + aclrtMalloc(&srcDevice, srcFileSize, ACL_MEM_MALLOC_HUGE_FIRST); + aclrtMalloc(&dstDevice, dstFileSize, ACL_MEM_MALLOC_HUGE_FIRST); + + if (!ReadFile((caseDir + "/input.bin").c_str(), srcFileSize, srcHost, srcFileSize)) { + std::fprintf(stderr, "[ERROR] failed to read %s/input.bin\n", caseDir.c_str()); + rc = 1; + } + + if (rc == 0) { + aclrtMemcpy(srcDevice, srcFileSize, srcHost, srcFileSize, ACL_MEMCPY_HOST_TO_DEVICE); + tc.launch(srcDevice, dstDevice, stream); + aclrtSynchronizeStream(stream); + aclrtMemcpy(dstHost, dstFileSize, dstDevice, dstFileSize, ACL_MEMCPY_DEVICE_TO_HOST); + } + + if (rc == 0 && !WriteFile((caseDir + "/output.bin").c_str(), dstHost, dstFileSize)) { + std::fprintf(stderr, "[ERROR] failed to write %s/output.bin\n", caseDir.c_str()); + rc = 1; + } + + if (srcDevice != nullptr) + aclrtFree(srcDevice); + if (dstDevice != nullptr) + aclrtFree(dstDevice); + if (srcHost != nullptr) + aclrtFreeHost(srcHost); + if (dstHost != nullptr) + aclrtFreeHost(dstHost); + + if (rc == 0) + std::printf("[INFO] case %s done\n", tc.name); + return rc; +} + +int main(int argc, char *argv[]) { + const char *caseFilter = (argc > 1) ? 
argv[1] : nullptr; + + int rc = 0; + int deviceId = 0; + aclrtStream stream = nullptr; + + aclInit(nullptr); + if (const char *envDevice = std::getenv("ACL_DEVICE_ID")) { + deviceId = std::atoi(envDevice); + } + aclrtSetDevice(deviceId); + aclrtCreateStream(&stream); + + for (size_t i = 0; i < kNumCases; ++i) { + if (caseFilter != nullptr && std::strcmp(kCases[i].name, caseFilter) != 0) { + continue; + } + int ret = RunCase(kCases[i], deviceId, stream); + if (ret != 0) { + std::fprintf(stderr, "[ERROR] case %s failed\n", kCases[i].name); + rc = 1; + break; + } + } + + if (stream != nullptr) + aclrtDestroyStream(stream); + aclrtResetDevice(deviceId); + aclFinalize(); + + return rc; +} diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tcvt/tcvt.pto b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tcvt/tcvt.pto new file mode 100644 index 000000000..3d3bfad1a --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tcvt/tcvt.pto @@ -0,0 +1,180 @@ +// Copyright (c) 2026 Huawei Technologies Co., Ltd. +// This program is free software, you can redistribute it and/or modify it under the terms and conditions of +// CANN Open Software License Agreement Version 2.0 (the "License"). +// Please refer to the License for details. You may not use this file except in compliance with the License. +// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +// See LICENSE in the root of the software repository for the full text of the License. + +// TileLang ST kernels for pto.tcvt. +// Compiled by ptoas --enable-insert-sync --enable-tile-op-expand --pto-backend=vpto +// to produce a fatobj object. +// Generated by gen_tcvt_pto.py from cases.py. 
+ +module attributes {pto.target_arch = "a5", pto.kernel_kind = #pto.kernel_kind} { + // Case 0: f32 -> i32, default RINT + func.func @TCVT_f32_to_f16_1x128(%src_ptr: !pto.ptr, %dst_ptr: !pto.ptr) attributes { pto.entry , pto.kernel} { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c128 = arith.constant 128 : index + + %src_view = pto.make_tensor_view %src_ptr, + shape = [%c1, %c1, %c1, %c1, %c128], + strides = [%c128, %c128, %c128, %c128, %c1] + : !pto.tensor_view<1x1x1x1x128xf32> + %dst_view = pto.make_tensor_view %dst_ptr, + shape = [%c1, %c1, %c1, %c1, %c128], + strides = [%c128, %c128, %c128, %c128, %c1] + : !pto.tensor_view<1x1x1x1x128xf16> + + %src_part = pto.partition_view %src_view, + offsets = [%c0, %c0, %c0, %c0, %c0], + sizes = [%c1, %c1, %c1, %c1, %c128] + : !pto.tensor_view<1x1x1x1x128xf32> -> !pto.partition_tensor_view<1x1x1x1x128xf32> + %dst_part = pto.partition_view %dst_view, + offsets = [%c0, %c0, %c0, %c0, %c0], + sizes = [%c1, %c1, %c1, %c1, %c128] + : !pto.tensor_view<1x1x1x1x128xf16> -> !pto.partition_tensor_view<1x1x1x1x128xf16> + + %src = pto.alloc_tile + : !pto.tile_buf + %dst = pto.alloc_tile + : !pto.tile_buf + + pto.tload ins(%src_part : !pto.partition_tensor_view<1x1x1x1x128xf32>) + outs(%src : !pto.tile_buf) + + pto.tcvt ins(%src : !pto.tile_buf) + outs(%dst : !pto.tile_buf) + + pto.tstore ins(%dst : !pto.tile_buf) + outs(%dst_part : !pto.partition_tensor_view<1x1x1x1x128xf16>) + return + } + + // Case 6: f32 -> f16, default RINT + + func.func @TCVT_f16_to_f32_1x129(%src_ptr: !pto.ptr, %dst_ptr: !pto.ptr) attributes { pto.entry , pto.kernel} { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c129 = arith.constant 129 : index + %c256 = arith.constant 256 : index + + %src_view = pto.make_tensor_view %src_ptr, + shape = [%c1, %c1, %c1, %c1, %c256], + strides = [%c256, %c256, %c256, %c256, %c1] + : !pto.tensor_view<1x1x1x1x256xf16> + %dst_view = pto.make_tensor_view %dst_ptr, + shape = [%c1, %c1, 
%c1, %c1, %c256], + strides = [%c256, %c256, %c256, %c256, %c1] + : !pto.tensor_view<1x1x1x1x256xf32> + + %src_part = pto.partition_view %src_view, + offsets = [%c0, %c0, %c0, %c0, %c0], + sizes = [%c1, %c1, %c1, %c1, %c129] + : !pto.tensor_view<1x1x1x1x256xf16> -> !pto.partition_tensor_view<1x1x1x1x129xf16> + %dst_part = pto.partition_view %dst_view, + offsets = [%c0, %c0, %c0, %c0, %c0], + sizes = [%c1, %c1, %c1, %c1, %c129] + : !pto.tensor_view<1x1x1x1x256xf32> -> !pto.partition_tensor_view<1x1x1x1x129xf32> + + %src = pto.alloc_tile + : !pto.tile_buf + %dst = pto.alloc_tile + : !pto.tile_buf + + pto.tload ins(%src_part : !pto.partition_tensor_view<1x1x1x1x129xf16>) + outs(%src : !pto.tile_buf) + + pto.tcvt ins(%src : !pto.tile_buf) + outs(%dst : !pto.tile_buf) + + pto.tstore ins(%dst : !pto.tile_buf) + outs(%dst_part : !pto.partition_tensor_view<1x1x1x1x129xf32>) + return + } + + // Case 54: f16 -> i32, default RINT + + func.func @TCVT_bf16_to_i32_1x128(%src_ptr: !pto.ptr, %dst_ptr: !pto.ptr) attributes { pto.entry , pto.kernel} { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c128 = arith.constant 128 : index + + %src_view = pto.make_tensor_view %src_ptr, + shape = [%c1, %c1, %c1, %c1, %c128], + strides = [%c128, %c128, %c128, %c128, %c1] + : !pto.tensor_view<1x1x1x1x128xbf16> + %dst_view = pto.make_tensor_view %dst_ptr, + shape = [%c1, %c1, %c1, %c1, %c128], + strides = [%c128, %c128, %c128, %c128, %c1] + : !pto.tensor_view<1x1x1x1x128xi32> + + %src_part = pto.partition_view %src_view, + offsets = [%c0, %c0, %c0, %c0, %c0], + sizes = [%c1, %c1, %c1, %c1, %c128] + : !pto.tensor_view<1x1x1x1x128xbf16> -> !pto.partition_tensor_view<1x1x1x1x128xbf16> + %dst_part = pto.partition_view %dst_view, + offsets = [%c0, %c0, %c0, %c0, %c0], + sizes = [%c1, %c1, %c1, %c1, %c128] + : !pto.tensor_view<1x1x1x1x128xi32> -> !pto.partition_tensor_view<1x1x1x1x128xi32> + + %src = pto.alloc_tile + : !pto.tile_buf + %dst = pto.alloc_tile + : !pto.tile_buf + + 
pto.tload ins(%src_part : !pto.partition_tensor_view<1x1x1x1x128xbf16>) + outs(%src : !pto.tile_buf) + + pto.tcvt ins(%src : !pto.tile_buf) + outs(%dst : !pto.tile_buf) + + pto.tstore ins(%dst : !pto.tile_buf) + outs(%dst_part : !pto.partition_tensor_view<1x1x1x1x128xi32>) + return + } + + // Case 97: bf16 -> i32, default RINT + + func.func @TCVT_ui8_to_ui16_1x128(%src_ptr: !pto.ptr, %dst_ptr: !pto.ptr) attributes { pto.entry , pto.kernel} { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c128 = arith.constant 128 : index + + %src_view = pto.make_tensor_view %src_ptr, + shape = [%c1, %c1, %c1, %c1, %c128], + strides = [%c128, %c128, %c128, %c128, %c1] + : !pto.tensor_view<1x1x1x1x128xui8> + %dst_view = pto.make_tensor_view %dst_ptr, + shape = [%c1, %c1, %c1, %c1, %c128], + strides = [%c128, %c128, %c128, %c128, %c1] + : !pto.tensor_view<1x1x1x1x128xui16> + + %src_part = pto.partition_view %src_view, + offsets = [%c0, %c0, %c0, %c0, %c0], + sizes = [%c1, %c1, %c1, %c1, %c128] + : !pto.tensor_view<1x1x1x1x128xui8> -> !pto.partition_tensor_view<1x1x1x1x128xui8> + %dst_part = pto.partition_view %dst_view, + offsets = [%c0, %c0, %c0, %c0, %c0], + sizes = [%c1, %c1, %c1, %c1, %c128] + : !pto.tensor_view<1x1x1x1x128xui16> -> !pto.partition_tensor_view<1x1x1x1x128xui16> + + %src = pto.alloc_tile + : !pto.tile_buf + %dst = pto.alloc_tile + : !pto.tile_buf + + pto.tload ins(%src_part : !pto.partition_tensor_view<1x1x1x1x128xui8>) + outs(%src : !pto.tile_buf) + + pto.tcvt ins(%src : !pto.tile_buf) + outs(%dst : !pto.tile_buf) + + pto.tstore ins(%dst : !pto.tile_buf) + outs(%dst_part : !pto.partition_tensor_view<1x1x1x1x128xui16>) + return + } + + // Case 111: ui8 -> ui16, default RINT +} diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tdiv/CMakeLists.txt b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tdiv/CMakeLists.txt new file mode 100644 index 000000000..446932f9e --- /dev/null +++ 
b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tdiv/CMakeLists.txt @@ -0,0 +1,9 @@ +# Copyright (c) 2026 Huawei Technologies Co., Ltd. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. + +pto_tilelang_vec_st(tdiv) diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tdiv/cases.py b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tdiv/cases.py new file mode 100644 index 000000000..83f9719bf --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tdiv/cases.py @@ -0,0 +1,213 @@ +# Copyright (c) 2026 Huawei Technologies Co., Ltd. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. + +# coding=utf-8 + +"""Single source of truth for tdiv ST test cases. + +Each case defines: + - name: case identifier, used as subdirectory name and by main.cpp kCases[]. + - dtype: numpy dtype (e.g. np.float32). + - shape: (rows, cols) — allocated tile dimensions. 
+ - valid_shape: (valid_rows, valid_cols) — effective computation region. + - eps: tolerance for numpy.allclose (atol and rtol). + - precision_type: optional, "default" or "high_precision". + - test_pattern: optional, "normal", "boundary", "subnormal", "overflow", "nan_inf" + +gen_data.py and compare.py both import this list to avoid redundant definitions. +""" + +import numpy as np + +CASES = [ + # ============================================================ + # Normal cases - basic functionality (DEFAULT precision mode) + # ============================================================ + { + "name": "f32_16x64", + "dtype": np.float32, + "shape": (16, 64), + "valid_shape": (16, 64), + "eps": 1e-6, + "test_pattern": "normal", + }, + { + "name": "f32_32x32", + "dtype": np.float32, + "shape": (32, 32), + "valid_shape": (32, 32), + "eps": 1e-6, + "test_pattern": "normal", + }, + { + "name": "f32_64x64", + "dtype": np.float32, + "shape": (64, 64), + "valid_shape": (64, 64), + "eps": 1e-6, + "test_pattern": "normal", + }, + { + "name": "f16_16x256", + "dtype": np.float16, + "shape": (16, 256), + "valid_shape": (16, 256), + "eps": 1e-3, + "test_pattern": "normal", + }, + + # ============================================================ + # HIGH_PRECISION mode - comprehensive boundary tests + # ============================================================ + # Precision-sensitive ratios (1/3, 1/7, 7/3) - tests three-candidate search + { + "name": "f32_16x64_hp_precision", + "dtype": np.float32, + "shape": (16, 64), + "valid_shape": (16, 64), + "eps": 1e-6, + "precision_type": "high_precision", + "test_pattern": "precision_sensitive", + "ulp_tolerance": 1, # Allow ±1 ULP for high-precision algorithm + }, + { + "name": "f16_16x64_hp_precision", + "dtype": np.float16, + "shape": (16, 64), + "valid_shape": (16, 64), + "eps": 1e-3, + "precision_type": "high_precision", + "test_pattern": "precision_sensitive", + "ulp_tolerance": 1, + }, + + # Subnormal numbers - tests denormal 
normalization and compensation + { + "name": "f32_16x64_hp_subnormal", + "dtype": np.float32, + "shape": (16, 64), + "valid_shape": (16, 64), + "eps": 1e-6, + "precision_type": "high_precision", + "test_pattern": "subnormal", + "ulp_tolerance": 2, # Subnormal handling may have ±2 ULP variance + }, + { + "name": "f16_16x64_hp_subnormal", + "dtype": np.float16, + "shape": (16, 64), + "valid_shape": (16, 64), + "eps": 1e-3, + "precision_type": "high_precision", + "test_pattern": "subnormal", + "ulp_tolerance": 2, + }, + +# Overflow/Underflow boundaries - tests exponent handling + { + "name": "f32_16x64_hp_overflow", + "dtype": np.float32, + "shape": (16, 64), + "valid_shape": (16, 64), + "eps": 1e-6, + "precision_type": "high_precision", + "test_pattern": "overflow", + }, + { + "name": "f16_16x64_hp_overflow", + "dtype": np.float16, + "shape": (16, 64), + "valid_shape": (16, 64), + "eps": 1e-3, + "precision_type": "high_precision", + "test_pattern": "overflow", + }, + + # Different shapes - test tile size variations + { + "name": "f32_32x32_hp", + "dtype": np.float32, + "shape": (32, 32), + "valid_shape": (32, 32), + "eps": 1e-5, + "precision_type": "high_precision", + "test_pattern": "precision_sensitive", + "ulp_tolerance": 2, + }, + { + "name": "f32_64x64_hp", + "dtype": np.float32, + "shape": (64, 64), + "valid_shape": (64, 64), + "eps": 1e-5, + "precision_type": "high_precision", + "test_pattern": "precision_sensitive", + "ulp_tolerance": 2, + }, + { + "name": "f16_16x256_hp", + "dtype": np.float16, + "shape": (16, 256), + "valid_shape": (16, 256), + "eps": 1e-3, + "precision_type": "high_precision", + "test_pattern": "precision_sensitive", + "ulp_tolerance": 2, + }, + + # Partial valid shape - test masked computation + { + "name": "f32_16x64_hp_partial", + "dtype": np.float32, + "shape": (16, 64), + "valid_shape": (16, 31), + "eps": 1e-5, + "precision_type": "high_precision", + "test_pattern": "precision_sensitive", + "ulp_tolerance": 2, + }, + { + "name": 
"f16_16x64_hp_partial", + "dtype": np.float16, + "shape": (16, 64), + "valid_shape": (16, 63), + "eps": 1e-3, + "precision_type": "high_precision", + "test_pattern": "precision_sensitive", + "ulp_tolerance": 2, + }, + + # Small shape HP tests - aligned with pto-isa (case_float_hp_2x16, case_half_hp_2x32) + { + "name": "f32_2x16_hp", + "dtype": np.float32, + "shape": (2, 16), + "valid_shape": (2, 16), + "eps": 1e-6, + "precision_type": "high_precision", + "test_pattern": "precision_sensitive", + "ulp_tolerance": 1, + }, + { + "name": "f16_2x32_hp", + "dtype": np.float16, + "shape": (2, 32), + "valid_shape": (2, 32), + "eps": 1e-3, + "precision_type": "high_precision", + "test_pattern": "precision_sensitive", + "ulp_tolerance": 1, + }, +] + +_SMOKE_CASE_NAMES = ['f32_16x64', 'f32_16x64_hp_subnormal', 'f16_16x64_hp_partial'] +_SMOKE_CASE_NAME_SET = set(_SMOKE_CASE_NAMES) +_missing = [name for name in _SMOKE_CASE_NAMES if name not in {case["name"] for case in CASES}] +if _missing: + raise RuntimeError("unknown smoke case(s): " + ", ".join(_missing)) +CASES = [case for case in CASES if case["name"] in _SMOKE_CASE_NAME_SET] diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tdiv/compare.py b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tdiv/compare.py new file mode 100644 index 000000000..90e0912da --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tdiv/compare.py @@ -0,0 +1,295 @@ +# Copyright (c) 2026 Huawei Technologies Co., Ltd. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. 
+# See LICENSE in the root of the software repository for the full text of the License. + +# coding=utf-8 + +import os +import sys +import numpy as np +from pathlib import Path + +# Add current directory to path for standalone execution +script_dir = Path(__file__).parent +if script_dir not in sys.path: + sys.path.insert(0, str(script_dir)) + +# Add st_common directory +st_common_dir = script_dir.parent +if st_common_dir not in sys.path: + sys.path.insert(0, str(st_common_dir)) + +from cases import CASES +from st_common import result_cmp, style_fail, style_pass, validate_cases + + +def compute_ulp_difference(golden, output, dtype): + """Compute ULP (Unit in the Last Place) difference between two arrays. + + ULP difference measures how many representable floating-point values + are between golden and output. + + Note: Only computes ULP for normal values (not NaN/Inf/zero). + + Args: + golden: numpy array of golden values + output: numpy array of output values + dtype: numpy dtype (float32 or float16) + + Returns: + Maximum ULP difference across all normal elements, or None if no normal values + """ + if dtype == np.float32: + int_dtype = np.uint32 + elif dtype == np.float16: + int_dtype = np.uint16 + else: + return None # ULP not applicable for integer types + + # Filter out NaN, Inf, and zero values (ULP not meaningful for these) + golden_normal = np.isfinite(golden) & (golden != 0) + output_normal = np.isfinite(output) & (output != 0) + normal_mask = golden_normal & output_normal + + if not np.any(normal_mask): + return None # No normal values to compare + + golden_filtered = golden[normal_mask] + output_filtered = output[normal_mask] + + # Convert to integer representation for ULP calculation + golden_int = golden_filtered.view(int_dtype) + output_int = output_filtered.view(int_dtype) + + # Handle sign difference: ULP counts across zero + # For same sign: simple difference + # For different sign: add both magnitudes (crosses zero boundary) + sign_bit = 
np.dtype(int_dtype).itemsize * 8 - 1 + golden_sign = golden_int >> sign_bit + output_sign = output_int >> sign_bit + + same_sign = (golden_sign == output_sign) + + # For same sign: subtract representations + ulp_diff_same = np.abs(golden_int.astype(np.int64) - output_int.astype(np.int64)) + + # For different sign: distance through zero (less common, treat as large difference) + # Use maximum possible ULP for different signs + ulp_diff_cross = np.iinfo(int_dtype).max + + ulp_diff = np.where(same_sign, ulp_diff_same, ulp_diff_cross) + + return np.max(ulp_diff) + + +def check_nan_inf_consistency(golden, output, relaxed=False): + """Check that NaN and Inf positions and values are consistent. + + IEEE 754 rules: + - NaN must appear at similar positions (hardware may differ in NaN type) + - Inf must have same sign at same positions + - Both must agree on which positions are NaN vs Inf vs normal + + Args: + golden: numpy array of golden values + output: numpy array of output values + relaxed: if True, allow NaN count differences (hardware may have different NaN handling) + + Returns: + (ok, error_msg) tuple + """ + # Check NaN positions + golden_nan = np.isnan(golden) + output_nan = np.isnan(output) + + # For relaxed mode, check NaN counts are similar (allow some variance) + if relaxed: + golden_nan_count = np.sum(golden_nan) + output_nan_count = np.sum(output_nan) + # Allow 20% variance in NaN count + if golden_nan_count > 0: + variance = abs(golden_nan_count - output_nan_count) / float(golden_nan_count) + if variance > 0.2: + return False, "NaN count variance > 20% (golden={}, output={})".format(golden_nan_count, output_nan_count) + # Continue with other checks even if NaN positions differ + else: + if not np.array_equal(golden_nan, output_nan): + nan_mismatch = np.where(golden_nan != output_nan) + return False, "NaN position mismatch at {} positions".format(len(nan_mismatch[0])) + + # Check Inf positions + golden_inf = np.isinf(golden) + output_inf = np.isinf(output) + 
+ if not np.array_equal(golden_inf, output_inf): + inf_mismatch = np.where(golden_inf != output_inf) + return False, f"Inf position mismatch at {len(inf_mismatch[0])} positions" + + # Check Inf signs + if np.any(golden_inf): + golden_signs = np.sign(golden[golden_inf]) + output_signs = np.sign(output[golden_inf]) + if not np.array_equal(golden_signs, output_signs): + return False, "Inf sign mismatch" + + return True, None + + +def compare_high_precision_result(golden, output, dtype, ulp_tolerance=1, eps=1e-6, relaxed_nan=False): + """Compare results for HIGH_PRECISION mode. + + High-precision algorithm uses three-candidate search which may select + a different but more accurate rounding than numpy standard division. + + Comparison strategy: + 1. Check NaN/Inf consistency (may allow relaxed NaN checking) + 2. For normal/subnormal values: allow ±ulp_tolerance ULP difference + + Args: + golden: numpy array of reference values (numpy division) + output: numpy array of NPU output values + dtype: numpy dtype + ulp_tolerance: maximum allowed ULP difference (default 1) + eps: fallback tolerance for non-float types + relaxed_nan: if True, allow NaN count variance (default False) + + Returns: + (ok, error_msg) tuple + """ + # 1. Check NaN/Inf consistency + ok, error_msg = check_nan_inf_consistency(golden, output, relaxed=relaxed_nan) + if not ok: + return False, error_msg + + # 2. Filter out NaN/Inf for numerical comparison + golden_nan = np.isnan(golden) + golden_inf = np.isinf(golden) + normal_mask = ~(golden_nan | golden_inf) + + if not np.any(normal_mask): + return True, None # All NaN/Inf, already checked + + golden_normal = golden[normal_mask] + output_normal = output[normal_mask] + + # 3. 
Use ULP tolerance for float types + if dtype in (np.float32, np.float16): + max_ulp = compute_ulp_difference(golden_normal, output_normal, dtype) + if max_ulp is not None and max_ulp <= ulp_tolerance: + return True, f"ULP tolerance passed (max_ulp={max_ulp})" + + # Fallback to eps-based comparison if ULP check fails + ok = result_cmp(golden_normal, output_normal, eps) + if not ok: + return False, f"Both ULP ({max_ulp}) and eps ({eps}) check failed" + return True, f"Passed with eps tolerance (max_ulp={max_ulp} > {ulp_tolerance})" + + # 4. For integer types, use exact comparison + else: + ok = np.array_equal(golden_normal, output_normal) + if not ok: + mismatch = np.where(golden_normal != output_normal) + return False, f"Mismatch at {len(mismatch[0])} positions" + return True, None + + +def main(): + validate_cases(CASES) + case_filter = sys.argv[1] if len(sys.argv) > 1 else None + + all_passed = True + for case in CASES: + if case_filter is not None and case["name"] != case_filter: + continue + + case_dir = case["name"] + shape = case["shape"] + vr, vc = case["valid_shape"] + test_pattern = case.get("test_pattern", "normal") + precision_type = case.get("precision_type", "default") + check_inf_nan = case.get("check_inf_nan", False) + + golden = np.fromfile(os.path.join(case_dir, "golden.bin"), dtype=case["dtype"]).reshape(shape) + output = np.fromfile(os.path.join(case_dir, "output.bin"), dtype=case["dtype"]).reshape(shape) + + eps = case["eps"] + dtype_name = case["dtype"].__name__ + + # Extract valid region + golden_valid = golden[:vr, :vc] + output_valid = output[:vr, :vc] + + # Integer types: exact comparison + if dtype_name in ("uint32", "int32", "uint16", "int16", "uint8", "int8"): + ok = np.array_equal(golden_valid, output_valid) + if not ok: + mismatch = np.where(golden_valid != output_valid) + print(style_fail(f"[ERROR] {case['name']}: mismatches at {len(mismatch[0])} positions")) + if len(mismatch[0]) > 0 and len(mismatch[0]) <= 10: + for i in 
range(len(mismatch[0])): + r, c = mismatch[0][i], mismatch[1][i] + print(f" [{r},{c}] golden={golden_valid[r,c]} output={output_valid[r,c]}") + all_passed = False + continue + + # Float types with special handling + else: + # HIGH_PRECISION mode: use ULP tolerance + if precision_type == "high_precision": + ulp_tolerance = case.get("ulp_tolerance", 1) + # Use relaxed NaN checking for nan_inf and boundary tests + relaxed_nan = test_pattern in ("nan_inf", "boundary") + ok, msg = compare_high_precision_result( + golden_valid, output_valid, case["dtype"], + ulp_tolerance=ulp_tolerance, eps=eps, relaxed_nan=relaxed_nan + ) + if not ok: + print(style_fail("[ERROR] {}: {} (test={})".format(case['name'], msg, test_pattern))) + all_passed = False + continue + elif msg: + print(style_pass("[INFO] {}: {} (test={})".format(case['name'], msg, test_pattern))) + + # check_inf_nan flag or boundary test: check NaN/Inf separately + elif check_inf_nan or test_pattern == "boundary": + # Use relaxed NaN checking for nan_inf and boundary tests + relaxed = test_pattern in ("nan_inf", "boundary") + ok, msg = check_nan_inf_consistency(golden_valid, output_valid, relaxed=relaxed) + if not ok: + print(style_fail("[ERROR] {}: {} (test={})".format(case['name'], msg, test_pattern))) + all_passed = False + continue + + # Compare non-special values + golden_nan = np.isnan(golden_valid) + golden_inf = np.isinf(golden_valid) + normal_mask = ~(golden_nan | golden_inf) + + if np.any(normal_mask): + ok = result_cmp(golden_valid[normal_mask], output_valid[normal_mask], eps) + if not ok: + print(style_fail("[ERROR] {}: numerical mismatch (test={})".format(case['name'], test_pattern))) + all_passed = False + continue + + # Normal test: standard comparison + else: + ok = result_cmp(golden_valid, output_valid, eps) + if not ok: + print(style_fail("[ERROR] {}: comparison failed (test={})".format(case['name'], test_pattern))) + all_passed = False + continue + + print(style_pass("[INFO] {}: passed (dtype={}, 
precision={}, test={})".format(case['name'], dtype_name, precision_type, test_pattern))) + + if not all_passed: + sys.exit(2) + print(style_pass("[INFO] all cases passed")) + + +if __name__ == "__main__": + main() diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tdiv/gen_data.py b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tdiv/gen_data.py new file mode 100644 index 000000000..9ecd4f8c9 --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tdiv/gen_data.py @@ -0,0 +1,327 @@ +# Copyright (c) 2026 Huawei Technologies Co., Ltd. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. + +# coding=utf-8 + +import sys +import os +from pathlib import Path + +# Add current directory to path for standalone execution +script_dir = Path(__file__).parent +if script_dir not in sys.path: + sys.path.insert(0, str(script_dir)) + +# Add st_common directory +st_common_dir = script_dir.parent +if st_common_dir not in sys.path: + sys.path.insert(0, str(st_common_dir)) + +import numpy as np +from cases import CASES +from st_common import validate_cases, setup_case_rng, save_case_data + +validate_cases(CASES) + + +def generate_precision_sensitive_data(shape, dtype): + """Generate precision-sensitive ratios to test three-candidate search algorithm. 
+ + Focuses on values that cannot be exactly represented in floating point: + - 1/3, 1/7, 7/3 - infinite binary representation + - Values near integer boundaries where z/z±1 compete + """ + rows, cols = shape + input1 = np.zeros(shape, dtype=dtype) + input2 = np.ones(shape, dtype=dtype) + + ratios = [(1, 3), (1, 7), (7, 3), (1, 11), (5, 3), (10, 3)] + + section_size = rows // len(ratios) + for i, (a, b) in enumerate(ratios): + start_row = i * section_size + end_row = min((i + 1) * section_size, rows) + input1[start_row:end_row, :] = dtype(a) + input2[start_row:end_row, :] = dtype(b) + + # Add variations: negative versions, different signs + remaining_rows = rows - len(ratios) * section_size + if remaining_rows > 0: + input1[-remaining_rows:, :] = np.random.choice([-1, 1], size=(remaining_rows, cols)).astype(dtype) + input2[-remaining_rows:, :] = dtype(3) + + return input1, input2 + + +def generate_subnormal_test_data(shape, dtype): + """Generate subnormal (denormal) numbers to test normalization handling. + + NOTE: High-precision division algorithm (Div754) has asymmetric subnormal detection: + - src0 (dividend): EQ comparison - only detects MAX_SUBNORMAL (0x007FFFFF for f32) + - src1 (divisor): LT comparison - detects entire subnormal range + + Test design constraints: + - Section 1: src0 = MAX_SUBNORMAL, src1 = normal (tests src0 EQ detection) + - Section 2: src0 = MAX_SUBNORMAL, src1 = subnormal in [0.1 * MAX_SUBNORMAL, MAX_SUBNORMAL] (tests both subnormal) + - Section 3: src0 = MAX_SUBNORMAL, src1 = subnormal in [subnormal_min, 0.1 * MAX_SUBNORMAL] (tests both subnormal, larger ratio) + - Section 4: normal reference + + Avoid "normal / small_subnormal" which would overflow to Inf.
+ """ + rows, cols = shape + input1 = np.zeros(shape, dtype=dtype) + input2 = np.ones(shape, dtype=dtype) + + if dtype == np.float32: + tiny = np.finfo(np.float32).tiny + subnormal_max = np.frombuffer(np.array([0x007FFFFF], dtype=np.uint32), dtype=np.float32)[0] + subnormal_min = np.float32(1e-45) + normal_min = tiny * np.float32(2.0) + else: # float16 + tiny = np.finfo(np.float16).tiny + subnormal_max = np.frombuffer(np.array([0x03FF], dtype=np.uint16), dtype=np.float16)[0] + subnormal_min = np.float16(1e-8) + normal_min = tiny * np.float16(2.0) + + quarter = rows // 4 + + # Section 1: src0 = MAX_SUBNORMAL, src1 = normal + # ratio ≈ 1e-38 / 10 ≈ 1e-39 (no overflow) + input1[:quarter, :] = subnormal_max + input2[:quarter, :] = np.random.uniform(normal_min, 100.0, size=(quarter, cols)).astype(dtype) + + # Section 2: src0 = MAX_SUBNORMAL, src1 = smaller subnormal (ratio ≈ 1-10) + # Keep src1 within the subnormal range: subnormal_min ~ subnormal_max + input1[quarter:2*quarter, :] = subnormal_max + input2[quarter:2*quarter, :] = np.random.uniform(subnormal_max * 0.1, subnormal_max, + size=(quarter, cols)).astype(dtype) + + # Section 3: src0 = MAX_SUBNORMAL, src1 = very small subnormal (ratio ≈ 10-500) + input1[2*quarter:3*quarter, :] = subnormal_max + input2[2*quarter:3*quarter, :] = np.random.uniform(subnormal_min, subnormal_max * 0.1, + size=(quarter, cols)).astype(dtype) + + # Section 4: normal reference + input1[3*quarter:, :] = np.random.uniform(0.1, 100.0, size=(rows-3*quarter, cols)).astype(dtype) + input2[3*quarter:, :] = np.random.uniform(0.1, 100.0, size=(rows-3*quarter, cols)).astype(dtype) + + return input1, input2 + + +def generate_overflow_test_data(shape, dtype): + """Generate overflow/underflow boundary values to test exponent handling.
+ + Tests: + - Large/small ratios that overflow to Inf + - Tiny ratios that underflow to 0 or min denormal + - Values at max/min exponent boundaries + """ + rows, cols = shape + input1 = np.zeros(shape, dtype=dtype) + input2 = np.ones(shape, dtype=dtype) + + if dtype == np.float32: + large_val = np.float32(1e30) + tiny_val = np.float32(1e-30) + overflow_trigger = np.float32(1e38) + underflow_trigger = np.float32(1e-45) + max_normal = np.float32(3.4e38) + else: # float16 + large_val = np.float16(60000) # Near f16 max (65504) + tiny_val = np.float16(0.0001) + overflow_trigger = np.float16(65000) + underflow_trigger = np.float16(1e-7) + max_normal = np.float16(65504) + + # Section 1: Overflow scenarios + quarter = rows // 4 + input1[:quarter, :cols//2] = overflow_trigger + input2[:quarter, :cols//2] = tiny_val # overflow_trigger / tiny_val -> Inf + + input1[:quarter, cols//2:] = large_val + input2[:quarter, cols//2:] = np.random.uniform(1e-35 if dtype==np.float32 else 1e-7, + tiny_val, + size=(quarter, cols//2)).astype(dtype) + + # Section 2: Underflow scenarios + input1[quarter:2*quarter, :cols//2] = underflow_trigger + input2[quarter:2*quarter, :cols//2] = large_val # underflow_trigger / large_val -> 0 + + input1[quarter:2*quarter, cols//2:] = tiny_val + input2[quarter:2*quarter, cols//2:] = np.random.uniform(large_val, max_normal, + size=(quarter, cols//2)).astype(dtype) + + # Section 3: Near boundary (may or may not overflow) + input1[2*quarter:3*quarter, :] = np.random.uniform(large_val/10, max_normal, + size=(quarter, cols)).astype(dtype) + input2[2*quarter:3*quarter, :] = np.random.uniform(tiny_val/10, tiny_val, + size=(quarter, cols)).astype(dtype) + + # Section 4: Normal values (control group) + input1[3*quarter:, :] = np.random.uniform(0.1, 100.0, + size=(rows-3*quarter, cols)).astype(dtype) + input2[3*quarter:, :] = np.random.uniform(0.1, 100.0, + size=(rows-3*quarter, cols)).astype(dtype) + + return input1, input2 + + +def generate_nan_inf_test_data(shape, 
dtype): + """Generate NaN and Inf inputs to test special value propagation. + + Tests IEEE 754 rules: + - 0/0 -> NaN + - Inf/Inf -> NaN + - x/0 -> Inf (or NaN if x=0) + - Inf/x -> Inf + - x/Inf -> 0 + - NaN propagates + """ + rows, cols = shape + input1 = np.zeros(shape, dtype=dtype) + input2 = np.ones(shape, dtype=dtype) + + # Create special values + if dtype == np.float32: + pos_inf = np.float32(np.inf) + neg_inf = np.float32(-np.inf) + nan_val = np.float32(np.nan) + zero_val = np.float32(0.0) + pos_one = np.float32(1.0) + neg_one = np.float32(-1.0) + else: # float16 + pos_inf = np.float16(np.inf) + neg_inf = np.float16(-np.inf) + nan_val = np.float16(np.nan) + zero_val = np.float16(0.0) + pos_one = np.float16(1.0) + neg_one = np.float16(-1.0) + + # Section 1: 0/0 -> NaN, x/0 -> Inf + eighth = rows // 8 + input1[0:eighth, :] = zero_val + input2[0:eighth, :] = zero_val # 0/0 -> NaN + + input1[eighth:2*eighth, :] = pos_one + input2[eighth:2*eighth, :] = zero_val # 1/0 -> Inf + + input1[2*eighth:3*eighth, :] = neg_one + input2[2*eighth:3*eighth, :] = zero_val # -1/0 -> -Inf + + # Section 2: Inf/Inf -> NaN, Inf/x -> Inf, x/Inf -> 0 + input1[3*eighth:4*eighth, :] = pos_inf + input2[3*eighth:4*eighth, :] = pos_inf # Inf/Inf -> NaN + + input1[4*eighth:5*eighth, :] = pos_inf + input2[4*eighth:5*eighth, :] = pos_one # Inf/1 -> Inf + + input1[5*eighth:6*eighth, :] = pos_one + input2[5*eighth:6*eighth, :] = pos_inf # 1/Inf -> 0 + + # Section 3: NaN propagation + input1[6*eighth:7*eighth, :] = nan_val + input2[6*eighth:7*eighth, :] = np.random.uniform(0.1, 10.0, + size=(eighth, cols)).astype(dtype) # NaN/x -> NaN + + input1[7*eighth:rows, :] = np.random.uniform(0.1, 10.0, + size=(rows-7*eighth, cols)).astype(dtype) + input2[7*eighth:rows, :cols//2] = nan_val # x/NaN -> NaN (half of remaining) + input2[7*eighth:rows, cols//2:] = np.random.uniform(0.1, 10.0, + size=(rows-7*eighth, cols//2)).astype(dtype) + + return input1, input2 + + +def generate_boundary_test_data(shape, 
dtype): + """Generate mixed boundary test data to stress IEEE 754 compliance. + + Combines subnormal and overflow scenarios (no NaN/Inf to avoid hardware limitations). + """ + rows, cols = shape + input1 = np.zeros(shape, dtype=dtype) + input2 = np.ones(shape, dtype=dtype) + + # Adapt thresholds based on dtype + if dtype == np.float32: + subnormal_val = np.float32(1.175e-38) + large_val = np.float32(1e30) + tiny_val = np.float32(1e-10) + elif dtype == np.float16: + subnormal_val = np.float16(6e-5) + large_val = np.float16(60000) + tiny_val = np.float16(0.001) + else: + subnormal_val = np.float32(1e-38) + large_val = np.float32(1e30) + tiny_val = np.float32(1e-10) + + # Section 1: Subnormal numbers (first half) + half = rows // 2 + if dtype == np.float32: + input1[:half, :] = np.random.uniform(1e-40, subnormal_val, + size=(half, cols)).astype(dtype) + else: + input1[:half, :] = np.random.uniform(1e-8, subnormal_val, + size=(half, cols)).astype(dtype) + input2[:half, :] = np.random.uniform(1.0, 10.0, + size=(half, cols)).astype(dtype) + + # Section 2: Overflow boundary (second half) + input1[half:, :cols//2] = large_val + input2[half:, :cols//2] = tiny_val + + input1[half:, cols//2:] = np.random.uniform(large_val/10, large_val, + size=(half, cols//2)).astype(dtype) + input2[half:, cols//2:] = np.random.uniform(tiny_val/10, tiny_val, + size=(half, cols//2)).astype(dtype) + + return input1, input2 + + +def generate_normal_data(shape, dtype): + """Generate simple random values for normal testing.""" + if dtype in (np.int32, np.int16, np.int8, np.uint8, np.uint16, np.uint32): + input1 = np.random.randint(1, 10, size=shape).astype(dtype) + input2 = np.random.randint(1, 10, size=shape).astype(dtype) + else: + input1 = np.random.uniform(0.1, 100.0, size=shape).astype(dtype) + input2 = np.random.uniform(0.1, 100.0, size=shape).astype(dtype) + return input1, input2 + +for case in CASES: + setup_case_rng(case) + + dtype = case["dtype"] + shape = case["shape"] + valid_shape = 
case["valid_shape"] + test_pattern = case.get("test_pattern", "normal") + + # Generate test data based on pattern + # NOTE: nan_inf test removed due to hardware vdiv NaN-from-division limitations + data_generators = { + "normal": generate_normal_data, + "precision_sensitive": generate_precision_sensitive_data, + "subnormal": generate_subnormal_test_data, + "overflow": generate_overflow_test_data, + "boundary": generate_boundary_test_data, + } + + generator = data_generators.get(test_pattern, generate_normal_data) + input1, input2 = generator(shape, dtype) + + # Compute golden reference using numpy (IEEE 754 compliant) + golden = np.zeros(shape, dtype=dtype) + vr, vc = valid_shape + + # Suppress overflow/divide warnings for boundary tests (expected behavior) + with np.errstate(over='ignore', divide='ignore', invalid='ignore'): + golden[:vr, :vc] = (input1[:vr, :vc] / input2[:vr, :vc]).astype(dtype, copy=False) + + save_case_data(case["name"], {"input1": input1, "input2": input2, "golden": golden}) + precision_type = case.get("precision_type", "default") + print(f"[INFO] gen_data: {case['name']} shape={shape} valid_shape={valid_shape} dtype={dtype.__name__} test={test_pattern} precision={precision_type}") diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tdiv/launch.cpp b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tdiv/launch.cpp new file mode 100644 index 000000000..79dd0d90a --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tdiv/launch.cpp @@ -0,0 +1,36 @@ +// Copyright (c) 2026 Huawei Technologies Co., Ltd. +// This program is free software, you can redistribute it and/or modify it under the terms and conditions of +// CANN Open Software License Agreement Version 2.0 (the "License"). +// Please refer to the License for details. You may not use this file except in compliance with the License. 
+// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +// See LICENSE in the root of the software repository for the full text of the License. + +#include + +#ifndef AICORE +#define AICORE [aicore] +#endif + + +// Case: f32_16x64 + +extern "C" __global__ AICORE void TDIV_f32_16x64(__gm__ float *a, __gm__ float *b, __gm__ float *c); +extern "C" __global__ AICORE void TDIV_f32_16x64_hp_subnormal(__gm__ float *a, __gm__ float *b, __gm__ float *c); +extern "C" __global__ AICORE void TDIV_f16_16x64_hp_partial(__gm__ void *a, __gm__ void *b, __gm__ void *c); + +void LaunchTDIV_f32_16x64_hp_subnormal(float *a, float *b, float *c, void *stream) { + TDIV_f32_16x64_hp_subnormal<<<1, nullptr, stream>>>(a, b, c); +} + + + +void LaunchTDIV_f16_16x64_hp_partial(void *a, void *b, void *c, void *stream) { + TDIV_f16_16x64_hp_partial<<<1, nullptr, stream>>>(a, b, c); +} + + + +void LaunchTDIV_f32_16x64(float *a, float *b, float *c, void *stream) { + TDIV_f32_16x64<<<1, nullptr, stream>>>(a, b, c); +} diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tdiv/main.cpp b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tdiv/main.cpp new file mode 100644 index 000000000..6fdf37aa8 --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tdiv/main.cpp @@ -0,0 +1,156 @@ +// Copyright (c) 2026 Huawei Technologies Co., Ltd. +// This program is free software, you can redistribute it and/or modify it under the terms and conditions of +// CANN Open Software License Agreement Version 2.0 (the "License"). +// Please refer to the License for details. You may not use this file except in compliance with the License. +// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. 
+// See LICENSE in the root of the software repository for the full text of the License. + +// Host driver for TileLang tdiv ST — case-table driven. +// Each case launches a different kernel variant, reads/writes from per-case subdirectory. +// Numerical comparison is done externally by compare.py. + +#include "acl/acl.h" +#include "test_common.h" +#include +#include +#include +#include +#include +#include + +using namespace PtoTestCommon; + +// Kernel launch wrappers (defined in launch.cpp) +void LaunchTDIV_f32_16x64(float *a, float *b, float *c, void *stream); +void LaunchTDIV_f32_32x32(float *a, float *b, float *c, void *stream); +void LaunchTDIV_f16_16x256(void *a, void *b, void *c, void *stream); +void LaunchTDIV_f16_16x64_hp_precision(void *a, void *b, void *c, void *stream); +void LaunchTDIV_f32_16x64_hp_subnormal(float *a, float *b, float *c, void *stream); +void LaunchTDIV_f16_16x64_hp_subnormal(void *a, void *b, void *c, void *stream); +void LaunchTDIV_f16_16x64_hp_overflow(void *a, void *b, void *c, void *stream); +void LaunchTDIV_f32_64x64_hp(float *a, float *b, float *c, void *stream); +void LaunchTDIV_f32_16x64_hp_partial(float *a, float *b, float *c, void *stream); +void LaunchTDIV_f16_16x64_hp_partial(void *a, void *b, void *c, void *stream); +void LaunchTDIV_f32_2x16_hp(float *a, float *b, float *c, void *stream); + +// Generic launch function type for void* pointers +using LaunchFn = void (*)(void *a, void *b, void *c, void *stream); + +struct TestCase { + const char *name; + LaunchFn launch; + size_t rows; // allocated tile rows + size_t cols; // allocated tile cols + size_t validRows; // effective computation rows (<= rows) + size_t validCols; // effective computation cols (<= cols) + size_t elemSize; // bytes per element +}; + +static const TestCase kCases[] = { +{"f32_16x64", (LaunchFn)LaunchTDIV_f32_16x64, 16, 64, 16, 64, 4}, +{"f32_16x64_hp_subnormal", (LaunchFn)LaunchTDIV_f32_16x64_hp_subnormal, 16, 64, 16, 64, 4},
+{"f16_16x64_hp_partial", (LaunchFn)LaunchTDIV_f16_16x64_hp_partial, 16, 64, 16, 63, 2}, +}; +static constexpr size_t kNumCases = sizeof(kCases) / sizeof(kCases[0]); + +static int RunCase(const TestCase &tc, int deviceId, aclrtStream stream) { + int rc = 0; + const size_t elemCount = tc.rows * tc.cols; + const size_t fileSize = elemCount * tc.elemSize; + + std::printf("[INFO] === case: %s (shape=%zux%zu, valid=%zux%zu) ===\n", + tc.name, tc.rows, tc.cols, tc.validRows, tc.validCols); + + // Per-case data directory + std::string caseDir = std::string("./") + tc.name; + size_t src0FileSize = fileSize; + size_t src1FileSize = fileSize; + + float *src0Host = nullptr, *src1Host = nullptr, *dstHost = nullptr; + float *src0Device = nullptr, *src1Device = nullptr, *dstDevice = nullptr; + + aclrtMallocHost((void **)(&src0Host), fileSize); + aclrtMallocHost((void **)(&src1Host), fileSize); + aclrtMallocHost((void **)(&dstHost), fileSize); + + aclrtMalloc((void **)&src0Device, fileSize, ACL_MEM_MALLOC_HUGE_FIRST); + aclrtMalloc((void **)&src1Device, fileSize, ACL_MEM_MALLOC_HUGE_FIRST); + aclrtMalloc((void **)&dstDevice, fileSize, ACL_MEM_MALLOC_HUGE_FIRST); + + if (!ReadFile((caseDir + "/input1.bin").c_str(), src0FileSize, src0Host, fileSize)) { + std::fprintf(stderr, "[ERROR] failed to read %s/input1.bin\n", caseDir.c_str()); + rc = 1; + } + if (rc == 0 && !ReadFile((caseDir + "/input2.bin").c_str(), src1FileSize, src1Host, fileSize)) { + std::fprintf(stderr, "[ERROR] failed to read %s/input2.bin\n", caseDir.c_str()); + rc = 1; + } + + if (rc == 0) { + aclrtMemcpy(src0Device, fileSize, src0Host, fileSize, ACL_MEMCPY_HOST_TO_DEVICE); + aclrtMemcpy(src1Device, fileSize, src1Host, fileSize, ACL_MEMCPY_HOST_TO_DEVICE); + + tc.launch(src0Device, src1Device, dstDevice, stream); + + aclrtSynchronizeStream(stream); + aclrtMemcpy(dstHost, fileSize, dstDevice, fileSize, ACL_MEMCPY_DEVICE_TO_HOST); + } + + if (rc == 0 && !WriteFile((caseDir + "/output.bin").c_str(), dstHost, 
fileSize)) { + std::fprintf(stderr, "[ERROR] failed to write %s/output.bin\n", caseDir.c_str()); + rc = 1; + } + + if (src0Device != nullptr) + aclrtFree(src0Device); + if (src1Device != nullptr) + aclrtFree(src1Device); + if (dstDevice != nullptr) + aclrtFree(dstDevice); + if (src0Host != nullptr) + aclrtFreeHost(src0Host); + if (src1Host != nullptr) + aclrtFreeHost(src1Host); + if (dstHost != nullptr) + aclrtFreeHost(dstHost); + + if (rc == 0) + std::printf("[INFO] case %s done\n", tc.name); + return rc; +} + +int main(int argc, char *argv[]) { + // Optional case filter: ./tdiv [case_name] + const char *caseFilter = (argc > 1) ? argv[1] : nullptr; + + int rc = 0; + int deviceId = 0; + aclrtStream stream = nullptr; + + aclInit(nullptr); + if (const char *envDevice = std::getenv("ACL_DEVICE_ID")) { + deviceId = std::atoi(envDevice); + } + aclrtSetDevice(deviceId); + aclrtCreateStream(&stream); + + for (size_t i = 0; i < kNumCases; ++i) { + if (caseFilter != nullptr && std::strcmp(kCases[i].name, caseFilter) != 0) { + continue; + } + int ret = RunCase(kCases[i], deviceId, stream); + if (ret != 0) { + std::fprintf(stderr, "[ERROR] case %s failed\n", kCases[i].name); + rc = 1; + break; + } + } + + if (stream != nullptr) + aclrtDestroyStream(stream); + aclrtResetDevice(deviceId); + aclFinalize(); + + return rc; +} diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tdiv/tdiv.pto b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tdiv/tdiv.pto new file mode 100644 index 000000000..7fea96b8c --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tdiv/tdiv.pto @@ -0,0 +1,187 @@ +// Copyright (c) 2026 Huawei Technologies Co., Ltd. +// This program is free software, you can redistribute it and/or modify it under the terms and conditions of +// CANN Open Software License Agreement Version 2.0 (the "License"). +// Please refer to the License for details. You may not use this file except in compliance with the License. 
+// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +// See LICENSE in the root of the software repository for the full text of the License. + +// TileLang ST kernels for pto.tdiv: tload(a) + tload(b) + tdiv(a,b)->c + tstore(c). +// Multiple cases with different shapes in a single module. +// Compiled by ptoas --enable-insert-sync --enable-tile-op-expand --pto-backend=vpto +// to produce a fatobj object. + +module attributes {pto.target_arch = "a5", pto.kernel_kind = #pto.kernel_kind} { + // Case 0: f32 16x64 (1024 elements) + func.func @TDIV_f32_16x64(%a_ptr: !pto.ptr, %b_ptr: !pto.ptr, %c_ptr: !pto.ptr) attributes {pto.kernel} { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c16 = arith.constant 16 : index + %c64 = arith.constant 64 : index + %c1024 = arith.constant 1024 : index + + %a_view = pto.make_tensor_view %a_ptr, + shape = [%c1, %c1, %c1, %c16, %c64], + strides = [%c1024, %c1024, %c1024, %c64, %c1] + : !pto.tensor_view<1x1x1x16x64xf32> + %b_view = pto.make_tensor_view %b_ptr, + shape = [%c1, %c1, %c1, %c16, %c64], + strides = [%c1024, %c1024, %c1024, %c64, %c1] + : !pto.tensor_view<1x1x1x16x64xf32> + %c_view = pto.make_tensor_view %c_ptr, + shape = [%c1, %c1, %c1, %c16, %c64], + strides = [%c1024, %c1024, %c1024, %c64, %c1] + : !pto.tensor_view<1x1x1x16x64xf32> + + %a_part = pto.partition_view %a_view, + offsets = [%c0, %c0, %c0, %c0, %c0], + sizes = [%c1, %c1, %c1, %c16, %c64] + : !pto.tensor_view<1x1x1x16x64xf32> -> !pto.partition_tensor_view<1x1x1x16x64xf32> + %b_part = pto.partition_view %b_view, + offsets = [%c0, %c0, %c0, %c0, %c0], + sizes = [%c1, %c1, %c1, %c16, %c64] + : !pto.tensor_view<1x1x1x16x64xf32> -> !pto.partition_tensor_view<1x1x1x16x64xf32> + %c_part = pto.partition_view %c_view, + offsets = [%c0, %c0, %c0, %c0, %c0], + sizes = [%c1, %c1, %c1, %c16, %c64] 
+ : !pto.tensor_view<1x1x1x16x64xf32> -> !pto.partition_tensor_view<1x1x1x16x64xf32> + + %a = pto.alloc_tile + : !pto.tile_buf + %b = pto.alloc_tile + : !pto.tile_buf + %c = pto.alloc_tile + : !pto.tile_buf + + pto.tload ins(%a_part : !pto.partition_tensor_view<1x1x1x16x64xf32>) + outs(%a : !pto.tile_buf) + pto.tload ins(%b_part : !pto.partition_tensor_view<1x1x1x16x64xf32>) + outs(%b : !pto.tile_buf) + + pto.tdiv ins(%a, %b : !pto.tile_buf, + !pto.tile_buf) + outs(%c : !pto.tile_buf) + + pto.tstore ins(%c : !pto.tile_buf) + outs(%c_part : !pto.partition_tensor_view<1x1x1x16x64xf32>) + return + } + + // Case: f32_16x64_hp_subnormal (f32 16x64, 1024 elements) + + func.func @TDIV_f32_16x64_hp_subnormal(%a_ptr: !pto.ptr, %b_ptr: !pto.ptr, %c_ptr: !pto.ptr) attributes {pto.kernel} { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c16 = arith.constant 16 : index + %c64 = arith.constant 64 : index + %c1024 = arith.constant 1024 : index + + %a_view = pto.make_tensor_view %a_ptr, + shape = [%c1, %c1, %c1, %c16, %c64], + strides = [%c1024, %c1024, %c1024, %c64, %c1] + : !pto.tensor_view<1x1x1x16x64xf32> + %b_view = pto.make_tensor_view %b_ptr, + shape = [%c1, %c1, %c1, %c16, %c64], + strides = [%c1024, %c1024, %c1024, %c64, %c1] + : !pto.tensor_view<1x1x1x16x64xf32> + %c_view = pto.make_tensor_view %c_ptr, + shape = [%c1, %c1, %c1, %c16, %c64], + strides = [%c1024, %c1024, %c1024, %c64, %c1] + : !pto.tensor_view<1x1x1x16x64xf32> + + %a_part = pto.partition_view %a_view, + offsets = [%c0, %c0, %c0, %c0, %c0], + sizes = [%c1, %c1, %c1, %c16, %c64] + : !pto.tensor_view<1x1x1x16x64xf32> -> !pto.partition_tensor_view<1x1x1x16x64xf32> + %b_part = pto.partition_view %b_view, + offsets = [%c0, %c0, %c0, %c0, %c0], + sizes = [%c1, %c1, %c1, %c16, %c64] + : !pto.tensor_view<1x1x1x16x64xf32> -> !pto.partition_tensor_view<1x1x1x16x64xf32> + %c_part = pto.partition_view %c_view, + offsets = [%c0, %c0, %c0, %c0, %c0], + sizes = [%c1, %c1, %c1, %c16, %c64] + : 
!pto.tensor_view<1x1x1x16x64xf32> -> !pto.partition_tensor_view<1x1x1x16x64xf32> + + %a = pto.alloc_tile + : !pto.tile_buf + %b = pto.alloc_tile + : !pto.tile_buf + %c = pto.alloc_tile + : !pto.tile_buf + + pto.tload ins(%a_part : !pto.partition_tensor_view<1x1x1x16x64xf32>) + outs(%a : !pto.tile_buf) + pto.tload ins(%b_part : !pto.partition_tensor_view<1x1x1x16x64xf32>) + outs(%b : !pto.tile_buf) + + pto.tdiv ins(%a, %b : !pto.tile_buf, + !pto.tile_buf) + outs(%c : !pto.tile_buf) + + {precisionType = #pto} + + pto.tstore ins(%c : !pto.tile_buf) + outs(%c_part : !pto.partition_tensor_view<1x1x1x16x64xf32>) + return + } + + // Case: f16_16x64_hp_partial (f16 16x64, 1024 elements) + + func.func @TDIV_f16_16x64_hp_partial(%a_ptr: !pto.ptr, %b_ptr: !pto.ptr, %c_ptr: !pto.ptr) attributes {pto.kernel} { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c16 = arith.constant 16 : index + %c64 = arith.constant 64 : index + %c1024 = arith.constant 1024 : index + + %a_view = pto.make_tensor_view %a_ptr, + shape = [%c1, %c1, %c1, %c16, %c64], + strides = [%c1024, %c1024, %c1024, %c64, %c1] + : !pto.tensor_view<1x1x1x16x64xf16> + %b_view = pto.make_tensor_view %b_ptr, + shape = [%c1, %c1, %c1, %c16, %c64], + strides = [%c1024, %c1024, %c1024, %c64, %c1] + : !pto.tensor_view<1x1x1x16x64xf16> + %c_view = pto.make_tensor_view %c_ptr, + shape = [%c1, %c1, %c1, %c16, %c64], + strides = [%c1024, %c1024, %c1024, %c64, %c1] + : !pto.tensor_view<1x1x1x16x64xf16> + + %a_part = pto.partition_view %a_view, + offsets = [%c0, %c0, %c0, %c0, %c0], + sizes = [%c1, %c1, %c1, %c16, %c64] + : !pto.tensor_view<1x1x1x16x64xf16> -> !pto.partition_tensor_view<1x1x1x16x64xf16> + %b_part = pto.partition_view %b_view, + offsets = [%c0, %c0, %c0, %c0, %c0], + sizes = [%c1, %c1, %c1, %c16, %c64] + : !pto.tensor_view<1x1x1x16x64xf16> -> !pto.partition_tensor_view<1x1x1x16x64xf16> + %c_part = pto.partition_view %c_view, + offsets = [%c0, %c0, %c0, %c0, %c0], + sizes = [%c1, %c1, %c1, %c16, %c64] + : 
!pto.tensor_view<1x1x1x16x64xf16> -> !pto.partition_tensor_view<1x1x1x16x64xf16> + + %a = pto.alloc_tile + : !pto.tile_buf + %b = pto.alloc_tile + : !pto.tile_buf + %c = pto.alloc_tile + : !pto.tile_buf + + pto.tload ins(%a_part : !pto.partition_tensor_view<1x1x1x16x64xf16>) + outs(%a : !pto.tile_buf) + pto.tload ins(%b_part : !pto.partition_tensor_view<1x1x1x16x64xf16>) + outs(%b : !pto.tile_buf) + + pto.tdiv ins(%a, %b : !pto.tile_buf, + !pto.tile_buf) + outs(%c : !pto.tile_buf) + + {precisionType = #pto} + + pto.tstore ins(%c : !pto.tile_buf) + outs(%c_part : !pto.partition_tensor_view<1x1x1x16x64xf16>) + return + } + + // Case: f32_2x16_hp +} diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tdivs/CMakeLists.txt b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tdivs/CMakeLists.txt new file mode 100644 index 000000000..cfd816f61 --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tdivs/CMakeLists.txt @@ -0,0 +1,9 @@ +# Copyright (c) 2026 Huawei Technologies Co., Ltd. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. + +pto_tilelang_vec_st(tdivs) diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tdivs/cases.py b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tdivs/cases.py new file mode 100644 index 000000000..878a22f82 --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tdivs/cases.py @@ -0,0 +1,252 @@ +# Copyright (c) 2026 Huawei Technologies Co., Ltd. 
+# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. + +# coding=utf-8 + +"""Single source of truth for tdivs ST test cases. + +vdiv only supports f16/f32 in TileLang DSL v1. + +Each case defines: + - name: case identifier, used as subdirectory name and by main.cpp kCases[]. + - dtype: numpy dtype (e.g. np.float32). + - shape: (rows, cols) — allocated tile dimensions. + - valid_shape: (valid_rows, valid_cols) — effective computation region. + - eps: tolerance for numpy.allclose (atol and rtol). + - direction: "src_scalar" (src / scalar) or "scalar_src" (scalar / src) + - precision_type: optional, "default" or "high_precision". + - test_pattern: optional, "normal", "precision_sensitive", "subnormal", "overflow" + +gen_data.py and compare.py both import this list to avoid redundant definitions. 
+""" + +import numpy as np + +CASES = [ + # ============================================================ + # Normal cases - basic functionality (DEFAULT precision mode) + # ============================================================ + # src / scalar direction + { + "name": "f32_32x64", + "dtype": np.float32, + "shape": (32, 64), + "valid_shape": (32, 64), + "eps": 1e-6, + "direction": "src_scalar", + }, + { + "name": "f16_63x64", + "dtype": np.float16, + "shape": (63, 64), + "valid_shape": (63, 64), + "eps": 1e-3, + "direction": "src_scalar", + }, + { + "name": "f32_7x448", + "dtype": np.float32, + "shape": (7, 448), + "valid_shape": (7, 448), + "eps": 1e-6, + "direction": "src_scalar", + }, + { + "name": "f32_256x16", + "dtype": np.float32, + "shape": (256, 16), + "valid_shape": (256, 16), + "eps": 1e-6, + "direction": "src_scalar", + }, + # scalar / src direction + { + "name": "f32_32x64_scalar_src", + "dtype": np.float32, + "shape": (32, 64), + "valid_shape": (32, 64), + "eps": 1e-6, + "direction": "scalar_src", + }, + { + "name": "f16_63x64_scalar_src", + "dtype": np.float16, + "shape": (63, 64), + "valid_shape": (63, 64), + "eps": 1e-3, + "direction": "scalar_src", + }, + { + "name": "f32_7x448_scalar_src", + "dtype": np.float32, + "shape": (7, 448), + "valid_shape": (7, 448), + "eps": 1e-6, + "direction": "scalar_src", + }, + { + "name": "f32_256x16_scalar_src", + "dtype": np.float32, + "shape": (256, 16), + "valid_shape": (256, 16), + "eps": 1e-6, + "direction": "scalar_src", + }, + + # ============================================================ + # HIGH_PRECISION mode - src / scalar direction + # ============================================================ + # Precision-sensitive ratios + { + "name": "f32_32x64_hp", + "dtype": np.float32, + "shape": (32, 64), + "valid_shape": (32, 64), + "eps": 1e-6, + "precision_type": "high_precision", + "direction": "src_scalar", + "test_pattern": "precision_sensitive", + "ulp_tolerance": 1, + }, + { + "name": 
"f16_63x64_hp", + "dtype": np.float16, + "shape": (63, 64), + "valid_shape": (63, 64), + "eps": 1e-3, + "precision_type": "high_precision", + "direction": "src_scalar", + "test_pattern": "precision_sensitive", + "ulp_tolerance": 1, + }, + + # Subnormal numbers + { + "name": "f32_16x64_hp_subnormal", + "dtype": np.float32, + "shape": (16, 64), + "valid_shape": (16, 64), + "eps": 1e-6, + "precision_type": "high_precision", + "direction": "src_scalar", + "test_pattern": "subnormal", + "ulp_tolerance": 2, + }, + { + "name": "f16_16x64_hp_subnormal", + "dtype": np.float16, + "shape": (16, 64), + "valid_shape": (16, 64), + "eps": 1e-3, + "precision_type": "high_precision", + "direction": "src_scalar", + "test_pattern": "subnormal", + "ulp_tolerance": 2, + }, + + # Overflow/Underflow boundaries + { + "name": "f32_16x64_hp_overflow", + "dtype": np.float32, + "shape": (16, 64), + "valid_shape": (16, 64), + "eps": 1e-6, + "precision_type": "high_precision", + "direction": "src_scalar", + "test_pattern": "overflow", + }, + { + "name": "f16_16x64_hp_overflow", + "dtype": np.float16, + "shape": (16, 64), + "valid_shape": (16, 64), + "eps": 1e-3, + "precision_type": "high_precision", + "direction": "src_scalar", + "test_pattern": "overflow", + }, + + # ============================================================ + # HIGH_PRECISION mode - scalar / src direction + # ============================================================ + { + "name": "f32_32x64_hp_scalar_src", + "dtype": np.float32, + "shape": (32, 64), + "valid_shape": (32, 64), + "eps": 1e-6, + "precision_type": "high_precision", + "direction": "scalar_src", + "test_pattern": "precision_sensitive", + "ulp_tolerance": 1, + }, + { + "name": "f16_63x64_hp_scalar_src", + "dtype": np.float16, + "shape": (63, 64), + "valid_shape": (63, 64), + "eps": 1e-3, + "precision_type": "high_precision", + "direction": "scalar_src", + "test_pattern": "precision_sensitive", + "ulp_tolerance": 1, + }, + + # Subnormal - scalar / src (scalar is 
normal, src contains subnormals) + { + "name": "f32_16x64_hp_subnormal_scalar_src", + "dtype": np.float32, + "shape": (16, 64), + "valid_shape": (16, 64), + "eps": 1e-6, + "precision_type": "high_precision", + "direction": "scalar_src", + "test_pattern": "subnormal", + "ulp_tolerance": 2, + }, + { + "name": "f16_16x64_hp_subnormal_scalar_src", + "dtype": np.float16, + "shape": (16, 64), + "valid_shape": (16, 64), + "eps": 1e-3, + "precision_type": "high_precision", + "direction": "scalar_src", + "test_pattern": "subnormal", + "ulp_tolerance": 2, + }, + + # Overflow - scalar / src (division by small src values) + { + "name": "f32_16x64_hp_overflow_scalar_src", + "dtype": np.float32, + "shape": (16, 64), + "valid_shape": (16, 64), + "eps": 1e-6, + "precision_type": "high_precision", + "direction": "scalar_src", + "test_pattern": "overflow", + }, + { + "name": "f16_16x64_hp_overflow_scalar_src", + "dtype": np.float16, + "shape": (16, 64), + "valid_shape": (16, 64), + "eps": 1e-3, + "precision_type": "high_precision", + "direction": "scalar_src", + "test_pattern": "overflow", + }, +] + +_SMOKE_CASE_NAMES = ['f32_32x64', 'f32_16x64_hp_overflow', 'f16_16x64_hp_subnormal_scalar_src'] +_SMOKE_CASE_NAME_SET = set(_SMOKE_CASE_NAMES) +_missing = [name for name in _SMOKE_CASE_NAMES if name not in {case["name"] for case in CASES}] +if _missing: + raise RuntimeError("unknown smoke case(s): " + ", ".join(_missing)) +CASES = [case for case in CASES if case["name"] in _SMOKE_CASE_NAME_SET] diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tdivs/compare.py b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tdivs/compare.py new file mode 100644 index 000000000..50186777e --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tdivs/compare.py @@ -0,0 +1,46 @@ +# Copyright (c) 2026 Huawei Technologies Co., Ltd. 
+# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. + +# coding=utf-8 + +import os +import sys +import numpy as np + +from cases import CASES +from st_common import result_cmp, style_fail, style_pass, validate_cases + +def main(): + validate_cases(CASES) + case_filter = sys.argv[1] if len(sys.argv) > 1 else None + + all_passed = True + for case in CASES: + if case_filter is not None and case["name"] != case_filter: + continue + + case_dir = case["name"] + shape = case["shape"] + vr, vc = case["valid_shape"] + + golden = np.fromfile(os.path.join(case_dir, "golden.bin"), dtype=case["dtype"]).reshape(shape) + output = np.fromfile(os.path.join(case_dir, "output.bin"), dtype=case["dtype"]).reshape(shape) + + ok = result_cmp(golden[:vr, :vc], output[:vr, :vc], case["eps"]) + if ok: + print(style_pass(f"[INFO] {case['name']}: compare passed")) + else: + print(style_fail(f"[ERROR] {case['name']}: compare failed")) + all_passed = False + + if not all_passed: + sys.exit(2) + print(style_pass("[INFO] all cases passed")) + +if __name__ == "__main__": + main() diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tdivs/gen_data.py b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tdivs/gen_data.py new file mode 100644 index 000000000..95e28c8d3 --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tdivs/gen_data.py @@ -0,0 +1,247 @@ +# Copyright (c) 2026 Huawei Technologies Co., Ltd. 
+# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. + +# coding=utf-8 + +import sys +import os +from pathlib import Path + +# Add current directory to path for standalone execution (sys.path holds str entries, so compare as str) +script_dir = Path(__file__).parent +if str(script_dir) not in sys.path: + sys.path.insert(0, str(script_dir)) + +# Add st_common directory (parent of this case directory); same str comparison as above +st_common_dir = script_dir.parent +if str(st_common_dir) not in sys.path: + sys.path.insert(0, str(st_common_dir)) + +import numpy as np +from cases import CASES +from st_common import validate_cases, setup_case_rng, save_case_data + +validate_cases(CASES) + +# Default scalar value for division (matches the scalar passed in launch.cpp) +DEFAULT_SCALAR = 3.0 + + +def generate_precision_sensitive_scalar(shape, dtype, direction): + """Generate precision-sensitive test data for scalar division. + + Uses scalar values that create precision-sensitive ratios when divided + with tile data (e.g., 1/3, 1/7 patterns). 
+ """ + rows, cols = shape + + # For src / scalar: tile contains precision-sensitive values + # For scalar / src: scalar is precision-sensitive, src contains small integers + if direction == "src_scalar": + # Tile contains values like 1, 7, 5, 10 etc divided by scalar 3 + # Results: 1/3, 7/3, 5/3, 10/3 - precision-sensitive + input1 = np.zeros(shape, dtype=dtype) + values = [1, 7, 5, 10, 1, 3, 2, 11] + section_size = rows // len(values) + for i, v in enumerate(values): + start_row = i * section_size + end_row = min((i + 1) * section_size, rows) + input1[start_row:end_row, :] = dtype(v) + scalar = dtype(DEFAULT_SCALAR) + else: # scalar_src + # Scalar is 1, tile contains 3, 7, etc -> 1/3, 1/7 precision-sensitive + input1 = np.full(shape, dtype(3), dtype=dtype) # Avoid zeros + # Fill with divisor values that create precision-sensitive ratios + values = [3, 7, 11, 3, 5, 7, 11, 3] + section_size = rows // len(values) + for i, v in enumerate(values): + start_row = i * section_size + end_row = min((i + 1) * section_size, rows) + input1[start_row:end_row, :] = dtype(v) + scalar = dtype(1.0) + + return input1, scalar + + +def generate_subnormal_test_data(shape, dtype, direction): + """Generate subnormal (denormal) numbers for scalar division tests. 
+ + For src / scalar: + - src contains subnormal values, scalar is normal + - Tests subnormal dividend handling + + For scalar / src: + - scalar is small (normal for f32; 1e-5 is itself subnormal in f16), src contains subnormal values + - Tests subnormal divisor handling (can produce large results) + """ + rows, cols = shape + + if dtype == np.float32: + subnormal_max = np.frombuffer(np.array([0x007FFFFF], dtype=np.uint32), dtype=np.float32)[0] + subnormal_min = np.float32(1e-45) + normal_min = np.float32(1e-38) * np.float32(2.0) # ~2e-38: a small normal value (not the smallest normal); currently unused + else: # float16 + subnormal_max = np.frombuffer(np.array([0x03FF], dtype=np.uint16), dtype=np.float16)[0] + subnormal_min = np.frombuffer(np.array([0x0001], dtype=np.uint16), dtype=np.float16)[0] # smallest f16 subnormal (2**-24); np.float16(1e-8) would round to 0.0 + normal_min = np.float16(6e-5) * np.float16(2.0) # currently unused + + if direction == "src_scalar": + # src contains subnormal values, scalar is normal (e.g., 10) + input1 = np.zeros(shape, dtype=dtype) + quarter = rows // 4 + + # Section 1: MAX_SUBNORMAL / normal -> smaller subnormal result + input1[:quarter, :] = subnormal_max + + # Section 2: Mid-range subnormal / normal + input1[quarter:2*quarter, :] = np.random.uniform( + subnormal_min, subnormal_max, size=(quarter, cols)).astype(dtype) + + # Section 3: Smallest subnormal / normal + input1[2*quarter:3*quarter, :] = subnormal_min + + # Section 4: Normal reference + input1[3*quarter:, :] = np.random.uniform(0.1, 100.0, size=(rows-3*quarter, cols)).astype(dtype) + + scalar = dtype(10.0) + else: # scalar_src + # scalar is normal (e.g., 1e-20 for f32), src contains subnormal + # This tests: normal / subnormal -> large result (potential overflow) + input1 = np.zeros(shape, dtype=dtype) + quarter = rows // 4 + + # Section 1: normal / MAX_SUBNORMAL -> large but not overflow + input1[:quarter, :] = subnormal_max + + # Section 2: normal / mid subnormal -> larger + 
input1[quarter:2*quarter, :] = np.random.uniform( + subnormal_max * 0.1, subnormal_max, size=(quarter, cols)).astype(dtype) + + # Section 3: normal / tiny subnormal -> very large (near overflow) + input1[2*quarter:3*quarter, :] = 
np.random.uniform( + subnormal_min, subnormal_max * 0.1, size=(quarter, cols)).astype(dtype) + + # Section 4: Normal reference + input1[3*quarter:, :] = np.random.uniform(0.1, 100.0, size=(rows-3*quarter, cols)).astype(dtype) + + # Use a small scalar that won't overflow when divided by the smallest subnormal + if dtype == np.float32: + scalar = np.float32(1e-20) # Safe: 1e-20 / ~1.4e-45 is about 7e24, within f32 range + else: + scalar = np.float16(1e-5) # Safe: ~1e-5 / ~6e-8 is about 168, within f16 range (1e-5 is subnormal in f16, bits 0x00A8) + + return input1, scalar + + +def generate_overflow_test_data(shape, dtype, direction): + """Generate overflow/underflow boundary values for scalar division tests. + + For src / scalar: + - Large src / tiny scalar -> overflow + - Tiny src / large scalar -> underflow + + For scalar / src: + - Large scalar / tiny src -> overflow + - Tiny scalar / large src -> underflow + """ + rows, cols = shape + + if dtype == np.float32: + large_val = np.float32(1e30) + tiny_val = np.float32(1e-30) + overflow_trigger = np.float32(1e38) + underflow_trigger = np.float32(1e-45) + else: # float16 + large_val = np.float16(60000) + tiny_val = np.float16(0.0001) + overflow_trigger = np.float16(65000) + underflow_trigger = np.float16(1e-7) + + if direction == "src_scalar": + input1 = np.zeros(shape, dtype=dtype) + quarter = rows // 4 + + # Section 1: Overflow - large / tiny + input1[:quarter, :] = overflow_trigger + + # Section 2: Near overflow boundary + input1[quarter:2*quarter, :] = np.random.uniform(large_val, overflow_trigger, + size=(quarter, cols)).astype(dtype) + + # Section 3: Underflow - tiny / large + input1[2*quarter:3*quarter, :] = underflow_trigger + + # Section 4: Normal reference + input1[3*quarter:, :] = np.random.uniform(0.1, 100.0, size=(rows-3*quarter, cols)).astype(dtype) + + scalar = dtype(tiny_val) # Tiny scalar triggers overflow + + else: # scalar_src + input1 = np.zeros(shape, dtype=dtype) + quarter = rows // 4 + + # Section 
1: Overflow - scalar / tiny src + input1[:quarter, :] 
= tiny_val # Tiny divisor + + # Section 2: Near overflow boundary + input1[quarter:2*quarter, :] = np.random.uniform( + tiny_val/10, tiny_val, size=(quarter, cols)).astype(dtype) + + # Section 3: Underflow - scalar / large src + input1[2*quarter:3*quarter, :] = large_val + + # Section 4: Normal reference + input1[3*quarter:, :] = np.random.uniform(0.1, 100.0, size=(rows-3*quarter, cols)).astype(dtype) + + # Large scalar triggers overflow when divided by tiny src + scalar = dtype(overflow_trigger) + + return input1, scalar + + +def generate_normal_data(shape, dtype, direction): + """Generate simple random values for normal testing.""" + input1 = np.random.randint(1, 10, size=shape).astype(dtype) + scalar = dtype(DEFAULT_SCALAR) + return input1, scalar + + +for case in CASES: + setup_case_rng(case) + + dtype = case["dtype"] + shape = case["shape"] + valid_shape = case["valid_shape"] + direction = case.get("direction", "src_scalar") + test_pattern = case.get("test_pattern", "normal") + + # Generate test data based on pattern and direction + data_generators = { + "normal": generate_normal_data, + "precision_sensitive": generate_precision_sensitive_scalar, + "subnormal": generate_subnormal_test_data, + "overflow": generate_overflow_test_data, + } + + generator = data_generators.get(test_pattern, generate_normal_data) + input1, scalar_val = generator(shape, dtype, direction) + + # Compute golden reference using numpy (IEEE 754 compliant) + golden = np.zeros(shape, dtype=dtype) + vr, vc = valid_shape + + # Suppress overflow/divide warnings for boundary tests (expected behavior) + with np.errstate(over='ignore', divide='ignore', invalid='ignore'): + if direction == "src_scalar": + golden[:vr, :vc] = (input1[:vr, :vc] / scalar_val).astype(dtype, copy=False) + else: # scalar_src + golden[:vr, :vc] = (scalar_val / input1[:vr, :vc]).astype(dtype, copy=False) + + save_case_data(case["name"], {"input1": input1, "golden": golden}) + precision_type = case.get("precision_type", 
"default") + print(f"[INFO] gen_data: {case['name']} shape={shape} valid_shape={valid_shape} dtype={dtype.__name__} direction={direction} test={test_pattern} precision={precision_type} scalar={scalar_val}") diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tdivs/launch.cpp b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tdivs/launch.cpp new file mode 100644 index 000000000..5d3da42b1 --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tdivs/launch.cpp @@ -0,0 +1,47 @@ +// Copyright (c) 2026 Huawei Technologies Co., Ltd. +// This program is free software, you can redistribute it and/or modify it under the terms and conditions of +// CANN Open Software License Agreement Version 2.0 (the "License"). +// Please refer to the License for details. You may not use this file except in compliance with the License. +// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +// See LICENSE in the root of the software repository for the full text of the License. 
+ +#include +#include + +#ifndef AICORE +#define AICORE [aicore] +#endif + +static constexpr float TDIVS_SCALAR_F32 = 3.0f; + +// Helper to convert IEEE 754 hex bits to float (runtime initialization) +inline float bits_to_float(uint32_t bits) { + float result; + memcpy(&result, &bits, sizeof(float)); + return result; +} + +// ========== src / scalar direction ========== + +// Case 0: f32 32x64 + +extern "C" __global__ AICORE void TDIVS_f32_32x64(__gm__ float *src, __gm__ float *dst, float scalar); +extern "C" __global__ AICORE void TDIVS_f32_16x64_hp_overflow(__gm__ float *src, __gm__ float *dst, float scalar); +extern "C" __global__ AICORE void TDIVS_f16_16x64_hp_subnormal_scalar_src(__gm__ unsigned short *src, __gm__ unsigned short *dst, unsigned short scalar); + +void LaunchTDIVS_f32_16x64_hp_overflow(float *src, float *dst, void *stream) { + TDIVS_f32_16x64_hp_overflow<<<1, nullptr, stream>>>((__gm__ float *)src, (__gm__ float *)dst, bits_to_float(0x0DA24260U)); +} + + + +void LaunchTDIVS_f32_32x64(float *src, float *dst, void *stream) { + TDIVS_f32_32x64<<<1, nullptr, stream>>>((__gm__ float *)src, (__gm__ float *)dst, TDIVS_SCALAR_F32); +} + + + +void LaunchTDIVS_f16_16x64_hp_subnormal_scalar_src(unsigned short *src, unsigned short *dst, void *stream) { + TDIVS_f16_16x64_hp_subnormal_scalar_src<<<1, nullptr, stream>>>((__gm__ unsigned short *)src, (__gm__ unsigned short *)dst, (unsigned short)0x00A8); +} diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tdivs/main.cpp b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tdivs/main.cpp new file mode 100644 index 000000000..fa4f40d2a --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tdivs/main.cpp @@ -0,0 +1,143 @@ +// Copyright (c) 2026 Huawei Technologies Co., Ltd. +// This program is free software, you can redistribute it and/or modify it under the terms and conditions of +// CANN Open Software License Agreement Version 2.0 (the "License"). +// Please refer to the License for details. 
You may not use this file except in compliance with the License. +// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +// See LICENSE in the root of the software repository for the full text of the License. + +// Host driver for TileLang tdivs ST — case-table driven. +// tdivs: dst = src / scalar (single input + scalar). +// Numerical comparison is done externally by compare.py. + +#include "acl/acl.h" +#include "test_common.h" +#include +#include +#include +#include +#include +#include + +using namespace PtoTestCommon; + +// Kernel launch wrappers (defined in launch.cpp) +void LaunchTDIVS_f32_32x64(float *src, float *dst, void *stream); +void LaunchTDIVS_f16_63x64(uint16_t *src, uint16_t *dst, void *stream); +void LaunchTDIVS_f32_256x16(float *src, float *dst, void *stream); +void LaunchTDIVS_f16_63x64_scalar_src(uint16_t *src, uint16_t *dst, void *stream); +void LaunchTDIVS_f32_256x16_scalar_src(float *src, float *dst, void *stream); +// HIGH_PRECISION mode kernels +void LaunchTDIVS_f16_63x64_hp(uint16_t *src, uint16_t *dst, void *stream); +void LaunchTDIVS_f16_16x64_hp_subnormal(uint16_t *src, uint16_t *dst, void *stream); +void LaunchTDIVS_f32_16x64_hp_overflow(float *src, float *dst, void *stream); +void LaunchTDIVS_f16_16x64_hp_overflow(uint16_t *src, uint16_t *dst, void *stream); +void LaunchTDIVS_f16_63x64_hp_scalar_src(uint16_t *src, uint16_t *dst, void *stream); +void LaunchTDIVS_f16_16x64_hp_subnormal_scalar_src(uint16_t *src, uint16_t *dst, void *stream); +void LaunchTDIVS_f16_16x64_hp_overflow_scalar_src(uint16_t *src, uint16_t *dst, void *stream); + +struct TestCase { + const char *name; + void (*launch)(void *, void *, void *); // src, dst, stream + size_t rows; // allocated tile rows + size_t cols; // allocated tile cols + size_t validRows; // effective computation rows (<= rows) + size_t 
validCols; // effective computation cols (<= cols) + size_t elemSize; // bytes per element +}; + +static const TestCase kCases[] = { +{"f32_32x64", (void (*)(void*,void*,void*))LaunchTDIVS_f32_32x64, 32, 64, 32, 64, sizeof(float)}, +{"f32_16x64_hp_overflow", (void (*)(void*,void*,void*))LaunchTDIVS_f32_16x64_hp_overflow, 16, 64, 16, 64, sizeof(float)}, +{"f16_16x64_hp_subnormal_scalar_src", (void (*)(void*,void*,void*))LaunchTDIVS_f16_16x64_hp_subnormal_scalar_src, 16, 64, 16, 64, sizeof(uint16_t)}, +}; +static constexpr size_t kNumCases = sizeof(kCases) / sizeof(kCases[0]); + +static int RunCase(const TestCase &tc, int deviceId, aclrtStream stream) { + int rc = 0; + const size_t elemCount = tc.rows * tc.cols; + const size_t fileSize = elemCount * tc.elemSize; + + std::printf("[INFO] === case: %s (shape=%zux%zu, valid=%zux%zu) ===\n", + tc.name, tc.rows, tc.cols, tc.validRows, tc.validCols); + + // Per-case data directory + std::string caseDir = std::string("./") + tc.name; + size_t srcFileSize = fileSize; + + void *srcHost = nullptr, *dstHost = nullptr; + void *srcDevice = nullptr, *dstDevice = nullptr; + + aclrtMallocHost(&srcHost, fileSize); + aclrtMallocHost(&dstHost, fileSize); + + aclrtMalloc(&srcDevice, fileSize, ACL_MEM_MALLOC_HUGE_FIRST); + aclrtMalloc(&dstDevice, fileSize, ACL_MEM_MALLOC_HUGE_FIRST); + + if (!ReadFile((caseDir + "/input1.bin").c_str(), srcFileSize, srcHost, fileSize)) { + std::fprintf(stderr, "[ERROR] failed to read %s/input1.bin\n", caseDir.c_str()); + rc = 1; + } + + if (rc == 0) { + aclrtMemcpy(srcDevice, fileSize, srcHost, fileSize, ACL_MEMCPY_HOST_TO_DEVICE); + + tc.launch(srcDevice, dstDevice, stream); + + aclrtSynchronizeStream(stream); + aclrtMemcpy(dstHost, fileSize, dstDevice, fileSize, ACL_MEMCPY_DEVICE_TO_HOST); + } + + if (rc == 0 && !WriteFile((caseDir + "/output.bin").c_str(), dstHost, fileSize)) { + std::fprintf(stderr, "[ERROR] failed to write %s/output.bin\n", caseDir.c_str()); + rc = 1; + } + + if (srcDevice != nullptr) 
+ aclrtFree(srcDevice); + if (dstDevice != nullptr) + aclrtFree(dstDevice); + if (srcHost != nullptr) + aclrtFreeHost(srcHost); + if (dstHost != nullptr) + aclrtFreeHost(dstHost); + + if (rc == 0) + std::printf("[INFO] case %s done\n", tc.name); + return rc; +} + +int main(int argc, char *argv[]) { + // Optional case filter: ./tdivs [case_name] + const char *caseFilter = (argc > 1) ? argv[1] : nullptr; + + int rc = 0; + int deviceId = 0; + aclrtStream stream = nullptr; + + aclInit(nullptr); + if (const char *envDevice = std::getenv("ACL_DEVICE_ID")) { + deviceId = std::atoi(envDevice); + } + aclrtSetDevice(deviceId); + aclrtCreateStream(&stream); + + for (size_t i = 0; i < kNumCases; ++i) { + if (caseFilter != nullptr && std::strcmp(kCases[i].name, caseFilter) != 0) { + continue; + } + int ret = RunCase(kCases[i], deviceId, stream); + if (ret != 0) { + std::fprintf(stderr, "[ERROR] case %s failed\n", kCases[i].name); + rc = 1; + break; + } + } + + if (stream != nullptr) + aclrtDestroyStream(stream); + aclrtResetDevice(deviceId); + aclFinalize(); + + return rc; +} diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tdivs/tdivs.pto b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tdivs/tdivs.pto new file mode 100644 index 000000000..c91f65937 --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tdivs/tdivs.pto @@ -0,0 +1,73 @@ +// Copyright (c) 2026 Huawei Technologies Co., Ltd. +// This program is free software, you can redistribute it and/or modify it under the terms and conditions of +// CANN Open Software License Agreement Version 2.0 (the "License"). +// Please refer to the License for details. You may not use this file except in compliance with the License. +// THIS SOFTWARE IS PROVIDED ON "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. 
+// See LICENSE in the root of the software repository for the full text of the License.
+
+// TileLang ST kernels for pto.tdivs: tload(src) + tdivs(src, scalar)->dst + tstore(dst).
+// vdiv only supports f16/f32 in TileLang DSL v1.
+module attributes {pto.target_arch = "a5", pto.kernel_kind = #pto.kernel_kind} {
+
+  // f32 32x64, operands in (src, scalar) order.
+  // The whole 32x64 region is loaded, divided and stored back (2048 elements).
+  func.func @TDIVS_f32_32x64(%src_ptr: !pto.ptr, %dst_ptr: !pto.ptr, %scalar: f32) attributes {pto.kernel} {
+    %c0 = arith.constant 0 : index
+    %c1 = arith.constant 1 : index
+    %c32 = arith.constant 32 : index
+    %c64 = arith.constant 64 : index
+    %c2048 = arith.constant 2048 : index
+    %src_view = pto.make_tensor_view %src_ptr, shape = [%c1, %c1, %c1, %c32, %c64], strides = [%c2048, %c2048, %c2048, %c64, %c1] : !pto.tensor_view<1x1x1x32x64xf32>
+    %dst_view = pto.make_tensor_view %dst_ptr, shape = [%c1, %c1, %c1, %c32, %c64], strides = [%c2048, %c2048, %c2048, %c64, %c1] : !pto.tensor_view<1x1x1x32x64xf32>
+    %src_part = pto.partition_view %src_view, offsets = [%c0, %c0, %c0, %c0, %c0], sizes = [%c1, %c1, %c1, %c32, %c64] : !pto.tensor_view<1x1x1x32x64xf32> -> !pto.partition_tensor_view<1x1x1x32x64xf32>
+    %dst_part = pto.partition_view %dst_view, offsets = [%c0, %c0, %c0, %c0, %c0], sizes = [%c1, %c1, %c1, %c32, %c64] : !pto.tensor_view<1x1x1x32x64xf32> -> !pto.partition_tensor_view<1x1x1x32x64xf32>
+    %src = pto.alloc_tile : !pto.tile_buf
+    %dst = pto.alloc_tile : !pto.tile_buf
+    pto.tload ins(%src_part : !pto.partition_tensor_view<1x1x1x32x64xf32>) outs(%src : !pto.tile_buf)
+    pto.tdivs ins(%src, %scalar : !pto.tile_buf, f32) outs(%dst : !pto.tile_buf)
+    pto.tstore ins(%dst : !pto.tile_buf) outs(%dst_part : !pto.partition_tensor_view<1x1x1x32x64xf32>)
+    return
+  }
+
+  // f32 16x64, operands in (src, scalar) order, with a precisionType attribute on tdivs.
+
+  func.func @TDIVS_f32_16x64_hp_overflow(%src_ptr: !pto.ptr, %dst_ptr: !pto.ptr, %scalar: f32) attributes {pto.kernel} {
+    %c0 = arith.constant 0 : index
+    %c1 = arith.constant 1 : index
+    %c16 = arith.constant 16 : index
+    %c64 = arith.constant 64 : index
+    %c1024 = arith.constant 1024 : index
+    %src_view = pto.make_tensor_view %src_ptr, shape = [%c1, %c1, %c1, %c16, %c64], strides = [%c1024, %c1024, %c1024, %c64, %c1] : !pto.tensor_view<1x1x1x16x64xf32>
+    %dst_view = pto.make_tensor_view %dst_ptr, shape = [%c1, %c1, %c1, %c16, %c64], strides = [%c1024, %c1024, %c1024, %c64, %c1] : !pto.tensor_view<1x1x1x16x64xf32>
+    %src_part = pto.partition_view %src_view, offsets = [%c0, %c0, %c0, %c0, %c0], sizes = [%c1, %c1, %c1, %c16, %c64] : !pto.tensor_view<1x1x1x16x64xf32> -> !pto.partition_tensor_view<1x1x1x16x64xf32>
+    %dst_part = pto.partition_view %dst_view, offsets = [%c0, %c0, %c0, %c0, %c0], sizes = [%c1, %c1, %c1, %c16, %c64] : !pto.tensor_view<1x1x1x16x64xf32> -> !pto.partition_tensor_view<1x1x1x16x64xf32>
+    %src = pto.alloc_tile : !pto.tile_buf
+    %dst = pto.alloc_tile : !pto.tile_buf
+    pto.tload ins(%src_part : !pto.partition_tensor_view<1x1x1x16x64xf32>) outs(%src : !pto.tile_buf)
+    pto.tdivs ins(%src, %scalar : !pto.tile_buf, f32) outs(%dst : !pto.tile_buf) {precisionType = #pto}
+    pto.tstore ins(%dst : !pto.tile_buf) outs(%dst_part : !pto.partition_tensor_view<1x1x1x16x64xf32>)
+    return
+  }
+
+  // f16 16x64, operands in (scalar, src) order, with a precisionType attribute on tdivs.
+
+  func.func @TDIVS_f16_16x64_hp_subnormal_scalar_src(%src_ptr: !pto.ptr, %dst_ptr: !pto.ptr, %scalar: f16) attributes {pto.kernel} {
+    %c0 = arith.constant 0 : index
+    %c1 = arith.constant 1 : index
+    %c16 = arith.constant 16 : index
+    %c64 = arith.constant 64 : index
+    %c1024 = arith.constant 1024 : index
+    %src_view = pto.make_tensor_view %src_ptr, shape = [%c1, %c1, %c1, %c16, %c64], strides = [%c1024, %c1024, %c1024, %c64, %c1] : !pto.tensor_view<1x1x1x16x64xf16>
+    %dst_view = pto.make_tensor_view %dst_ptr, shape = [%c1, %c1, %c1, %c16, %c64], strides = [%c1024, %c1024, %c1024, %c64, %c1] : !pto.tensor_view<1x1x1x16x64xf16>
+    %src_part = pto.partition_view %src_view, offsets = [%c0, %c0, %c0, %c0, %c0], sizes = [%c1, %c1, %c1, %c16, %c64] : !pto.tensor_view<1x1x1x16x64xf16> -> !pto.partition_tensor_view<1x1x1x16x64xf16>
+    %dst_part = pto.partition_view %dst_view, offsets = [%c0, %c0, %c0, %c0, %c0], sizes = [%c1, %c1, %c1, %c16, %c64] : !pto.tensor_view<1x1x1x16x64xf16> -> !pto.partition_tensor_view<1x1x1x16x64xf16>
+    %src = pto.alloc_tile : !pto.tile_buf
+    %dst = pto.alloc_tile : !pto.tile_buf
+    pto.tload ins(%src_part : !pto.partition_tensor_view<1x1x1x16x64xf16>) outs(%src : !pto.tile_buf)
+    pto.tdivs ins(%scalar, %src : f16, !pto.tile_buf) outs(%dst : !pto.tile_buf) {precisionType = #pto}
+    pto.tstore ins(%dst : !pto.tile_buf) outs(%dst_part : !pto.partition_tensor_view<1x1x1x16x64xf16>)
+    return
+  }
+
+  // No further kernels are defined in this module.
+}
diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/texp/CMakeLists.txt b/test/tilelang_st/npu/a5/src/st/smoke/testcase/texp/CMakeLists.txt
new file mode 100644
index 000000000..6ce5def10
--- /dev/null
+++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/texp/CMakeLists.txt
@@ -0,0 +1,9 @@
+# Copyright (c) 2026 Huawei Technologies Co., Ltd.
+# This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+# CANN Open Software License Agreement Version 2.0 (the "License").
+# Please refer to the License for details. You may not use this file except in compliance with the License.
+# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+# See LICENSE in the root of the software repository for the full text of the License.
+ +pto_tilelang_vec_st(texp) diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/texp/cases.py b/test/tilelang_st/npu/a5/src/st/smoke/testcase/texp/cases.py new file mode 100644 index 000000000..200f4a7d0 --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/texp/cases.py @@ -0,0 +1,84 @@ +#!/usr/bin/python3 +# Copyright (c) 2026 Huawei Technologies Co., Ltd. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. + +# coding=utf-8 + +"""Single source of truth for texp ST test cases. + +Each case defines: + - name: case identifier, used as subdirectory name and by main.cpp kCases[]. + - dtype: numpy dtype (e.g. np.float32). + - shape: (rows, cols) — allocated tile dimensions. + - valid_shape: (valid_rows, valid_cols) — effective computation region. + - eps: tolerance for numpy.allclose (atol and rtol). + - high_precision: bool — when True, restricts input range to test ExpPrecisionImpl. + Uses subnormal threshold (0x007FFFFF for f32, 0x03FF for f16). + +gen_data.py and compare.py both import this list to avoid redundant definitions. 
+""" + +import numpy as np + +CASES = [ + { + "name": "f32_16x64", + "dtype": np.float32, + "shape": (16, 64), + "valid_shape": (16, 64), + "eps": 1e-6, + "high_precision": False, + }, + { + "name": "f32_32x32", + "dtype": np.float32, + "shape": (32, 32), + "valid_shape": (32, 32), + "eps": 1e-6, + "high_precision": False, + }, + { + "name": "f16_16x64", + "dtype": np.float16, + "shape": (16, 64), + "valid_shape": (16, 64), + "eps": 1e-3, + "high_precision": False, + }, + { + "name": "f16_32x32", + "dtype": np.float16, + "shape": (32, 32), + "valid_shape": (32, 32), + "eps": 1e-3, + "high_precision": False, + }, + { + "name": "f32_64x64_hp1", + "dtype": np.float32, + "shape": (64, 64), + "valid_shape": (64, 64), + "eps": 1e-7, + "high_precision": True, + }, + { + "name": "f16_64x64_hp2", + "dtype": np.float16, + "shape": (64, 64), + "valid_shape": (64, 64), + "eps": 1e-7, + "high_precision": True, + }, +] + +_SMOKE_CASE_NAMES = ['f32_16x64', 'f16_16x64'] +_SMOKE_CASE_NAME_SET = set(_SMOKE_CASE_NAMES) +_missing = [name for name in _SMOKE_CASE_NAMES if name not in {case["name"] for case in CASES}] +if _missing: + raise RuntimeError("unknown smoke case(s): " + ", ".join(_missing)) +CASES = [case for case in CASES if case["name"] in _SMOKE_CASE_NAME_SET] diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/texp/compare.py b/test/tilelang_st/npu/a5/src/st/smoke/testcase/texp/compare.py new file mode 100644 index 000000000..428604929 --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/texp/compare.py @@ -0,0 +1,49 @@ +#!/usr/bin/python3 +# Copyright (c) 2026 Huawei Technologies Co., Ltd. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. 
+# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. + +# coding=utf-8 + +import os +import sys +import numpy as np + +from cases import CASES +from st_common import result_cmp, style_fail, style_pass, validate_cases + + +def main(): + validate_cases(CASES) + case_filter = sys.argv[1] if len(sys.argv) > 1 else None + + all_passed = True + for case in CASES: + if case_filter is not None and case["name"] != case_filter: + continue + + case_dir = case["name"] + shape = case["shape"] + vr, vc = case["valid_shape"] + + golden = np.fromfile(os.path.join(case_dir, "golden.bin"), dtype=case["dtype"]).reshape(shape) + output = np.fromfile(os.path.join(case_dir, "output.bin"), dtype=case["dtype"]).reshape(shape) + + ok = result_cmp(golden[:vr, :vc], output[:vr, :vc], case["eps"]) + if ok: + print(style_pass(f"[INFO] {case['name']}: compare passed")) + else: + print(style_fail(f"[ERROR] {case['name']}: compare failed")) + all_passed = False + + if not all_passed: + sys.exit(2) + print(style_pass("[INFO] all cases passed")) + + +if __name__ == "__main__": + main() diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/texp/gen_data.py b/test/tilelang_st/npu/a5/src/st/smoke/testcase/texp/gen_data.py new file mode 100644 index 000000000..13103c495 --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/texp/gen_data.py @@ -0,0 +1,42 @@ +#!/usr/bin/python3 +# Copyright (c) 2026 Huawei Technologies Co., Ltd. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. 
+# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. + +# coding=utf-8 + +import numpy as np +import struct +import math +from cases import CASES +from st_common import validate_cases, setup_case_rng, save_case_data + +validate_cases(CASES) + +for case in CASES: + setup_case_rng(case) + + dtype = case["dtype"] + shape = case["shape"] + valid_shape = case["valid_shape"] + high_precision = case["high_precision"] + + if high_precision: + hex_threshold = '007FFFFF' + bound_val = struct.unpack('!f', bytes.fromhex(hex_threshold))[0] + max_val = math.log(bound_val) + min_val = max_val * 2 + input = np.random.uniform(min_val, max_val, size=shape).astype(dtype) + else: + input = np.random.randn(*shape).astype(dtype) + + golden = np.zeros(shape, dtype=dtype) + vr, vc = valid_shape + golden[:vr, :vc] = np.exp(input[:vr, :vc]).astype(dtype, copy=False) + + save_case_data(case["name"], {"input": input, "golden": golden}) + print(f"[INFO] gen_data: {case['name']} shape={shape} valid_shape={valid_shape} dtype={dtype.__name__} high_precision={high_precision}") diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/texp/launch.cpp b/test/tilelang_st/npu/a5/src/st/smoke/testcase/texp/launch.cpp new file mode 100644 index 000000000..e8ef5504c --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/texp/launch.cpp @@ -0,0 +1,28 @@ +// Copyright (c) 2026 Huawei Technologies Co., Ltd. +// This program is free software, you can redistribute it and/or modify it under the terms and conditions of +// CANN Open Software License Agreement Version 2.0 (the "License"). +// Please refer to the License for details. You may not use this file except in compliance with the License. 
+// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+// See LICENSE in the root of the software repository for the full text of the License.
+
+#include
+
+#ifndef AICORE
+#define AICORE [aicore]
+#endif
+
+// Device kernel entry points for the two cases launched from this file:
+// `a` is the input buffer and `b` the output buffer.
+
+extern "C" __global__ AICORE void TEXP_f32_16x64(__gm__ float *a, __gm__ float *b);
+extern "C" __global__ AICORE void TEXP_f16_16x64(__gm__ uint16_t *a, __gm__ uint16_t *b);
+
+// Host wrapper: launches TEXP_f16_16x64 on `stream`; f16 data travels as raw uint16_t.
+void LaunchTEXP_f16_16x64(void *a, void *b, void *stream) {
+    TEXP_f16_16x64<<<1, nullptr, stream>>>((__gm__ uint16_t *)a, (__gm__ uint16_t *)b);
+}
+
+
+
+// Host wrapper: launches TEXP_f32_16x64 on `stream`.
+void LaunchTEXP_f32_16x64(void *a, void *b, void *stream) {
+    TEXP_f32_16x64<<<1, nullptr, stream>>>((__gm__ float *)a, (__gm__ float *)b);
+}
diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/texp/main.cpp b/test/tilelang_st/npu/a5/src/st/smoke/testcase/texp/main.cpp
new file mode 100644
index 000000000..7d3e45044
--- /dev/null
+++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/texp/main.cpp
@@ -0,0 +1,136 @@
+// Copyright (c) 2026 Huawei Technologies Co., Ltd.
+// This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+// CANN Open Software License Agreement Version 2.0 (the "License").
+// Please refer to the License for details. You may not use this file except in compliance with the License.
+// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+// See LICENSE in the root of the software repository for the full text of the License.
+
+// Host driver for TileLang texp ST — case-table driven.
+// Each case launches a different kernel variant, reads/writes from per-case subdirectory.
+// Numerical comparison is done externally by compare.py. + +#include "acl/acl.h" +#include "test_common.h" +#include +#include +#include +#include +#include +#include + +using namespace PtoTestCommon; + +// Kernel launch wrappers (defined in launch.cpp) +void LaunchTEXP_f32_16x64(void *a, void *b, void *stream); +void LaunchTEXP_f32_32x32(void *a, void *b, void *stream); +void LaunchTEXP_f16_16x64(void *a, void *b, void *stream); +void LaunchTEXP_f16_32x32(void *a, void *b, void *stream); +void LaunchTEXP_f16_64x64_hp2(void *a, void *b, void *stream); + +using LaunchFn = void (*)(void *, void *, void *); + +struct TestCase { + const char *name; + LaunchFn launch; + size_t rows; // allocated tile rows + size_t cols; // allocated tile cols + size_t validRows; // effective computation rows (<= rows) + size_t validCols; // effective computation cols (<= cols) + size_t elemSize; // bytes per element +}; + +static const TestCase kCases[] = { +{"f32_16x64", LaunchTEXP_f32_16x64, 16, 64, 16, 64, sizeof(float)}, +{"f16_16x64", LaunchTEXP_f16_16x64, 16, 64, 16, 64, sizeof(uint16_t)}, +}; +static constexpr size_t kNumCases = sizeof(kCases) / sizeof(kCases[0]); + +static int RunCase(const TestCase &tc, int deviceId, aclrtStream stream) { + int rc = 0; + const size_t elemCount = tc.rows * tc.cols; + const size_t fileSize = elemCount * tc.elemSize; + + std::printf("[INFO] === case: %s (shape=%zux%zu, valid=%zux%zu) ===\n", + tc.name, tc.rows, tc.cols, tc.validRows, tc.validCols); + + // Per-case data directory + std::string caseDir = std::string("./") + tc.name; + size_t srcFileSize = fileSize; + + void *srcHost = nullptr, *dstHost = nullptr; + void *srcDevice = nullptr, *dstDevice = nullptr; + + aclrtMallocHost((void **)(&srcHost), fileSize); + aclrtMallocHost((void **)(&dstHost), fileSize); + + aclrtMalloc((void **)&srcDevice, fileSize, ACL_MEM_MALLOC_HUGE_FIRST); + aclrtMalloc((void **)&dstDevice, fileSize, ACL_MEM_MALLOC_HUGE_FIRST); + + if (!ReadFile((caseDir + 
"/input.bin").c_str(), srcFileSize, srcHost, fileSize)) { + std::fprintf(stderr, "[ERROR] failed to read %s/input.bin\n", caseDir.c_str()); + rc = 1; + } + + if (rc == 0) { + aclrtMemcpy(srcDevice, fileSize, srcHost, fileSize, ACL_MEMCPY_HOST_TO_DEVICE); + + tc.launch(srcDevice, dstDevice, stream); + + aclrtSynchronizeStream(stream); + aclrtMemcpy(dstHost, fileSize, dstDevice, fileSize, ACL_MEMCPY_DEVICE_TO_HOST); + } + + if (rc == 0 && !WriteFile((caseDir + "/output.bin").c_str(), dstHost, fileSize)) { + std::fprintf(stderr, "[ERROR] failed to write %s/output.bin\n", caseDir.c_str()); + rc = 1; + } + + if (srcDevice != nullptr) + aclrtFree(srcDevice); + if (dstDevice != nullptr) + aclrtFree(dstDevice); + if (srcHost != nullptr) + aclrtFreeHost(srcHost); + if (dstHost != nullptr) + aclrtFreeHost(dstHost); + + if (rc == 0) + std::printf("[INFO] case %s done\n", tc.name); + return rc; +} + +int main(int argc, char *argv[]) { + // Optional case filter: ./texp [case_name] + const char *caseFilter = (argc > 1) ? 
argv[1] : nullptr; + + int rc = 0; + int deviceId = 0; + aclrtStream stream = nullptr; + + aclInit(nullptr); + if (const char *envDevice = std::getenv("ACL_DEVICE_ID")) { + deviceId = std::atoi(envDevice); + } + aclrtSetDevice(deviceId); + aclrtCreateStream(&stream); + + for (size_t i = 0; i < kNumCases; ++i) { + if (caseFilter != nullptr && std::strcmp(kCases[i].name, caseFilter) != 0) { + continue; + } + int ret = RunCase(kCases[i], deviceId, stream); + if (ret != 0) { + std::fprintf(stderr, "[ERROR] case %s failed\n", kCases[i].name); + rc = 1; + break; + } + } + + if (stream != nullptr) + aclrtDestroyStream(stream); + aclrtResetDevice(deviceId); + aclFinalize(); + + return rc; +} diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/texp/texp.pto b/test/tilelang_st/npu/a5/src/st/smoke/testcase/texp/texp.pto new file mode 100644 index 000000000..e23a65126 --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/texp/texp.pto @@ -0,0 +1,101 @@ +// Copyright (c) 2026 Huawei Technologies Co., Ltd. +// This program is free software, you can redistribute it and/or modify it under the terms and conditions of +// CANN Open Software License Agreement Version 2.0 (the "License"). +// Please refer to the License for details. You may not use this file except in compliance with the License. +// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +// See LICENSE in the root of the software repository for the full text of the License. + +// TileLang ST kernels for pto.texp: tload(a) + texp(a)->b + tstore(b). +// Multiple cases with different shapes in a single module. +// Compiled by ptoas --enable-insert-sync --enable-tile-op-expand --pto-backend=vpto +// to produce a fatobj object. 
+
+module attributes {pto.target_arch = "a5", pto.kernel_kind = #pto.kernel_kind} {
+  // f32 16x64 (1024 elements): load the tile from %a_ptr, apply texp, store to %b_ptr.
+  func.func @TEXP_f32_16x64(%a_ptr: !pto.ptr, %b_ptr: !pto.ptr) attributes {pto.kernel} {
+    %c0 = arith.constant 0 : index
+    %c1 = arith.constant 1 : index
+    %c16 = arith.constant 16 : index
+    %c64 = arith.constant 64 : index
+    %c1024 = arith.constant 1024 : index
+
+    %a_view = pto.make_tensor_view %a_ptr,
+      shape = [%c1, %c1, %c1, %c16, %c64],
+      strides = [%c1024, %c1024, %c1024, %c64, %c1]
+      : !pto.tensor_view<1x1x1x16x64xf32>
+    %b_view = pto.make_tensor_view %b_ptr,
+      shape = [%c1, %c1, %c1, %c16, %c64],
+      strides = [%c1024, %c1024, %c1024, %c64, %c1]
+      : !pto.tensor_view<1x1x1x16x64xf32>
+
+    %a_part = pto.partition_view %a_view,
+      offsets = [%c0, %c0, %c0, %c0, %c0],
+      sizes = [%c1, %c1, %c1, %c16, %c64]
+      : !pto.tensor_view<1x1x1x16x64xf32> -> !pto.partition_tensor_view<1x1x1x16x64xf32>
+    %b_part = pto.partition_view %b_view,
+      offsets = [%c0, %c0, %c0, %c0, %c0],
+      sizes = [%c1, %c1, %c1, %c16, %c64]
+      : !pto.tensor_view<1x1x1x16x64xf32> -> !pto.partition_tensor_view<1x1x1x16x64xf32>
+
+    %a = pto.alloc_tile
+      : !pto.tile_buf
+    %b = pto.alloc_tile
+      : !pto.tile_buf
+
+    pto.tload ins(%a_part : !pto.partition_tensor_view<1x1x1x16x64xf32>)
+      outs(%a : !pto.tile_buf)
+
+    pto.texp ins(%a : !pto.tile_buf)
+      outs(%b : !pto.tile_buf)
+
+    pto.tstore ins(%b : !pto.tile_buf)
+      outs(%b_part : !pto.partition_tensor_view<1x1x1x16x64xf32>)
+    return
+  }
+
+  // f16 16x64 (1024 elements): same pipeline as above with f16 views.
+
+  func.func @TEXP_f16_16x64(%a_ptr: !pto.ptr, %b_ptr: !pto.ptr) attributes {pto.kernel} {
+    %c0 = arith.constant 0 : index
+    %c1 = arith.constant 1 : index
+    %c16 = arith.constant 16 : index
+    %c64 = arith.constant 64 : index
+    %c1024 = arith.constant 1024 : index
+
+    %a_view = pto.make_tensor_view %a_ptr,
+      shape = [%c1, %c1, %c1, %c16, %c64],
+      strides = [%c1024, %c1024, %c1024, %c64, %c1]
+      : !pto.tensor_view<1x1x1x16x64xf16>
+    %b_view = pto.make_tensor_view %b_ptr,
+      shape = [%c1, %c1, %c1, %c16, %c64],
+      strides = [%c1024, %c1024, %c1024, %c64, %c1]
+      : !pto.tensor_view<1x1x1x16x64xf16>
+
+    %a_part = pto.partition_view %a_view,
+      offsets = [%c0, %c0, %c0, %c0, %c0],
+      sizes = [%c1, %c1, %c1, %c16, %c64]
+      : !pto.tensor_view<1x1x1x16x64xf16> -> !pto.partition_tensor_view<1x1x1x16x64xf16>
+    %b_part = pto.partition_view %b_view,
+      offsets = [%c0, %c0, %c0, %c0, %c0],
+      sizes = [%c1, %c1, %c1, %c16, %c64]
+      : !pto.tensor_view<1x1x1x16x64xf16> -> !pto.partition_tensor_view<1x1x1x16x64xf16>
+
+    %a = pto.alloc_tile
+      : !pto.tile_buf
+    %b = pto.alloc_tile
+      : !pto.tile_buf
+
+    pto.tload ins(%a_part : !pto.partition_tensor_view<1x1x1x16x64xf16>)
+      outs(%a : !pto.tile_buf)
+
+    pto.texp ins(%a : !pto.tile_buf)
+      outs(%b : !pto.tile_buf)
+
+    pto.tstore ins(%b : !pto.tile_buf)
+      outs(%b_part : !pto.partition_tensor_view<1x1x1x16x64xf16>)
+    return
+  }
+
+  // No further kernels are defined in this module.
+}
diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/texpands/CMakeLists.txt b/test/tilelang_st/npu/a5/src/st/smoke/testcase/texpands/CMakeLists.txt
new file mode 100644
index 000000000..07e95e283
--- /dev/null
+++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/texpands/CMakeLists.txt
@@ -0,0 +1,9 @@
+# Copyright (c) 2026 Huawei Technologies Co., Ltd.
+# This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+# CANN Open Software License Agreement Version 2.0 (the "License").
+# Please refer to the License for details. You may not use this file except in compliance with the License.
+# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+# See LICENSE in the root of the software repository for the full text of the License.
+ +pto_tilelang_vec_st(texpands) diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/texpands/cases.py b/test/tilelang_st/npu/a5/src/st/smoke/testcase/texpands/cases.py new file mode 100644 index 000000000..f08fb42a8 --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/texpands/cases.py @@ -0,0 +1,131 @@ +# Copyright (c) 2026 Huawei Technologies Co., Ltd. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. + +# coding=utf-8 + +"""Single source of truth for texpands ST test cases. + +Each case defines: + - name: case identifier, used as subdirectory name and by main.cpp kCases[]. + - dtype: numpy dtype (e.g. np.float32). + - shape: (rows, cols) — allocated tile dimensions. + - valid_shape: (valid_rows, valid_cols) — effective computation region. + - scalar: the scalar value to broadcast to the tile. + - eps: tolerance for numpy.allclose (atol and rtol). + +gen_data.py and compare.py both import this list to avoid redundant definitions. 
+""" + +import numpy as np + +CASES = [ + # ========== float32 cases ========== + # Full valid shape cases + { + "name": "f32_16x64_scalar5", + "dtype": np.float32, + "shape": (16, 64), + "valid_shape": (16, 64), + "scalar": 5.0, + "eps": 1e-6, + }, + { + "name": "f32_32x32_scalar3", + "dtype": np.float32, + "shape": (32, 32), + "valid_shape": (32, 32), + "scalar": 3.0, + "eps": 1e-6, + }, + { + "name": "f32_64x64_scalar2", + "dtype": np.float32, + "shape": (64, 64), + "valid_shape": (64, 64), + "scalar": 2.0, + "eps": 1e-6, + }, + # Partial valid shape cases + { + "name": "f32_16x64_partial", + "dtype": np.float32, + "shape": (16, 64), + "valid_shape": (12, 48), + "scalar": 7.0, + "eps": 1e-6, + }, + { + "name": "f32_64x64_valid_60x60", + "dtype": np.float32, + "shape": (64, 64), + "valid_shape": (60, 60), + "scalar": 42.0, + "eps": 1e-6, + }, + + # ========== int32 cases ========== + { + "name": "i32_64x64_scalar100", + "dtype": np.int32, + "shape": (64, 64), + "valid_shape": (64, 64), + "scalar": 100, + "eps": 0, # exact match for integers + }, + { + "name": "i32_64x64_valid_60x60", + "dtype": np.int32, + "shape": (64, 64), + "valid_shape": (60, 60), + "scalar": 99, + "eps": 0, + }, + + # ========== half (fp16) cases ========== + { + "name": "f16_64x64_scalar1_5", + "dtype": np.float16, + "shape": (64, 64), + "valid_shape": (64, 64), + "scalar": 1.5, + "eps": 1e-3, # fp16 has lower precision + }, + { + "name": "f16_2x4096_valid_1x3600", + "dtype": np.float16, + "shape": (2, 4096), + "valid_shape": (1, 3600), + "scalar": 2.5, + "eps": 1e-3, + }, + + # ========== int16 cases ========== + { + "name": "i16_64x64_scalar50", + "dtype": np.int16, + "shape": (64, 64), + "valid_shape": (64, 64), + "scalar": 50, + "eps": 0, + }, + { + "name": "i16_20x512_valid_16x200", + "dtype": np.int16, + "shape": (20, 512), + "valid_shape": (16, 200), + "scalar": 25, + "eps": 0, + }, +] + +_SMOKE_CASE_NAMES = ['f32_16x64_scalar5', 'f32_16x64_partial'] +_SMOKE_CASE_NAME_SET = 
set(_SMOKE_CASE_NAMES) +_missing = [name for name in _SMOKE_CASE_NAMES if name not in {case["name"] for case in CASES}] +if _missing: + raise RuntimeError("unknown smoke case(s): " + ", ".join(_missing)) +CASES = [case for case in CASES if case["name"] in _SMOKE_CASE_NAME_SET] diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/texpands/compare.py b/test/tilelang_st/npu/a5/src/st/smoke/testcase/texpands/compare.py new file mode 100644 index 000000000..1ca3025ae --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/texpands/compare.py @@ -0,0 +1,78 @@ +#!/usr/bin/python3 +# Copyright (c) 2026 Huawei Technologies Co., Ltd. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. 
def main():
    """Compare each case's output.bin against its golden.bin.

    An optional first command-line argument restricts the run to the case
    with that name. For every selected case the files
    ``<case name>/golden.bin`` and ``<case name>/output.bin`` are read
    relative to the current directory, reshaped to the case's ``shape`` and
    compared over the ``valid_shape`` region only.

    Cases with ``eps == 0`` must match exactly; all others are compared with
    ``np.allclose`` using ``eps`` as both ``atol`` and ``rtol``.

    Exits with status 2 if any compared case mismatches.
    """
    case_filter = sys.argv[1] if len(sys.argv) > 1 else None

    all_passed = True
    for case in CASES:
        name = case["name"]
        if case_filter is not None and name != case_filter:
            continue

        dtype = case["dtype"]
        shape = case["shape"]
        eps = case["eps"]
        vr, vc = case["valid_shape"]

        # The case name doubles as the per-case data directory.
        golden = np.fromfile(os.path.join(name, "golden.bin"), dtype=dtype).reshape(shape)
        output = np.fromfile(os.path.join(name, "output.bin"), dtype=dtype).reshape(shape)

        # Only the valid region is compared; the rest of the tile is ignored.
        golden_valid = golden[:vr, :vc]
        output_valid = output[:vr, :vc]

        if eps == 0:
            # Exact comparison (used by the integer cases).
            if np.array_equal(golden_valid, output_valid):
                print(f"[INFO] {name}: compare passed")
                continue
            # Widen before subtracting so narrow integer types cannot wrap
            # around and hide the largest difference.
            wide = np.int64 if np.issubdtype(golden_valid.dtype, np.integer) else np.float64
            diff = np.abs(golden_valid.astype(wide) - output_valid.astype(wide))
            idx = int(np.argmax(diff))
            # idx is a flat index into the valid region, so the reported
            # values must be read from the same sliced views, not from the
            # full tile.
            print(f"[ERROR] {name}: Mismatch at idx={idx} "
                  f"(golden={golden_valid.flat[idx]}, output={output_valid.flat[idx]})")
            all_passed = False
            continue

        # Tolerance comparison: promote to float64 so fp16 inputs are not
        # compared at fp16 precision.
        g = golden_valid.astype(np.float64, copy=False)
        o = output_valid.astype(np.float64, copy=False)

        if np.allclose(g, o, atol=eps, rtol=eps, equal_nan=True):
            print(f"[INFO] {name}: compare passed")
            continue

        abs_diff = np.abs(g - o)
        idx = int(np.argmax(abs_diff))
        print(f"[ERROR] {name}: Mismatch: max diff={float(abs_diff.flat[idx])} "
              f"at idx={idx} (golden={g.flat[idx]}, output={o.flat[idx]})")
        all_passed = False

    if not all_passed:
        sys.exit(2)
    print("[INFO] all cases passed")
def setup_case_rng(case):
    """Seed NumPy's global RNG from the case name, reproducibly across runs.

    The built-in ``hash()`` of a ``str`` is salted per interpreter process
    (unless ``PYTHONHASHSEED`` is pinned), so it cannot provide a stable
    seed. The CRC-32 of the UTF-8 encoded name is used instead; it is already
    an unsigned 32-bit value, which is the range ``np.random.seed`` accepts.
    """
    import zlib  # local import keeps this helper self-contained

    np.random.seed(zlib.crc32(case["name"].encode("utf-8")) & 0xFFFFFFFF)


def save_case_data(case_name, data_dict):
    """Create the case directory and write ``<key>.bin`` for each array.

    ``data_dict`` maps a file stem to an array; each array is dumped with
    ``tofile`` (raw bytes, no header) into the directory ``case_name``.
    """
    os.makedirs(case_name, exist_ok=True)
    for name, arr in data_dict.items():
        arr.tofile(os.path.join(case_name, f"{name}.bin"))
valid_shape={valid_shape} scalar={scalar} dtype={dtype.__name__}") diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/texpands/launch.cpp b/test/tilelang_st/npu/a5/src/st/smoke/testcase/texpands/launch.cpp new file mode 100644 index 000000000..0bbbb05c6 --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/texpands/launch.cpp @@ -0,0 +1,28 @@ +// Copyright (c) 2026 Huawei Technologies Co., Ltd. +// This program is free software, you can redistribute it and/or modify it under the terms and conditions of +// CANN Open Software License Agreement Version 2.0 (the "License"). +// Please refer to the License for details. You may not use this file except in compliance with the License. +// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +// See LICENSE in the root of the software repository for the full text of the License. + +#include + +#ifndef AICORE +#define AICORE [aicore] +#endif + +// ========== float32 kernels ========== + +extern "C" __global__ AICORE void TEXPANDS_f32_16x64_scalar5(__gm__ float *dst); +extern "C" __global__ AICORE void TEXPANDS_f32_16x64_partial(__gm__ float *dst); + +void LaunchTEXPANDS_f32_16x64_scalar5(float *dst, void *stream) { + TEXPANDS_f32_16x64_scalar5<<<1, nullptr, stream>>>((__gm__ float *)dst); +} + + + +void LaunchTEXPANDS_f32_16x64_partial(float *dst, void *stream) { + TEXPANDS_f32_16x64_partial<<<1, nullptr, stream>>>((__gm__ float *)dst); +} diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/texpands/main.cpp b/test/tilelang_st/npu/a5/src/st/smoke/testcase/texpands/main.cpp new file mode 100644 index 000000000..d6c566c6d --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/texpands/main.cpp @@ -0,0 +1,133 @@ +// Copyright (c) 2026 Huawei Technologies Co., Ltd. 
+// This program is free software, you can redistribute it and/or modify it under the terms and conditions of +// CANN Open Software License Agreement Version 2.0 (the "License"). +// Please refer to the License for details. You may not use this file except in compliance with the License. +// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +// See LICENSE in the root of the software repository for the full text of the License. + +// Host driver for TileLang texpands ST — case-table driven. +// Each case launches a different kernel variant, writes from per-case subdirectory. +// Numerical comparison is done externally by compare.py. + +#include "acl/acl.h" +#include "test_common.h" +#include +#include +#include +#include +#include +#include + +using namespace PtoTestCommon; + +// Kernel launch wrappers (defined in launch.cpp) +void LaunchTEXPANDS_f32_16x64_scalar5(float *dst, void *stream); +void LaunchTEXPANDS_f32_32x32_scalar3(float *dst, void *stream); +void LaunchTEXPANDS_f32_16x64_partial(float *dst, void *stream); +void LaunchTEXPANDS_i32_64x64_scalar100(int32_t *dst, void *stream); +void LaunchTEXPANDS_f16_64x64_scalar1_5(uint16_t *dst, void *stream); +void LaunchTEXPANDS_i16_64x64_scalar50(int16_t *dst, void *stream); + +enum class DataType { F32, I32, F16, I16 }; + +struct TestCase { + const char *name; + DataType dtype; + void (*launch)(void *, void *); // Generic launch function pointer + size_t rows; // allocated tile rows + size_t cols; // allocated tile cols + size_t validRows; // effective computation rows (<= rows) + size_t validCols; // effective computation cols (<= cols) + size_t elemSize; // bytes per element +}; + +// Helper to wrap type-specific launch functions +template +void wrapLaunch(void *dst, void *stream, void (*fn)(T *, void *)) { + fn((T *)dst, stream); +} + +static const 
TestCase kCases[] = { + // ========== float32 cases ========== +{"f32_16x64_scalar5", DataType::F32, + [](void *dst, void *stream) { wrapLaunch(dst, stream, LaunchTEXPANDS_f32_16x64_scalar5); }, + 16, 64, 16, 64, sizeof(float)}, +{"f32_16x64_partial", DataType::F32, + [](void *dst, void *stream) { wrapLaunch(dst, stream, LaunchTEXPANDS_f32_16x64_partial); }, + 16, 64, 12, 48, sizeof(float)}, +}; +static constexpr size_t kNumCases = sizeof(kCases) / sizeof(kCases[0]); + +static int RunCase(const TestCase &tc, int deviceId, aclrtStream stream) { + int rc = 0; + const size_t elemCount = tc.rows * tc.cols; + const size_t fileSize = elemCount * tc.elemSize; + + std::printf("[INFO] === case: %s (shape=%zux%zu, valid=%zux%zu) ===\n", + tc.name, tc.rows, tc.cols, tc.validRows, tc.validCols); + + // Per-case data directory + std::string caseDir = std::string("./") + tc.name; + + void *dstHost = nullptr; + void *dstDevice = nullptr; + + aclrtMallocHost(&dstHost, fileSize); + aclrtMalloc(&dstDevice, fileSize, ACL_MEM_MALLOC_HUGE_FIRST); + + // Launch kernel (scalar is hardcoded in .pto) + tc.launch(dstDevice, stream); + + aclrtSynchronizeStream(stream); + aclrtMemcpy(dstHost, fileSize, dstDevice, fileSize, ACL_MEMCPY_DEVICE_TO_HOST); + + if (rc == 0 && !WriteFile((caseDir + "/output.bin").c_str(), dstHost, fileSize)) { + std::fprintf(stderr, "[ERROR] failed to write %s/output.bin\n", caseDir.c_str()); + rc = 1; + } + + if (dstDevice != nullptr) + aclrtFree(dstDevice); + if (dstHost != nullptr) + aclrtFreeHost(dstHost); + + if (rc == 0) + std::printf("[INFO] case %s done\n", tc.name); + return rc; +} + +int main(int argc, char *argv[]) { + // Optional case filter: ./texpands [case_name] + const char *caseFilter = (argc > 1) ? 
argv[1] : nullptr; + + int rc = 0; + int deviceId = 0; + aclrtStream stream = nullptr; + + aclInit(nullptr); + if (const char *envDevice = std::getenv("ACL_DEVICE_ID")) { + deviceId = std::atoi(envDevice); + } + aclrtSetDevice(deviceId); + aclrtCreateStream(&stream); + + for (size_t i = 0; i < kNumCases; ++i) { + if (caseFilter != nullptr && std::strcmp(kCases[i].name, caseFilter) != 0) { + continue; + } + int ret = RunCase(kCases[i], deviceId, stream); + if (ret != 0) { + std::fprintf(stderr, "[ERROR] case %s failed\n", kCases[i].name); + rc = 1; + break; + } + } + + if (stream != nullptr) + aclrtDestroyStream(stream); + aclrtResetDevice(deviceId); + aclFinalize(); + + return rc; +} diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/texpands/texpands.pto b/test/tilelang_st/npu/a5/src/st/smoke/testcase/texpands/texpands.pto new file mode 100644 index 000000000..9d95df754 --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/texpands/texpands.pto @@ -0,0 +1,80 @@ +// Copyright (c) 2026 Huawei Technologies Co., Ltd. +// This program is free software, you can redistribute it and/or modify it under the terms and conditions of +// CANN Open Software License Agreement Version 2.0 (the "License"). +// Please refer to the License for details. You may not use this file except in compliance with the License. +// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +// See LICENSE in the root of the software repository for the full text of the License. + +// TileLang ST kernels for pto.texpands: broadcast a scalar to a tile. +// Multiple cases with different shapes, data types, and scalar values. 
+// Compiled by ptoas --enable-insert-sync --enable-tile-op-expand --pto-backend=vpto + +module attributes {pto.target_arch = "a5", pto.kernel_kind = #pto.kernel_kind} { + // ========== float32 cases ========== + + // Case: f32 16x64, scalar=5.0 (full valid shape) + func.func @TEXPANDS_f32_16x64_scalar5(%dst_ptr: !pto.ptr) attributes {pto.kernel} { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c16 = arith.constant 16 : index + %c64 = arith.constant 64 : index + %c1024 = arith.constant 1024 : index + %scalar = arith.constant 5.0 : f32 + + %dst_view = pto.make_tensor_view %dst_ptr, + shape = [%c1, %c1, %c1, %c16, %c64], + strides = [%c1024, %c1024, %c1024, %c64, %c1] + : !pto.tensor_view<1x1x1x16x64xf32> + + %dst_part = pto.partition_view %dst_view, + offsets = [%c0, %c0, %c0, %c0, %c0], + sizes = [%c1, %c1, %c1, %c16, %c64] + : !pto.tensor_view<1x1x1x16x64xf32> -> !pto.partition_tensor_view<1x1x1x16x64xf32> + + %dst = pto.alloc_tile + : !pto.tile_buf + + pto.texpands ins(%scalar : f32) + outs(%dst : !pto.tile_buf) + + pto.tstore ins(%dst : !pto.tile_buf) + outs(%dst_part : !pto.partition_tensor_view<1x1x1x16x64xf32>) + return + } + + // Case: f32 32x32, scalar=3.0 (full valid shape) + + func.func @TEXPANDS_f32_16x64_partial(%dst_ptr: !pto.ptr) attributes {pto.kernel} { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c12 = arith.constant 12 : index + %c16 = arith.constant 16 : index + %c48 = arith.constant 48 : index + %c64 = arith.constant 64 : index + %c1024 = arith.constant 1024 : index + %scalar = arith.constant 7.0 : f32 + + %dst_view = pto.make_tensor_view %dst_ptr, + shape = [%c1, %c1, %c1, %c16, %c64], + strides = [%c1024, %c1024, %c1024, %c64, %c1] + : !pto.tensor_view<1x1x1x16x64xf32> + + %dst_part = pto.partition_view %dst_view, + offsets = [%c0, %c0, %c0, %c0, %c0], + sizes = [%c1, %c1, %c1, %c12, %c48] + : !pto.tensor_view<1x1x1x16x64xf32> -> !pto.partition_tensor_view<1x1x1x12x48xf32> + + %dst = pto.alloc_tile 
+ : !pto.tile_buf + + pto.texpands ins(%scalar : f32) + outs(%dst : !pto.tile_buf) + + pto.tstore ins(%dst : !pto.tile_buf) + outs(%dst_part : !pto.partition_tensor_view<1x1x1x12x48xf32>) + return + } + + // Case: f32 64x64, valid 60x60, scalar=42.0 +} diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/textract/CMakeLists.txt b/test/tilelang_st/npu/a5/src/st/smoke/testcase/textract/CMakeLists.txt new file mode 100644 index 000000000..76a55a9d6 --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/textract/CMakeLists.txt @@ -0,0 +1,9 @@ +# Copyright (c) 2026 Huawei Technologies Co., Ltd. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. + +pto_tilelang_cube_st(textract PTO_LEVEL level3) \ No newline at end of file diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/textract/cases.py b/test/tilelang_st/npu/a5/src/st/smoke/testcase/textract/cases.py new file mode 100644 index 000000000..e29599365 --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/textract/cases.py @@ -0,0 +1,33 @@ +# Copyright (c) 2026 Huawei Technologies Co., Ltd. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. 
+# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. + +# coding=utf-8 + +import numpy as np + + +CASES = [ + { + "name": "mat2left_f16_16x16", + "dtype_src": np.float16, + "dtype_id": np.float16, + "shape_src": (16, 16), + "shape_id": (16, 16), + "shape_out": (16, 16), + "eps": 1e-2, + }, + { + "name": "mat2right_f16_16x16", + "dtype_src": np.float16, + "dtype_id": np.float16, + "shape_src": (16, 16), + "shape_id": (16, 16), + "shape_out": (16, 16), + "eps": 1e-2, + }, +] \ No newline at end of file diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/textract/compare.py b/test/tilelang_st/npu/a5/src/st/smoke/testcase/textract/compare.py new file mode 100644 index 000000000..0a4b11f0d --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/textract/compare.py @@ -0,0 +1,45 @@ +# Copyright (c) 2026 Huawei Technologies Co., Ltd. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. 
def main():
    """Check every textract case's output.bin against its golden.bin.

    An optional first command-line argument selects a single case by name.
    Both files are read as float32 from the directory named after the case
    and reshaped to the case's ``shape_out``; the verdict for each case comes
    from ``result_cmp`` called with the case's ``eps``.

    Exits with status 2 when at least one compared case fails.
    """
    selected = sys.argv[1] if len(sys.argv) > 1 else None

    failed = False
    for case in CASES:
        name = case["name"]
        if selected is not None and name != selected:
            continue

        out_shape = case["shape_out"]
        golden_path = os.path.join(name, "golden.bin")
        output_path = os.path.join(name, "output.bin")
        golden = np.fromfile(golden_path, dtype=np.float32).reshape(out_shape)
        output = np.fromfile(output_path, dtype=np.float32).reshape(out_shape)

        if result_cmp(golden, output, case["eps"]):
            print(style_pass(f"[INFO] {name}: compare passed"))
            continue

        print(style_fail(f"[ERROR] {name}: compare failed"))
        failed = True

    if failed:
        sys.exit(2)
    print(style_pass("[INFO] all cases passed"))
# Generate the inputs and golden output for every textract case. The case-name
# prefix selects which operand is the random source matrix and which is the
# identity matrix.
for case in CASES:
    setup_case_rng(case)
    name = case["name"]

    if name.startswith("mat2left"):
        # input1 = random source, input2 = identity; golden = src @ identity in float32.
        src = np.random.uniform(-1.0, 1.0, size=case["shape_src"]).astype(case["dtype_src"])
        id_mat = np.eye(case["shape_id"][0], case["shape_id"][1], dtype=case["dtype_id"])
        golden = np.matmul(src.astype(np.float32), id_mat.astype(np.float32)).astype(np.float32)
        save_case_data(name, {"input1": src, "input2": id_mat, "golden": golden})

    elif name.startswith("mat2right"):
        # input1 = identity, input2 = random source; golden = transposed source in float32.
        # NOTE(review): the header comment in textract.pto says golden = src for
        # this path, while the transpose is written here - confirm which is intended.
        id_mat = np.eye(case["shape_id"][0], case["shape_id"][1], dtype=case["dtype_id"])
        src = np.random.uniform(-1.0, 1.0, size=case["shape_src"]).astype(case["dtype_src"])
        golden = src.astype(np.float32).T.copy()
        save_case_data(name, {"input1": id_mat, "input2": src, "golden": golden})

    else:
        # An unrecognised prefix used to fall through and still print "done"
        # without writing any data; fail loudly instead.
        raise ValueError(f"unsupported textract case name: {name}")

    print(f"[INFO] gen_data: {name} done")
+ +#include + +#ifndef AICORE +#define AICORE [aicore] +#endif + +extern "C" __global__ AICORE void TEXTRACT_M2L_f16_16x16(__gm__ uint16_t *src, __gm__ uint16_t *id, __gm__ float *out); +extern "C" __global__ AICORE void TEXTRACT_M2R_f16_16x16(__gm__ uint16_t *id, __gm__ uint16_t *src, __gm__ float *out); + +void LaunchTEXTRACT_M2L_f16_16x16(uint16_t *src, uint16_t *id, float *out, void *stream) { + TEXTRACT_M2L_f16_16x16<<<1, nullptr, stream>>>((__gm__ uint16_t *)src, (__gm__ uint16_t *)id, (__gm__ float *)out); +} + +void LaunchTEXTRACT_M2R_f16_16x16(uint16_t *id, uint16_t *src, float *out, void *stream) { + TEXTRACT_M2R_f16_16x16<<<1, nullptr, stream>>>((__gm__ uint16_t *)id, (__gm__ uint16_t *)src, (__gm__ float *)out); +} \ No newline at end of file diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/textract/main.cpp b/test/tilelang_st/npu/a5/src/st/smoke/testcase/textract/main.cpp new file mode 100644 index 000000000..0c8c6e54d --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/textract/main.cpp @@ -0,0 +1,147 @@ +// Copyright (c) 2026 Huawei Technologies Co., Ltd. +// This program is free software, you can redistribute it and/or modify it under the terms and conditions of +// CANN Open Software License Agreement Version 2.0 (the "License"). +// Please refer to the License for details. You may not use this file except in compliance with the License. +// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +// See LICENSE in the root of the software repository for the full text of the License. 
+ +#include "acl/acl.h" +#include "test_common.h" +#include +#include +#include +#include +#include + +using namespace PtoTestCommon; + +void LaunchTEXTRACT_M2L_f16_16x16(uint16_t *src, uint16_t *id, float *out, void *stream); +void LaunchTEXTRACT_M2R_f16_16x16(uint16_t *id, uint16_t *src, float *out, void *stream); + +using LaunchFn = void (*)(uint16_t *, uint16_t *, float *, void *); + +struct TestCase { + const char *name; + LaunchFn launch; + size_t input1Rows; + size_t input1Cols; + size_t input2Rows; + size_t input2Cols; + size_t outRows; + size_t outCols; +}; + +static const TestCase kCases[] = { + {"mat2left_f16_16x16", LaunchTEXTRACT_M2L_f16_16x16, 16, 16, 16, 16, 16, 16}, + {"mat2right_f16_16x16", LaunchTEXTRACT_M2R_f16_16x16, 16, 16, 16, 16, 16, 16}, +}; +static constexpr size_t kNumCases = sizeof(kCases) / sizeof(kCases[0]); + +static int RunCase(const TestCase &tc, int deviceId, aclrtStream stream) { + (void)deviceId; + int rc = 0; + const size_t i1Elems = tc.input1Rows * tc.input1Cols; + const size_t i2Elems = tc.input2Rows * tc.input2Cols; + const size_t outElems = tc.outRows * tc.outCols; + const size_t i1Bytes = i1Elems * sizeof(uint16_t); + const size_t i2Bytes = i2Elems * sizeof(uint16_t); + const size_t outBytes = outElems * sizeof(float); + size_t i1FileSize = i1Bytes; + size_t i2FileSize = i2Bytes; + + std::printf( + "[INFO] === case: %s (i1=%zux%zu, i2=%zux%zu, out=%zux%zu) ===\n", + tc.name, tc.input1Rows, tc.input1Cols, tc.input2Rows, tc.input2Cols, tc.outRows, tc.outCols + ); + + std::string caseDir = std::string("./") + tc.name; + + void *i1Host = nullptr; + void *i2Host = nullptr; + void *outHost = nullptr; + void *i1Device = nullptr; + void *i2Device = nullptr; + void *outDevice = nullptr; + + aclrtMallocHost(&i1Host, i1Bytes); + aclrtMallocHost(&i2Host, i2Bytes); + aclrtMallocHost(&outHost, outBytes); + + aclrtMalloc(&i1Device, i1Bytes, ACL_MEM_MALLOC_HUGE_FIRST); + aclrtMalloc(&i2Device, i2Bytes, ACL_MEM_MALLOC_HUGE_FIRST); + 
aclrtMalloc(&outDevice, outBytes, ACL_MEM_MALLOC_HUGE_FIRST); + + if (!ReadFile((caseDir + "/input1.bin").c_str(), i1FileSize, i1Host, i1Bytes)) { + std::fprintf(stderr, "[ERROR] failed to read %s/input1.bin\n", caseDir.c_str()); + rc = 1; + } + if (rc == 0 && !ReadFile((caseDir + "/input2.bin").c_str(), i2FileSize, i2Host, i2Bytes)) { + std::fprintf(stderr, "[ERROR] failed to read %s/input2.bin\n", caseDir.c_str()); + rc = 1; + } + + if (rc == 0) { + aclrtMemcpy(i1Device, i1Bytes, i1Host, i1Bytes, ACL_MEMCPY_HOST_TO_DEVICE); + aclrtMemcpy(i2Device, i2Bytes, i2Host, i2Bytes, ACL_MEMCPY_HOST_TO_DEVICE); + + tc.launch( + static_cast(i1Device), + static_cast(i2Device), + static_cast(outDevice), + stream + ); + + aclrtSynchronizeStream(stream); + aclrtMemcpy(outHost, outBytes, outDevice, outBytes, ACL_MEMCPY_DEVICE_TO_HOST); + } + + if (rc == 0 && !WriteFile((caseDir + "/output.bin").c_str(), outHost, outBytes)) { + std::fprintf(stderr, "[ERROR] failed to write %s/output.bin\n", caseDir.c_str()); + rc = 1; + } + + if (i1Device != nullptr) aclrtFree(i1Device); + if (i2Device != nullptr) aclrtFree(i2Device); + if (outDevice != nullptr) aclrtFree(outDevice); + if (i1Host != nullptr) aclrtFreeHost(i1Host); + if (i2Host != nullptr) aclrtFreeHost(i2Host); + if (outHost != nullptr) aclrtFreeHost(outHost); + + if (rc == 0) + std::printf("[INFO] case %s done\n", tc.name); + return rc; +} + +int main(int argc, char *argv[]) { + const char *caseFilter = (argc > 1) ? 
argv[1] : nullptr; + + int rc = 0; + int deviceId = 0; + aclrtStream stream = nullptr; + + aclInit(nullptr); + if (const char *envDevice = std::getenv("ACL_DEVICE_ID")) { + deviceId = std::atoi(envDevice); + } + aclrtSetDevice(deviceId); + aclrtCreateStream(&stream); + + for (size_t i = 0; i < kNumCases; ++i) { + if (caseFilter != nullptr && std::strcmp(kCases[i].name, caseFilter) != 0) { + continue; + } + int ret = RunCase(kCases[i], deviceId, stream); + if (ret != 0) { + std::fprintf(stderr, "[ERROR] case %s failed\n", kCases[i].name); + rc = 1; + break; + } + } + + if (stream != nullptr) aclrtDestroyStream(stream); + aclrtResetDevice(deviceId); + aclFinalize(); + + return rc; +} \ No newline at end of file diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/textract/textract.pto b/test/tilelang_st/npu/a5/src/st/smoke/testcase/textract/textract.pto new file mode 100644 index 000000000..ba749f84d --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/textract/textract.pto @@ -0,0 +1,176 @@ +// Copyright (c) 2026 Huawei Technologies Co., Ltd. +// This program is free software, you can redistribute it and/or modify it under the terms and conditions of +// CANN Open Software License Agreement Version 2.0 (the "License"). +// Please refer to the License for details. You may not use this file except in compliance with the License. +// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +// See LICENSE in the root of the software repository for the full text of the License. + +// TileLang ST kernels for textract (Mat->Left, Mat->Right) paths. +// +// Uses pto.textract for the tested path and mte surfaces for the readback path. +// Golden = src (because id x src = src for M2R, src x id = src for M2L). 
+ +module attributes {pto.target_arch = "a5", pto.kernel_kind = #pto.kernel_kind} { + + // ---- Mat->Left: textract(src_mat -> left), mte(id_mat -> right), tmatmul -> acc -> GM ---- + func.func @TEXTRACT_M2L_f16_16x16(%src_gm: !pto.ptr, %id_gm: !pto.ptr, %out_gm: !pto.ptr) attributes {pto.aicore} { + %c0_i64 = arith.constant 0 : i64 + %c1_i64 = arith.constant 1 : i64 + %c16_i64 = arith.constant 16 : i64 + %c32_i64 = arith.constant 32 : i64 + %c512_i64 = arith.constant 512 : i64 + %c0_index = arith.constant 0 : index + %false = arith.constant false + + %src_tile = pto.alloc_tile addr = %c0_i64 + : !pto.tile_buf + %id_tile = pto.alloc_tile addr = %c512_i64 + : !pto.tile_buf + %left_tile = pto.alloc_tile addr = %c0_i64 + : !pto.tile_buf + %right_tile = pto.alloc_tile addr = %c0_i64 + : !pto.tile_buf + %acc_tile = pto.alloc_tile addr = %c0_i64 + : !pto.tile_buf + + %src_l1 = pto.tile_buf_addr %src_tile + : !pto.tile_buf + -> !pto.ptr + %id_l1 = pto.tile_buf_addr %id_tile + : !pto.tile_buf + -> !pto.ptr + %right_ptr = pto.tile_buf_addr %right_tile + : !pto.tile_buf + -> !pto.ptr + %acc_ptr = pto.tile_buf_addr %acc_tile + : !pto.tile_buf + -> !pto.ptr + + // GM -> L1: load src (textract target) + pto.mte_gm_l1_frac %src_gm, %src_l1, nd2nz, + shape(%c16_i64, %c16_i64), + src_layout(%c32_i64), + dst_group(%c1_i64, %c1_i64, %c16_i64, %c0_i64), + ctrl(%c0_i64, %false) + : !pto.ptr, !pto.ptr, nd2nz, + shape i64, i64, src_layout(i64), + dst_group i64, i64, i64, i64, ctrl i64, i1 + + // GM -> L1: load identity (readback operand) + pto.mte_gm_l1_frac %id_gm, %id_l1, nd2nz, + shape(%c16_i64, %c16_i64), + src_layout(%c32_i64), + dst_group(%c1_i64, %c1_i64, %c16_i64, %c0_i64), + ctrl(%c0_i64, %false) + : !pto.ptr, !pto.ptr, nd2nz, + shape i64, i64, src_layout(i64), + dst_group i64, i64, i64, i64, ctrl i64, i1 + pto.set_flag["PIPE_MTE2", "PIPE_MTE1", "EVENT_ID0"] + pto.wait_flag["PIPE_MTE2", "PIPE_MTE1", "EVENT_ID0"] + + // textract Mat->Left: src_mat -> left (offset=0) + 
pto.textract ins(%src_tile, %c0_index, %c0_index : !pto.tile_buf, index, index) + outs(%left_tile : !pto.tile_buf) + + // mte L1->L0B: identity -> right (readback via proven mte surface) + pto.mte_l1_l0b %id_l1, %right_ptr, %c16_i64, %c16_i64, %c0_i64, %c0_i64 {transpose = true} + : !pto.ptr, !pto.ptr, i64, i64, i64, i64 + + pto.set_flag["PIPE_MTE1", "PIPE_M", "EVENT_ID0"] + pto.wait_flag["PIPE_MTE1", "PIPE_M", "EVENT_ID0"] + pto.tmatmul ins(%left_tile, %right_tile : !pto.tile_buf, + !pto.tile_buf) + outs(%acc_tile : !pto.tile_buf) + + pto.set_flag["PIPE_M", "PIPE_FIX", "EVENT_ID1"] + pto.wait_flag["PIPE_M", "PIPE_FIX", "EVENT_ID1"] + pto.mte_l0c_gm %acc_ptr, %out_gm, %c16_i64, %c16_i64, %c16_i64, %c16_i64, %c0_i64, %c0_i64, + nz2nd + : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64, i64 + pto.barrier #pto.pipe + return + } + + // ---- Mat->Right: mte(id_mat -> left), textract(src_mat -> right), tmatmul -> acc -> GM ---- + func.func @TEXTRACT_M2R_f16_16x16(%id_gm: !pto.ptr, %src_gm: !pto.ptr, %out_gm: !pto.ptr) attributes {pto.aicore} { + %c0_i64 = arith.constant 0 : i64 + %c1_i64 = arith.constant 1 : i64 + %c16_i64 = arith.constant 16 : i64 + %c32_i64 = arith.constant 32 : i64 + %c512_i64 = arith.constant 512 : i64 + %c0_index = arith.constant 0 : index + %false = arith.constant false + + %id_tile = pto.alloc_tile addr = %c0_i64 + : !pto.tile_buf + %src_tile = pto.alloc_tile addr = %c512_i64 + : !pto.tile_buf + %left_tile = pto.alloc_tile addr = %c0_i64 + : !pto.tile_buf + %right_tile = pto.alloc_tile addr = %c0_i64 + : !pto.tile_buf + %acc_tile = pto.alloc_tile addr = %c0_i64 + : !pto.tile_buf + + %id_l1 = pto.tile_buf_addr %id_tile + : !pto.tile_buf + -> !pto.ptr + %src_l1 = pto.tile_buf_addr %src_tile + : !pto.tile_buf + -> !pto.ptr + %left_ptr = pto.tile_buf_addr %left_tile + : !pto.tile_buf + -> !pto.ptr + %acc_ptr = pto.tile_buf_addr %acc_tile + : !pto.tile_buf + -> !pto.ptr + + // GM -> L1: load identity (readback operand) + pto.mte_gm_l1_frac %id_gm, %id_l1, 
nd2nz, + shape(%c16_i64, %c16_i64), + src_layout(%c32_i64), + dst_group(%c1_i64, %c1_i64, %c16_i64, %c0_i64), + ctrl(%c0_i64, %false) + : !pto.ptr, !pto.ptr, nd2nz, + shape i64, i64, src_layout(i64), + dst_group i64, i64, i64, i64, ctrl i64, i1 + pto.set_flag["PIPE_MTE2", "PIPE_MTE1", "EVENT_ID0"] + pto.wait_flag["PIPE_MTE2", "PIPE_MTE1", "EVENT_ID0"] + + // mte L1->L0A: identity -> left (readback via proven mte surface) + pto.mte_l1_l0a %id_l1, %left_ptr, %c16_i64, %c16_i64, %c0_i64, %c0_i64 + : !pto.ptr, !pto.ptr, i64, i64, i64, i64 + + // GM -> L1: load src (textract target) + pto.mte_gm_l1_frac %src_gm, %src_l1, nd2nz, + shape(%c16_i64, %c16_i64), + src_layout(%c32_i64), + dst_group(%c1_i64, %c1_i64, %c16_i64, %c0_i64), + ctrl(%c0_i64, %false) + : !pto.ptr, !pto.ptr, nd2nz, + shape i64, i64, src_layout(i64), + dst_group i64, i64, i64, i64, ctrl i64, i1 + pto.set_flag["PIPE_MTE2", "PIPE_MTE1", "EVENT_ID1"] + pto.wait_flag["PIPE_MTE2", "PIPE_MTE1", "EVENT_ID1"] + + // textract Mat->Right: src_mat -> right (offset=0) + pto.textract ins(%src_tile, %c0_index, %c0_index : !pto.tile_buf, index, index) + outs(%right_tile : !pto.tile_buf) + + pto.set_flag["PIPE_MTE1", "PIPE_M", "EVENT_ID0"] + pto.wait_flag["PIPE_MTE1", "PIPE_M", "EVENT_ID0"] + pto.tmatmul ins(%left_tile, %right_tile : !pto.tile_buf, + !pto.tile_buf) + outs(%acc_tile : !pto.tile_buf) + + pto.set_flag["PIPE_M", "PIPE_FIX", "EVENT_ID1"] + pto.wait_flag["PIPE_M", "PIPE_FIX", "EVENT_ID1"] + pto.mte_l0c_gm %acc_ptr, %out_gm, %c16_i64, %c16_i64, %c16_i64, %c16_i64, %c0_i64, %c0_i64, + nz2nd + : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64, i64 + pto.barrier #pto.pipe + return + } + + } \ No newline at end of file diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/textract_fp/CMakeLists.txt b/test/tilelang_st/npu/a5/src/st/smoke/testcase/textract_fp/CMakeLists.txt new file mode 100644 index 000000000..59e4cdc16 --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/textract_fp/CMakeLists.txt 
@@ -0,0 +1,9 @@ +# Copyright (c) 2026 Huawei Technologies Co., Ltd. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. + +pto_tilelang_cube_st(textract_fp PTO_LEVEL level3) \ No newline at end of file diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/textract_fp/cases.py b/test/tilelang_st/npu/a5/src/st/smoke/testcase/textract_fp/cases.py new file mode 100644 index 000000000..24db5eb4e --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/textract_fp/cases.py @@ -0,0 +1,25 @@ +# Copyright (c) 2026 Huawei Technologies Co., Ltd. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. 
+ +# coding=utf-8 + +import numpy as np + + +CASES = [ + { + "name": "fp_f32_f16_16x16", + "dtype_src": np.float16, + "dtype_scaling": np.float32, + "dtype_out": np.float32, + "shape_src": (16, 16), + "shape_scaling": (16, 16), + "shape_out": (16, 16), + "eps": 1e-2, + }, +] \ No newline at end of file diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/textract_fp/compare.py b/test/tilelang_st/npu/a5/src/st/smoke/testcase/textract_fp/compare.py new file mode 100644 index 000000000..12d3e78e4 --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/textract_fp/compare.py @@ -0,0 +1,45 @@ +# Copyright (c) 2026 Huawei Technologies Co., Ltd. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. 
+ +# coding=utf-8 + +import os +import sys +import numpy as np + +from cases import CASES +from st_common import result_cmp, style_fail, style_pass + + +def main(): + case_filter = sys.argv[1] if len(sys.argv) > 1 else None + + all_passed = True + for case in CASES: + if case_filter is not None and case["name"] != case_filter: + continue + + case_dir = case["name"] + shape_out = case["shape_out"] + golden = np.fromfile(os.path.join(case_dir, "golden.bin"), dtype=np.float32).reshape(shape_out) + output = np.fromfile(os.path.join(case_dir, "output.bin"), dtype=np.float32).reshape(shape_out) + + ok = result_cmp(golden, output, case["eps"]) + if ok: + print(style_pass(f"[INFO] {case['name']}: compare passed")) + else: + print(style_fail(f"[ERROR] {case['name']}: compare failed")) + all_passed = False + + if not all_passed: + sys.exit(2) + print(style_pass("[INFO] all cases passed")) + + +if __name__ == "__main__": + main() diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/textract_fp/gen_data.py b/test/tilelang_st/npu/a5/src/st/smoke/testcase/textract_fp/gen_data.py new file mode 100644 index 000000000..33c6cc911 --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/textract_fp/gen_data.py @@ -0,0 +1,31 @@ +# Copyright (c) 2026 Huawei Technologies Co., Ltd. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. 
+ +# coding=utf-8 + +import numpy as np + +from cases import CASES +from st_common import setup_case_rng, save_case_data + + +for case in CASES: + setup_case_rng(case) + name = case["name"] + + if name.startswith("fp"): + a = np.random.uniform(-1.0, 1.0, size=case["shape_src"]).astype(case["dtype_src"]) + b = np.random.uniform(-1.0, 1.0, size=case["shape_src"]).astype(case["dtype_src"]) + fb = np.ones(case["shape_scaling"], dtype=case["dtype_scaling"]) + id_mat = np.eye(case["shape_src"][0], dtype=case["dtype_src"]) + matmul_f32 = np.matmul(a.astype(np.float32), b.astype(np.float32)) + quantized_f16 = matmul_f32.astype(np.float16) + golden = np.matmul(quantized_f16.astype(np.float32), id_mat.astype(np.float32)) + save_case_data(name, {"input1": a, "input2": b, "input3": fb, "input4": id_mat, "golden": golden}) + + print(f"[INFO] gen_data: {name} done") diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/textract_fp/launch.cpp b/test/tilelang_st/npu/a5/src/st/smoke/testcase/textract_fp/launch.cpp new file mode 100644 index 000000000..b137e2fe2 --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/textract_fp/launch.cpp @@ -0,0 +1,19 @@ +// Copyright (c) 2026 Huawei Technologies Co., Ltd. +// This program is free software, you can redistribute it and/or modify it under the terms and conditions of +// CANN Open Software License Agreement Version 2.0 (the "License"). +// Please refer to the License for details. You may not use this file except in compliance with the License. +// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +// See LICENSE in the root of the software repository for the full text of the License. 
+ +#include + +#ifndef AICORE +#define AICORE [aicore] +#endif + +extern "C" __global__ AICORE void TEXTRACT_FP_f32_f16_16x16(__gm__ uint16_t *a, __gm__ uint16_t *b, __gm__ float *fb, __gm__ uint16_t *id, __gm__ float *out); + +void LaunchTEXTRACT_FP_f32_f16_16x16(uint16_t *a, uint16_t *b, float *fb, uint16_t *id, float *out, void *stream) { + TEXTRACT_FP_f32_f16_16x16<<<1, nullptr, stream>>>((__gm__ uint16_t *)a, (__gm__ uint16_t *)b, (__gm__ float *)fb, (__gm__ uint16_t *)id, (__gm__ float *)out); +} diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/textract_fp/main.cpp b/test/tilelang_st/npu/a5/src/st/smoke/testcase/textract_fp/main.cpp new file mode 100644 index 000000000..6518ade85 --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/textract_fp/main.cpp @@ -0,0 +1,179 @@ +// Copyright (c) 2026 Huawei Technologies Co., Ltd. +// This program is free software, you can redistribute it and/or modify it under the terms and conditions of +// CANN Open Software License Agreement Version 2.0 (the "License"). +// Please refer to the License for details. You may not use this file except in compliance with the License. +// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +// See LICENSE in the root of the software repository for the full text of the License. 
+ +#include "acl/acl.h" +#include "test_common.h" +#include +#include +#include +#include +#include + +using namespace PtoTestCommon; + +void LaunchTEXTRACT_FP_f32_f16_16x16(uint16_t *a, uint16_t *b, float *fb, uint16_t *id, float *out, void *stream); + +using LaunchFn = void (*)(uint16_t *, uint16_t *, float *, uint16_t *, float *, void *); + +struct TestCase { + const char *name; + LaunchFn launch; + size_t aRows; + size_t aCols; + size_t bRows; + size_t bCols; + size_t fbRows; + size_t fbCols; + size_t idRows; + size_t idCols; + size_t outRows; + size_t outCols; +}; + +static const TestCase kCases[] = { + {"fp_f32_f16_16x16", LaunchTEXTRACT_FP_f32_f16_16x16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16}, +}; +static constexpr size_t kNumCases = sizeof(kCases) / sizeof(kCases[0]); + +static int RunCase(const TestCase &tc, int deviceId, aclrtStream stream) { + (void)deviceId; + int rc = 0; + const size_t aElems = tc.aRows * tc.aCols; + const size_t bElems = tc.bRows * tc.bCols; + const size_t fbElems = tc.fbRows * tc.fbCols; + const size_t idElems = tc.idRows * tc.idCols; + const size_t outElems = tc.outRows * tc.outCols; + const size_t aBytes = aElems * sizeof(uint16_t); + const size_t bBytes = bElems * sizeof(uint16_t); + const size_t fbBytes = fbElems * sizeof(float); + const size_t idBytes = idElems * sizeof(uint16_t); + const size_t outBytes = outElems * sizeof(float); + size_t aFileSize = aBytes; + size_t bFileSize = bBytes; + size_t fbFileSize = fbBytes; + size_t idFileSize = idBytes; + + std::printf( + "[INFO] === case: %s (a=%zux%zu, b=%zux%zu, fb=%zux%zu, id=%zux%zu, out=%zux%zu) ===\n", + tc.name, tc.aRows, tc.aCols, tc.bRows, tc.bCols, tc.fbRows, tc.fbCols, tc.idRows, tc.idCols, tc.outRows, tc.outCols + ); + + std::string caseDir = std::string("./") + tc.name; + + void *aHost = nullptr; + void *bHost = nullptr; + void *fbHost = nullptr; + void *idHost = nullptr; + void *outHost = nullptr; + void *aDevice = nullptr; + void *bDevice = nullptr; + void *fbDevice 
= nullptr; + void *idDevice = nullptr; + void *outDevice = nullptr; + + aclrtMallocHost(&aHost, aBytes); + aclrtMallocHost(&bHost, bBytes); + aclrtMallocHost(&fbHost, fbBytes); + aclrtMallocHost(&idHost, idBytes); + aclrtMallocHost(&outHost, outBytes); + + aclrtMalloc(&aDevice, aBytes, ACL_MEM_MALLOC_HUGE_FIRST); + aclrtMalloc(&bDevice, bBytes, ACL_MEM_MALLOC_HUGE_FIRST); + aclrtMalloc(&fbDevice, fbBytes, ACL_MEM_MALLOC_HUGE_FIRST); + aclrtMalloc(&idDevice, idBytes, ACL_MEM_MALLOC_HUGE_FIRST); + aclrtMalloc(&outDevice, outBytes, ACL_MEM_MALLOC_HUGE_FIRST); + + if (!ReadFile((caseDir + "/input1.bin").c_str(), aFileSize, aHost, aBytes)) { + std::fprintf(stderr, "[ERROR] failed to read %s/input1.bin\n", caseDir.c_str()); + rc = 1; + } + if (rc == 0 && !ReadFile((caseDir + "/input2.bin").c_str(), bFileSize, bHost, bBytes)) { + std::fprintf(stderr, "[ERROR] failed to read %s/input2.bin\n", caseDir.c_str()); + rc = 1; + } + if (rc == 0 && !ReadFile((caseDir + "/input3.bin").c_str(), fbFileSize, fbHost, fbBytes)) { + std::fprintf(stderr, "[ERROR] failed to read %s/input3.bin\n", caseDir.c_str()); + rc = 1; + } + if (rc == 0 && !ReadFile((caseDir + "/input4.bin").c_str(), idFileSize, idHost, idBytes)) { + std::fprintf(stderr, "[ERROR] failed to read %s/input4.bin\n", caseDir.c_str()); + rc = 1; + } + + if (rc == 0) { + aclrtMemcpy(aDevice, aBytes, aHost, aBytes, ACL_MEMCPY_HOST_TO_DEVICE); + aclrtMemcpy(bDevice, bBytes, bHost, bBytes, ACL_MEMCPY_HOST_TO_DEVICE); + aclrtMemcpy(fbDevice, fbBytes, fbHost, fbBytes, ACL_MEMCPY_HOST_TO_DEVICE); + aclrtMemcpy(idDevice, idBytes, idHost, idBytes, ACL_MEMCPY_HOST_TO_DEVICE); + + tc.launch( + static_cast(aDevice), + static_cast(bDevice), + static_cast(fbDevice), + static_cast(idDevice), + static_cast(outDevice), + stream + ); + + aclrtSynchronizeStream(stream); + aclrtMemcpy(outHost, outBytes, outDevice, outBytes, ACL_MEMCPY_DEVICE_TO_HOST); + } + + if (rc == 0 && !WriteFile((caseDir + "/output.bin").c_str(), outHost, outBytes)) { + 
std::fprintf(stderr, "[ERROR] failed to write %s/output.bin\n", caseDir.c_str()); + rc = 1; + } + + if (aDevice != nullptr) aclrtFree(aDevice); + if (bDevice != nullptr) aclrtFree(bDevice); + if (fbDevice != nullptr) aclrtFree(fbDevice); + if (idDevice != nullptr) aclrtFree(idDevice); + if (outDevice != nullptr) aclrtFree(outDevice); + if (aHost != nullptr) aclrtFreeHost(aHost); + if (bHost != nullptr) aclrtFreeHost(bHost); + if (fbHost != nullptr) aclrtFreeHost(fbHost); + if (idHost != nullptr) aclrtFreeHost(idHost); + if (outHost != nullptr) aclrtFreeHost(outHost); + + if (rc == 0) + std::printf("[INFO] case %s done\n", tc.name); + return rc; +} + +int main(int argc, char *argv[]) { + const char *caseFilter = (argc > 1) ? argv[1] : nullptr; + + int rc = 0; + int deviceId = 0; + aclrtStream stream = nullptr; + + aclInit(nullptr); + if (const char *envDevice = std::getenv("ACL_DEVICE_ID")) { + deviceId = std::atoi(envDevice); + } + aclrtSetDevice(deviceId); + aclrtCreateStream(&stream); + + for (size_t i = 0; i < kNumCases; ++i) { + if (caseFilter != nullptr && std::strcmp(kCases[i].name, caseFilter) != 0) { + continue; + } + int ret = RunCase(kCases[i], deviceId, stream); + if (ret != 0) { + std::fprintf(stderr, "[ERROR] case %s failed\n", kCases[i].name); + rc = 1; + break; + } + } + + if (stream != nullptr) aclrtDestroyStream(stream); + aclrtResetDevice(deviceId); + aclFinalize(); + + return rc; +} diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/textract_fp/textract_fp.pto b/test/tilelang_st/npu/a5/src/st/smoke/testcase/textract_fp/textract_fp.pto new file mode 100644 index 000000000..7b21f4756 --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/textract_fp/textract_fp.pto @@ -0,0 +1,182 @@ +// Copyright (c) 2026 Huawei Technologies Co., Ltd. +// This program is free software, you can redistribute it and/or modify it under the terms and conditions of +// CANN Open Software License Agreement Version 2.0 (the "License"). 
+// Please refer to the License for details. You may not use this file except in compliance with the License. +// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +// See LICENSE in the root of the software repository for the full text of the License. + +// TileLang ST kernel for textract_fp (Acc->Mat with pre_quant f32->f16). +// +// Pipeline: mte_gm_l1_frac(A,B) -> mte_l1_l0a/l0b -> tmatmul -> acc (L0C) +// mte_gm_l1_frac(fb) -> mte_l1_fb -> scaling +// textract_fp(acc, fp, 0, 0) -> dst_mat (L1, f16) +// Readback: dst_mat x identity -> acc2 -> GM (f32) + +module attributes {pto.target_arch = "a5", pto.kernel_kind = #pto.kernel_kind} { + + func.func @TEXTRACT_FP_f32_f16_16x16(%a_gm: !pto.ptr, %b_gm: !pto.ptr, %fb_gm: !pto.ptr, %id_gm: !pto.ptr, %out_gm: !pto.ptr) attributes {pto.aicore} { + %c0_i64 = arith.constant 0 : i64 + %c1_i64 = arith.constant 1 : i64 + %c16_i64 = arith.constant 16 : i64 + %c32_i64 = arith.constant 32 : i64 + %c64_i64 = arith.constant 64 : i64 + %c512_i64 = arith.constant 512 : i64 + %c1024_i64 = arith.constant 1024 : i64 + %c1536_i64 = arith.constant 1536 : i64 + %c0_index = arith.constant 0 : index + %false = arith.constant false + + // --- tile allocations --- + %a_mat = pto.alloc_tile addr = %c0_i64 + : !pto.tile_buf + %b_mat = pto.alloc_tile addr = %c512_i64 + : !pto.tile_buf + %fb_mat = pto.alloc_tile addr = %c1024_i64 + : !pto.tile_buf + %id_mat = pto.alloc_tile addr = %c1536_i64 + : !pto.tile_buf + + %left_tile = pto.alloc_tile addr = %c0_i64 + : !pto.tile_buf + %right_tile = pto.alloc_tile addr = %c0_i64 + : !pto.tile_buf + %acc_tile = pto.alloc_tile addr = %c0_i64 + : !pto.tile_buf + %fp_tile = pto.alloc_tile addr = %c0_i64 + : !pto.tile_buf + %dst_mat = pto.alloc_tile addr = %c0_i64 + : !pto.tile_buf + + // --- pointers --- + %a_l1 = pto.tile_buf_addr %a_mat + : 
!pto.tile_buf + -> !pto.ptr + %b_l1 = pto.tile_buf_addr %b_mat + : !pto.tile_buf + -> !pto.ptr + %fb_l1 = pto.tile_buf_addr %fb_mat + : !pto.tile_buf + -> !pto.ptr + %id_l1 = pto.tile_buf_addr %id_mat + : !pto.tile_buf + -> !pto.ptr + %left_ptr = pto.tile_buf_addr %left_tile + : !pto.tile_buf + -> !pto.ptr + %right_ptr = pto.tile_buf_addr %right_tile + : !pto.tile_buf + -> !pto.ptr + %acc_ptr = pto.tile_buf_addr %acc_tile + : !pto.tile_buf + -> !pto.ptr + %fp_ptr = pto.tile_buf_addr %fp_tile + : !pto.tile_buf + -> !pto.ptr + %dst_mat_ptr = pto.tile_buf_addr %dst_mat + : !pto.tile_buf + -> !pto.ptr + + // --- GM -> L1: load A, B --- + pto.mte_gm_l1_frac %a_gm, %a_l1, nd2nz, + shape(%c16_i64, %c16_i64), + src_layout(%c32_i64), + dst_group(%c1_i64, %c1_i64, %c16_i64, %c0_i64), + ctrl(%c0_i64, %false) + : !pto.ptr, !pto.ptr, nd2nz, + shape i64, i64, src_layout(i64), + dst_group i64, i64, i64, i64, ctrl i64, i1 + pto.mte_gm_l1_frac %b_gm, %b_l1, nd2nz, + shape(%c16_i64, %c16_i64), + src_layout(%c32_i64), + dst_group(%c1_i64, %c1_i64, %c16_i64, %c0_i64), + ctrl(%c0_i64, %false) + : !pto.ptr, !pto.ptr, nd2nz, + shape i64, i64, src_layout(i64), + dst_group i64, i64, i64, i64, ctrl i64, i1 + pto.set_flag["PIPE_MTE2", "PIPE_MTE1", "EVENT_ID0"] + pto.wait_flag["PIPE_MTE2", "PIPE_MTE1", "EVENT_ID0"] + + // --- L1 -> L0A/L0B --- + pto.mte_l1_l0a %a_l1, %left_ptr, %c16_i64, %c16_i64, %c0_i64, %c0_i64 + : !pto.ptr, !pto.ptr, i64, i64, i64, i64 + pto.mte_l1_l0b %b_l1, %right_ptr, %c16_i64, %c16_i64, %c0_i64, %c0_i64 {transpose = true} + : !pto.ptr, !pto.ptr, i64, i64, i64, i64 + + // --- tmatmul -> acc --- + pto.set_flag["PIPE_MTE1", "PIPE_M", "EVENT_ID0"] + pto.wait_flag["PIPE_MTE1", "PIPE_M", "EVENT_ID0"] + pto.tmatmul ins(%left_tile, %right_tile : !pto.tile_buf, + !pto.tile_buf) + outs(%acc_tile : !pto.tile_buf) + pto.set_flag["PIPE_M", "PIPE_FIX", "EVENT_ID0"] + pto.wait_flag["PIPE_M", "PIPE_FIX", "EVENT_ID0"] + + // --- GM -> L1: load fb scaling params --- + 
pto.mte_gm_l1_frac %fb_gm, %fb_l1, nd2nz, + shape(%c16_i64, %c16_i64), + src_layout(%c64_i64), + dst_group(%c1_i64, %c1_i64, %c16_i64, %c0_i64), + ctrl(%c0_i64, %false) + : !pto.ptr, !pto.ptr, nd2nz, + shape i64, i64, src_layout(i64), + dst_group i64, i64, i64, i64, ctrl i64, i1 + pto.set_flag["PIPE_MTE2", "PIPE_FIX", "EVENT_ID1"] + pto.wait_flag["PIPE_MTE2", "PIPE_FIX", "EVENT_ID1"] + + // --- L1 -> fb: load scaling to Scaling buffer --- + pto.mte_l1_fb %fb_l1, %fp_ptr, %c16_i64 + nburst(%c1_i64, %c0_i64, %c0_i64) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64 + + // --- textract_fp: acc(f32) -> mat(f16) with pre_quant --- + pto.set_flag["PIPE_M", "PIPE_FIX", "EVENT_ID0"] + pto.wait_flag["PIPE_M", "PIPE_FIX", "EVENT_ID0"] + pto.textract_fp ins(%acc_tile, %fp_tile, %c0_index, %c0_index : + !pto.tile_buf, + !pto.tile_buf, index, index) + outs(%dst_mat : !pto.tile_buf) + + // --- Readback: textract_fp output x identity -> acc -> GM --- + + // FIX -> MTE1: ensure textract_fp completed before reading dst_mat + pto.set_flag["PIPE_FIX", "PIPE_MTE1", "EVENT_ID0"] + pto.wait_flag["PIPE_FIX", "PIPE_MTE1", "EVENT_ID0"] + + // GM -> L1: load identity matrix + pto.mte_gm_l1_frac %id_gm, %id_l1, nd2nz, + shape(%c16_i64, %c16_i64), + src_layout(%c32_i64), + dst_group(%c1_i64, %c1_i64, %c16_i64, %c0_i64), + ctrl(%c0_i64, %false) + : !pto.ptr, !pto.ptr, nd2nz, + shape i64, i64, src_layout(i64), + dst_group i64, i64, i64, i64, ctrl i64, i1 + pto.set_flag["PIPE_MTE2", "PIPE_MTE1", "EVENT_ID1"] + pto.wait_flag["PIPE_MTE2", "PIPE_MTE1", "EVENT_ID1"] + + // L1 -> L0A: textract_fp output -> left + pto.mte_l1_l0a %dst_mat_ptr, %left_ptr, %c16_i64, %c16_i64, %c0_i64, %c0_i64 + : !pto.ptr, !pto.ptr, i64, i64, i64, i64 + // L1 -> L0B: identity -> right + pto.mte_l1_l0b %id_l1, %right_ptr, %c16_i64, %c16_i64, %c0_i64, %c0_i64 {transpose = true} + : !pto.ptr, !pto.ptr, i64, i64, i64, i64 + + // readback tmatmul: dst_mat x identity -> acc + pto.set_flag["PIPE_MTE1", "PIPE_M", "EVENT_ID0"] + 
pto.wait_flag["PIPE_MTE1", "PIPE_M", "EVENT_ID0"] + pto.tmatmul ins(%left_tile, %right_tile : !pto.tile_buf, + !pto.tile_buf) + outs(%acc_tile : !pto.tile_buf) + + // acc -> GM + pto.set_flag["PIPE_M", "PIPE_FIX", "EVENT_ID1"] + pto.wait_flag["PIPE_M", "PIPE_FIX", "EVENT_ID1"] + pto.mte_l0c_gm %acc_ptr, %out_gm, %c16_i64, %c16_i64, %c16_i64, %c16_i64, %c0_i64, %c0_i64, + nz2nd + : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64, i64 + pto.barrier #pto.pipe + return + } + + } diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/textract_v2v/CMakeLists.txt b/test/tilelang_st/npu/a5/src/st/smoke/testcase/textract_v2v/CMakeLists.txt new file mode 100644 index 000000000..0e7ef24ae --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/textract_v2v/CMakeLists.txt @@ -0,0 +1,9 @@ +# Copyright (c) 2026 Huawei Technologies Co., Ltd. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. + +pto_tilelang_vec_st(textract_v2v PTO_LEVEL level3) \ No newline at end of file diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/textract_v2v/cases.py b/test/tilelang_st/npu/a5/src/st/smoke/testcase/textract_v2v/cases.py new file mode 100644 index 000000000..7355b2e12 --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/textract_v2v/cases.py @@ -0,0 +1,23 @@ +# Copyright (c) 2026 Huawei Technologies Co., Ltd. 
+# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. + +# coding=utf-8 + +import numpy as np + + +CASES = [ + { + "name": "v2v_f32_16x16", + "dtype_src": np.float32, + "dtype_out": np.float32, + "shape_src": (16, 16), + "shape_out": (16, 16), + "eps": 1e-6, + }, +] \ No newline at end of file diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/textract_v2v/compare.py b/test/tilelang_st/npu/a5/src/st/smoke/testcase/textract_v2v/compare.py new file mode 100644 index 000000000..7c9b50d37 --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/textract_v2v/compare.py @@ -0,0 +1,45 @@ +# Copyright (c) 2026 Huawei Technologies Co., Ltd. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. 
+ +# coding=utf-8 + +import os +import sys +import numpy as np + +from cases import CASES +from st_common import result_cmp, style_fail, style_pass + + +def main(): + case_filter = sys.argv[1] if len(sys.argv) > 1 else None + + all_passed = True + for case in CASES: + if case_filter is not None and case["name"] != case_filter: + continue + + case_dir = case["name"] + shape_out = case["shape_out"] + golden = np.fromfile(os.path.join(case_dir, "golden.bin"), dtype=case["dtype_out"]).reshape(shape_out) + output = np.fromfile(os.path.join(case_dir, "output.bin"), dtype=case["dtype_out"]).reshape(shape_out) + + ok = result_cmp(golden, output, case["eps"]) + if ok: + print(style_pass(f"[INFO] {case['name']}: compare passed")) + else: + print(style_fail(f"[ERROR] {case['name']}: compare failed")) + all_passed = False + + if not all_passed: + sys.exit(2) + print(style_pass("[INFO] all cases passed")) + + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/textract_v2v/gen_data.py b/test/tilelang_st/npu/a5/src/st/smoke/testcase/textract_v2v/gen_data.py new file mode 100644 index 000000000..231dbba10 --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/textract_v2v/gen_data.py @@ -0,0 +1,26 @@ +# Copyright (c) 2026 Huawei Technologies Co., Ltd. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. 
+ +# coding=utf-8 + +import numpy as np + +from cases import CASES +from st_common import setup_case_rng, save_case_data + + +for case in CASES: + setup_case_rng(case) + name = case["name"] + + if name.startswith("v2v"): + src = np.random.uniform(-1.0, 1.0, size=case["shape_src"]).astype(case["dtype_src"]) + golden = src.copy() + save_case_data(name, {"input1": src, "golden": golden}) + + print(f"[INFO] gen_data: {name} done") \ No newline at end of file diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/textract_v2v/launch.cpp b/test/tilelang_st/npu/a5/src/st/smoke/testcase/textract_v2v/launch.cpp new file mode 100644 index 000000000..e5fbbbd35 --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/textract_v2v/launch.cpp @@ -0,0 +1,19 @@ +// Copyright (c) 2026 Huawei Technologies Co., Ltd. +// This program is free software, you can redistribute it and/or modify it under the terms and conditions of +// CANN Open Software License Agreement Version 2.0 (the "License"). +// Please refer to the License for details. You may not use this file except in compliance with the License. +// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +// See LICENSE in the root of the software repository for the full text of the License. 
+ +#include + +#ifndef AICORE +#define AICORE [aicore] +#endif + +extern "C" __global__ AICORE void TEXTRACT_V2V_ND_f32_16x16(__gm__ float *src, __gm__ float *out); + +void LaunchTEXTRACT_V2V_ND_f32_16x16(float *src, float *out, void *stream) { + TEXTRACT_V2V_ND_f32_16x16<<<1, nullptr, stream>>>((__gm__ float *)src, (__gm__ float *)out); +} \ No newline at end of file diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/textract_v2v/main.cpp b/test/tilelang_st/npu/a5/src/st/smoke/testcase/textract_v2v/main.cpp new file mode 100644 index 000000000..6e1110296 --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/textract_v2v/main.cpp @@ -0,0 +1,128 @@ +// Copyright (c) 2026 Huawei Technologies Co., Ltd. +// This program is free software, you can redistribute it and/or modify it under the terms and conditions of +// CANN Open Software License Agreement Version 2.0 (the "License"). +// Please refer to the License for details. You may not use this file except in compliance with the License. +// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +// See LICENSE in the root of the software repository for the full text of the License. 
+ +#include "acl/acl.h" +#include "test_common.h" +#include +#include +#include +#include +#include + +using namespace PtoTestCommon; + +void LaunchTEXTRACT_V2V_ND_f32_16x16(float *src, float *out, void *stream); + +using LaunchFn = void (*)(float *, float *, void *); + +struct TestCase { + const char *name; + LaunchFn launch; + size_t srcRows; + size_t srcCols; + size_t outRows; + size_t outCols; +}; + +static const TestCase kCases[] = { + {"v2v_f32_16x16", LaunchTEXTRACT_V2V_ND_f32_16x16, 16, 16, 16, 16}, +}; +static constexpr size_t kNumCases = sizeof(kCases) / sizeof(kCases[0]); + +static int RunCase(const TestCase &tc, int deviceId, aclrtStream stream) { + (void)deviceId; + int rc = 0; + const size_t srcElems = tc.srcRows * tc.srcCols; + const size_t outElems = tc.outRows * tc.outCols; + const size_t srcBytes = srcElems * sizeof(float); + const size_t outBytes = outElems * sizeof(float); + size_t srcFileSize = srcBytes; + + std::printf( + "[INFO] === case: %s (src=%zux%zu, out=%zux%zu) ===\n", + tc.name, tc.srcRows, tc.srcCols, tc.outRows, tc.outCols + ); + + std::string caseDir = std::string("./") + tc.name; + + void *srcHost = nullptr; + void *outHost = nullptr; + void *srcDevice = nullptr; + void *outDevice = nullptr; + + aclrtMallocHost(&srcHost, srcBytes); + aclrtMallocHost(&outHost, outBytes); + + aclrtMalloc(&srcDevice, srcBytes, ACL_MEM_MALLOC_HUGE_FIRST); + aclrtMalloc(&outDevice, outBytes, ACL_MEM_MALLOC_HUGE_FIRST); + + if (!ReadFile((caseDir + "/input1.bin").c_str(), srcFileSize, srcHost, srcBytes)) { + std::fprintf(stderr, "[ERROR] failed to read %s/input1.bin\n", caseDir.c_str()); + rc = 1; + } + + if (rc == 0) { + aclrtMemcpy(srcDevice, srcBytes, srcHost, srcBytes, ACL_MEMCPY_HOST_TO_DEVICE); + + tc.launch( + static_cast(srcDevice), + static_cast(outDevice), + stream + ); + + aclrtSynchronizeStream(stream); + aclrtMemcpy(outHost, outBytes, outDevice, outBytes, ACL_MEMCPY_DEVICE_TO_HOST); + } + + if (rc == 0 && !WriteFile((caseDir + 
"/output.bin").c_str(), outHost, outBytes)) { + std::fprintf(stderr, "[ERROR] failed to write %s/output.bin\n", caseDir.c_str()); + rc = 1; + } + + if (srcDevice != nullptr) aclrtFree(srcDevice); + if (outDevice != nullptr) aclrtFree(outDevice); + if (srcHost != nullptr) aclrtFreeHost(srcHost); + if (outHost != nullptr) aclrtFreeHost(outHost); + + if (rc == 0) + std::printf("[INFO] case %s done\n", tc.name); + return rc; +} + +int main(int argc, char *argv[]) { + const char *caseFilter = (argc > 1) ? argv[1] : nullptr; + + int rc = 0; + int deviceId = 0; + aclrtStream stream = nullptr; + + aclInit(nullptr); + if (const char *envDevice = std::getenv("ACL_DEVICE_ID")) { + deviceId = std::atoi(envDevice); + } + aclrtSetDevice(deviceId); + aclrtCreateStream(&stream); + + for (size_t i = 0; i < kNumCases; ++i) { + if (caseFilter != nullptr && std::strcmp(kCases[i].name, caseFilter) != 0) { + continue; + } + int ret = RunCase(kCases[i], deviceId, stream); + if (ret != 0) { + std::fprintf(stderr, "[ERROR] case %s failed\n", kCases[i].name); + rc = 1; + break; + } + } + + if (stream != nullptr) aclrtDestroyStream(stream); + aclrtResetDevice(deviceId); + aclFinalize(); + + return rc; +} \ No newline at end of file diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/textract_v2v/textract_v2v.pto b/test/tilelang_st/npu/a5/src/st/smoke/testcase/textract_v2v/textract_v2v.pto new file mode 100644 index 000000000..4c950713c --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/textract_v2v/textract_v2v.pto @@ -0,0 +1,59 @@ +// Copyright (c) 2026 Huawei Technologies Co., Ltd. +// This program is free software, you can redistribute it and/or modify it under the terms and conditions of +// CANN Open Software License Agreement Version 2.0 (the "License"). +// Please refer to the License for details. You may not use this file except in compliance with the License. 
+// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +// See LICENSE in the root of the software repository for the full text of the License. + +// TileLang ST kernel for textract Vec->Vec ND (UB sub-window copy). +// +// kernel_kind=vector (pure vec path, no cube needed). +// Pipeline: tload(src) → textract(src_vec→dst_vec) → tstore(dst) + +module attributes {pto.target_arch = "a5", pto.kernel_kind = #pto.kernel_kind} { + + func.func @TEXTRACT_V2V_ND_f32_16x16(%src_ptr: !pto.ptr, %dst_ptr: !pto.ptr) attributes {pto.aicore} { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c16 = arith.constant 16 : index + %c256 = arith.constant 256 : index + %c0_i64 = arith.constant 0 : i64 + %c1024_i64 = arith.constant 1024 : i64 + + %src_view = pto.make_tensor_view %src_ptr, + shape = [%c1, %c1, %c1, %c16, %c16], + strides = [%c256, %c256, %c256, %c16, %c1] + : !pto.tensor_view<1x1x1x16x16xf32> + %dst_view = pto.make_tensor_view %dst_ptr, + shape = [%c1, %c1, %c1, %c16, %c16], + strides = [%c256, %c256, %c256, %c16, %c1] + : !pto.tensor_view<1x1x1x16x16xf32> + + %src_part = pto.partition_view %src_view, + offsets = [%c0, %c0, %c0, %c0, %c0], + sizes = [%c1, %c1, %c1, %c16, %c16] + : !pto.tensor_view<1x1x1x16x16xf32> -> !pto.partition_tensor_view<1x1x1x16x16xf32> + %dst_part = pto.partition_view %dst_view, + offsets = [%c0, %c0, %c0, %c0, %c0], + sizes = [%c1, %c1, %c1, %c16, %c16] + : !pto.tensor_view<1x1x1x16x16xf32> -> !pto.partition_tensor_view<1x1x1x16x16xf32> + + %src_vec = pto.alloc_tile addr = %c0_i64 + : !pto.tile_buf + %dst_vec = pto.alloc_tile addr = %c1024_i64 + : !pto.tile_buf + + pto.tload ins(%src_part : !pto.partition_tensor_view<1x1x1x16x16xf32>) + outs(%src_vec : !pto.tile_buf) + + pto.textract ins(%src_vec, %c0, %c0 : !pto.tile_buf, index, index) + outs(%dst_vec : !pto.tile_buf) + 
+ pto.tstore ins(%dst_vec : !pto.tile_buf) + outs(%dst_part : !pto.partition_tensor_view<1x1x1x16x16xf32>) + + return + } + +} \ No newline at end of file diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tfillpad/CMakeLists.txt b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tfillpad/CMakeLists.txt new file mode 100644 index 000000000..596922fa3 --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tfillpad/CMakeLists.txt @@ -0,0 +1,9 @@ +# Copyright (c) 2026 Huawei Technologies Co., Ltd. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. + +pto_tilelang_vec_st(tfillpad) diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tfillpad/cases.py b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tfillpad/cases.py new file mode 100644 index 000000000..0e04351dd --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tfillpad/cases.py @@ -0,0 +1,210 @@ +#!/usr/bin/python3 +# Copyright (c) 2026 Huawei Technologies Co., Ltd. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. 
+# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. + +# coding=utf-8 + +"""Single source of truth for tfillpad ST test cases. + +Matches C++ reference test cases exactly (Cases 1-13). + +PadValue semantics: + - Max: FLT_MAX for float, MAX for integers + - Min: -FLT_MAX for float, MIN for integers + - Null: no fill (keep original value) + - Custom(-1.0f): -1.0f for float, -1 for integers + +Each case defines: + - name: case identifier (must match main.cpp kCases[] and launch.cpp) + - dtype: numpy dtype + - shape: (rows, cols) — dst tile physical dimensions + - valid_shape: (valid_rows, valid_cols) — dst valid region (output size) + - src_shape: (rows, cols) — src tile physical dimensions (optional, default=dst) + - src_valid_shape: (valid_rows, valid_cols) — src valid region (optional, default=dst_valid) + - load_padval: PadValue for TLOAD (fill invalid columns in src tile) + - fill_padval: PadValue for TFILLPAD (fill expansion region in dst) + - eps: tolerance for numpy.allclose +""" + +import numpy as np + +# PadValue enum values matching C++ definition +PADVAL_MAX = "Max" # FLT_MAX for float, MAX for integers +PADVAL_MIN = "Min" # -FLT_MAX for float, MIN for integers +PADVAL_NULL = "Null" # no fill (keep original value, treated as 0 in golden) +PADVAL_ZERO = "Zero" # zero fill +PADVAL_NEG1 = "Neg1" # -1.0f for float, -1 for integers (Custom) + +CASES = [ + { + "name": "f32_64x16_pad_64x7", + "dtype": np.float32, + "shape": (64, 16), + "valid_shape": (64, 16), + "src_shape": (64, 7), + "src_valid_shape": (64, 7), + "load_padval": PADVAL_MIN, + "fill_padval": PADVAL_MAX, + "eps": 1e-6, + }, + # ========== Case 4: float, 260x7 -> 260x16, PadMin/Max ========== + # C++: runTFILLPAD + + { + "name": "f32_260x16_pad_260x7", + "dtype":
np.float32, + "shape": (260, 16), # dst tile physical + "valid_shape": (260, 16), # dst valid (output size) + "src_shape": (260, 7), # src tile physical + "src_valid_shape": (260, 7), # src valid + "load_padval": PADVAL_MIN, # TLOAD: fill cols 8-15 with -FLT_MAX (32B aligned tile) + "fill_padval": PADVAL_MAX, # TFILLPAD: golden fills cols 7-15 with FLT_MAX + "eps": 1e-6, + }, + # ========== Case 1: float, 128x127 -> 128x128, PadMax ========== + # C++: runTFILLPAD + + { + "name": "f32_128x128_pad_128x127", + "dtype": np.float32, + "shape": (128, 128), # dst tile physical + "valid_shape": (128, 128), # dst valid (output size) + "src_shape": (128, 127), # src tile physical (127 cols, < dst 128) + "src_valid_shape": (128, 127), # src valid = full src + "load_padval": PADVAL_MAX, # TLOAD: fill col 127 with FLT_MAX + "fill_padval": PADVAL_MAX, # TFILLPAD: no expansion needed + "eps": 1e-6, + }, + + # ========== Case 2: float, 128x127 -> 128x160, PadMax ========== + # C++: runTFILLPAD + + { + "name": "f32_128x160_pad_128x127", + "dtype": np.float32, + "shape": (128, 160), # dst tile physical + "valid_shape": (128, 160), # dst valid (output size) + "src_shape": (128, 127), # src tile physical + "src_valid_shape": (128, 127), # src valid + "load_padval": PADVAL_MAX, # TLOAD: fill col 127 with FLT_MAX + "fill_padval": PADVAL_MAX, # TFILLPAD: fill cols 128-159 with FLT_MAX + "eps": 1e-6, + }, + + # ========== Case 3: float, 128x127 -> 128x160, LoadPad=Min, FillPad=Max ========== + # C++: runTFILLPAD + + { + "name": "f32_128x160_pad_128x127_v2", + "dtype": np.float32, + "shape": (128, 160), # dst tile physical + "valid_shape": (128, 160), # dst valid (output size) + "src_shape": (128, 127), # src tile physical + "src_valid_shape": (128, 127), # src valid + "load_padval": PADVAL_MIN, # TLOAD: fill col 127 with -FLT_MAX + "fill_padval": PADVAL_MAX, # TFILLPAD: fill cols 128-159 with FLT_MAX + "eps": 1e-6, + }, + + # ========== Case 6: uint16, 260x7 -> 260x32, PadMin/Max ========== + # C++: runTFILLPAD + + { +
"name": "u16_260x32_pad_260x7", + "dtype": np.uint16, + "shape": (260, 32), # dst tile physical + "valid_shape": (260, 32), # dst valid (output size) + "src_shape": (260, 7), # src tile physical + "src_valid_shape": (260, 7), # src valid + "load_padval": PADVAL_MIN, # TLOAD: fill cols 8-31 with MIN (uint16 0) + "fill_padval": PADVAL_MAX, # TFILLPAD: fill cols 8-31 with MAX (uint16 65535) + "eps": 0, + }, + + # ========== Case 7: int8, 260x7 -> 260x64, PadMin/Max ========== + # C++: runTFILLPAD + + { + "name": "s8_260x64_pad_260x7", + "dtype": np.int8, + "shape": (260, 64), # dst tile physical + "valid_shape": (260, 64), # dst valid (output size) + "src_shape": (260, 7), # src tile physical + "src_valid_shape": (260, 7), # src valid + "load_padval": PADVAL_MIN, # TLOAD: fill cols 8-63 with MIN (int8 -128) + "fill_padval": PADVAL_MAX, # TFILLPAD: no expansion needed + "eps": 0, + }, + + # ========== Case 10: int16, 260x7 -> 260x32, PadMin/Min ========== + # C++: runTFILLPAD + + { + "name": "s16_260x32_pad_260x7", + "dtype": np.int16, + "shape": (260, 32), # dst tile physical + "valid_shape": (260, 32), # dst valid (output size) + "src_shape": (260, 7), # src tile physical + "src_valid_shape": (260, 7), # src valid + "load_padval": PADVAL_MIN, # TLOAD: fill cols 8-31 with MIN (int16 -32768) + "fill_padval": PADVAL_MIN, # TFILLPAD: no expansion needed + "eps": 0, + }, + + # ========== Case 11: int32, 260x7 -> 260x32, PadMin/Min ========== + # C++: runTFILLPAD + + { + "name": "s32_260x32_pad_260x7", + "dtype": np.int32, + "shape": (260, 32), # dst tile physical + "valid_shape": (260, 32), # dst valid (output size) + "src_shape": (260, 7), # src tile physical + "src_valid_shape": (260, 7), # src valid + "load_padval": PADVAL_MIN, # TLOAD: fill cols 8-31 with MIN (int32 -2147483648) + "fill_padval": PADVAL_MIN, # TFILLPAD: no expansion needed + "eps": 0, + }, + + # ========== Case 12: float, 128x64 -> 128x128, LoadPad=Null, FillPad=Neg1 ========== + # C++: runTFILLPAD + + 
{ + "name": "f32_128x128_pad_128x64_neg1", + "dtype": np.float32, + "shape": (128, 128), # dst tile physical + "valid_shape": (128, 128), # dst valid = full dst (output size) + "src_shape": (128, 64), # src tile physical (64 cols) + "src_valid_shape": (128, 64), # src valid = full src + "load_padval": PADVAL_NULL, # TLOAD: no fill (src cols 64 aligned to 32B) + "fill_padval": PADVAL_NEG1, # TFILLPAD: fill cols 64-127 with -1.0f + "eps": 1e-6, + }, + + # ========== Case 13: float, 128x127 -> 128x160, LoadPad=Neg1, FillPad=Neg1 ========== + # C++: runTFILLPAD + + { + "name": "f32_128x160_pad_128x127_neg1", + "dtype": np.float32, + "shape": (128, 160), # dst tile physical + "valid_shape": (128, 160), # dst valid = full dst (output size) - CHANGED! + "src_shape": (128, 127), # src tile physical (127 cols) + "src_valid_shape": (128, 127), # src valid = full src + "load_padval": PADVAL_NEG1, # TLOAD: fill col 127 with -1.0f (127 not 32B aligned) + "fill_padval": PADVAL_NEG1, # TFILLPAD: fill cols 128-159 with -1.0f + "eps": 1e-6, + }, +] + +_SMOKE_CASE_NAMES = ['f32_64x16_pad_64x7', 'f32_260x16_pad_260x7'] +_SMOKE_CASE_NAME_SET = set(_SMOKE_CASE_NAMES) +_missing = [name for name in _SMOKE_CASE_NAMES if name not in {case["name"] for case in CASES}] +if _missing: + raise RuntimeError("unknown smoke case(s): " + ", ".join(_missing)) +CASES = [case for case in CASES if case["name"] in _SMOKE_CASE_NAME_SET] diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tfillpad/compare.py b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tfillpad/compare.py new file mode 100644 index 000000000..70d68d303 --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tfillpad/compare.py @@ -0,0 +1,81 @@ +#!/usr/bin/python3 +# Copyright (c) 2026 Huawei Technologies Co., Ltd. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). 
+# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. + +# coding=utf-8 + +"""Compare output against golden for tfillpad test cases. + +For tfillpad: + - Input: full tile shape (rows x cols) + - Output: only valid region (valid_rows x valid_cols) + - Golden: valid region only +""" + +import os +import sys +import numpy as np + +from cases import CASES + + +def main(): + case_filter = sys.argv[1] if len(sys.argv) > 1 else None + + all_passed = True + for case in CASES: + if case_filter is not None and case["name"] != case_filter: + continue + + case_dir = case["name"] + dtype = case["dtype"] + valid_shape = case["valid_shape"] + eps = case["eps"] + + # Load golden and output (both stored with valid_shape) + golden = np.fromfile(os.path.join(case_dir, "golden.bin"), dtype=dtype).reshape(valid_shape) + output = np.fromfile(os.path.join(case_dir, "output.bin"), dtype=dtype).reshape(valid_shape) + + # For integer types, eps=0 means exact match + # For float types, use np.allclose with eps + if eps == 0: + # Integer comparison - exact match + if not np.array_equal(golden, output): + diff = golden - output + idx = int(np.argmax(np.abs(diff))) + print(f"[ERROR] {case['name']}: Mismatch at idx={idx} (golden={golden.flat[idx]}, output={output.flat[idx]})") + all_passed = False + else: + print(f"[INFO] {case['name']}: compare passed") + else: + # Float comparison - use allclose + # Convert to float64 for comparison (fp16 precision issues) + g = golden.astype(np.float64, copy=False) + o = output.astype(np.float64, copy=False) + + if g.shape != o.shape: + print(f"[ERROR] {case['name']}: Shape mismatch: golden {g.shape} vs output 
{o.shape}") + all_passed = False + continue + + if not np.allclose(g, o, atol=eps, rtol=eps, equal_nan=True): + abs_diff = np.abs(g - o) + idx = int(np.argmax(abs_diff)) + print(f"[ERROR] {case['name']}: Mismatch: max diff={float(abs_diff.flat[idx])} " + f"at idx={idx} (golden={g.flat[idx]}, output={o.flat[idx]})") + all_passed = False + else: + print(f"[INFO] {case['name']}: compare passed") + + if not all_passed: + sys.exit(2) + print("[INFO] all cases passed") + + +if __name__ == "__main__": + main() diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tfillpad/gen_data.py b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tfillpad/gen_data.py new file mode 100644 index 000000000..99b297b3c --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tfillpad/gen_data.py @@ -0,0 +1,117 @@ +#!/usr/bin/python3 +# Copyright (c) 2026 Huawei Technologies Co., Ltd. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. + +# coding=utf-8 + +"""Generate golden data for tfillpad test cases. + +TFILLPAD semantics: + 1. Copy src.valid_shape data to dst + 2. Fill cols from src.valid_cols to dst.cols with FillPadVal + 3. Fill rows from src.rows to dst.rows with FillPadVal + +Note: LoadPadVal is used by TLOAD only, TFILLPAD uses FillPadVal for expansion. 
+""" + +import os +import numpy as np +import struct + +from cases import CASES, PADVAL_MAX, PADVAL_MIN, PADVAL_NULL, PADVAL_ZERO, PADVAL_NEG1 + + +# FLT_MAX and -FLT_MAX (matching DSL PadValue.MAX/MIN) +def _float32_from_bits(bits: int) -> float: + return struct.unpack(">f", bits.to_bytes(4, byteorder="big", signed=False))[0] + +_FLT_MAX = _float32_from_bits(0x7F7FFFFF) # ~3.4028235e+38 +_FLT_MIN = _float32_from_bits(0xFF7FFFFF) # ~-3.4028235e+38 + + +def get_pad_value(dtype, padval_name): + """Get the actual pad value for a dtype based on PadValue enum. + + Matches DSL PadValue.materialize_scalar behavior: + - MAX: FLT_MAX for float (not inf), max for integers + - MIN: -FLT_MAX for float (not -inf), min for integers + - NEG1: -1.0 for float, -1 for integers + - NULL/ZERO: 0 + """ + if padval_name == PADVAL_MAX: + if np.issubdtype(dtype, np.floating): + return np.float32(_FLT_MAX) + else: + return np.iinfo(dtype).max + elif padval_name == PADVAL_MIN: + if np.issubdtype(dtype, np.floating): + return np.float32(_FLT_MIN) + else: + return np.iinfo(dtype).min + elif padval_name == PADVAL_NEG1: + if np.issubdtype(dtype, np.floating): + return np.float32(-1.0) + else: + return dtype(-1) + else: # PADVAL_NULL or PADVAL_ZERO + return dtype(0) + + +def setup_case_rng(case): + """Set a per-case deterministic random seed.""" + np.random.seed(hash(case["name"]) & 0xFFFFFFFF) + + +def save_case_data(case_name, data_dict): + """Create case directory and write {name}.bin for each entry.""" + os.makedirs(case_name, exist_ok=True) + for name, arr in data_dict.items(): + arr.tofile(os.path.join(case_name, f"{name}.bin")) + + +for case in CASES: + setup_case_rng(case) + + dtype = case["dtype"] + dst_shape = case["shape"] + dst_valid = case["valid_shape"] + src_shape = case.get("src_shape", dst_shape) + src_valid = case.get("src_valid_shape", dst_valid) + fill_padval = case.get("fill_padval", PADVAL_ZERO) + + # Input: generated with src_shape (matching C++ input size) + src_vr, 
src_vc = src_valid + input_data = np.zeros(src_shape, dtype=dtype) + input_data[:src_vr, :src_vc] = np.random.randint(1, 10, size=(src_vr, src_vc)).astype(dtype) + + # Golden: generated with dst_valid (output size) + dst_vr, dst_vc = dst_valid + golden = np.zeros(dst_valid, dtype=dtype) + + # Step 1: Copy src valid data to dst + copy_vr = min(src_vr, dst_vr) + copy_vc = min(src_vc, dst_vc) + golden[:copy_vr, :copy_vc] = input_data[:copy_vr, :copy_vc] + + # Step 2: TFILLPAD fills cols from src_valid_cols to dst_cols with FillPadVal + # (NOT LoadPadVal! TFILLPAD uses FillPadVal for expansion) + if dst_vc > src_vc: + fill_val = get_pad_value(dtype, fill_padval) + golden[:dst_vr, src_vc:dst_vc] = fill_val + + # Step 3: TFILLPAD fills rows from src_rows to dst_rows with FillPadVal + if dst_shape[0] > src_shape[0]: + fill_val = get_pad_value(dtype, fill_padval) + expand_rows_start = src_shape[0] + expand_rows_end = dst_vr + if expand_rows_end > expand_rows_start: + golden[expand_rows_start:expand_rows_end, :dst_vc] = fill_val + + save_case_data(case["name"], {"input": input_data, "golden": golden}) + print(f"[INFO] gen_data: {case['name']} input={src_shape} golden={dst_valid} " + f"fill_pad={fill_padval} dtype={dtype.__name__}") diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tfillpad/launch.cpp b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tfillpad/launch.cpp new file mode 100644 index 000000000..6c2eaa6a3 --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tfillpad/launch.cpp @@ -0,0 +1,28 @@ +// Copyright (c) 2026 Huawei Technologies Co., Ltd. +// This program is free software, you can redistribute it and/or modify it under the terms and conditions of +// CANN Open Software License Agreement Version 2.0 (the "License"). +// Please refer to the License for details. You may not use this file except in compliance with the License. 
+// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +// See LICENSE in the root of the software repository for the full text of the License. + +#include + +#ifndef AICORE +#define AICORE [aicore] +#endif + +// ========== Smoke case: float, 64x16, valid=64x7 ========== + +extern "C" __global__ AICORE void TFILLPAD_f32_64x16_pad_64x7(__gm__ float *src, __gm__ float *dst); +extern "C" __global__ AICORE void TFILLPAD_f32_260x16_pad_260x7(__gm__ float *src, __gm__ float *dst); + +void LaunchTFILLPAD_f32_64x16_pad_64x7(float *src, float *dst, void *stream) { + TFILLPAD_f32_64x16_pad_64x7<<<1, nullptr, stream>>>((__gm__ float *)src, (__gm__ float *)dst); +} + + + +void LaunchTFILLPAD_f32_260x16_pad_260x7(float *src, float *dst, void *stream) { + TFILLPAD_f32_260x16_pad_260x7<<<1, nullptr, stream>>>((__gm__ float *)src, (__gm__ float *)dst); +} diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tfillpad/main.cpp b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tfillpad/main.cpp new file mode 100644 index 000000000..83571a119 --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tfillpad/main.cpp @@ -0,0 +1,151 @@ +// Copyright (c) 2026 Huawei Technologies Co., Ltd. +// This program is free software, you can redistribute it and/or modify it under the terms and conditions of +// CANN Open Software License Agreement Version 2.0 (the "License"). +// Please refer to the License for details. You may not use this file except in compliance with the License. +// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +// See LICENSE in the root of the software repository for the full text of the License. 
+ +// Host driver for TileLang tfillpad ST (non-inplace mode). +// Matches C++ reference test cases: Cases 1, 2, 3, 4, 6, 7, 10, 11, 12, 13 +// Output size: dst valid region (dst tile physical shape for full output) + +#include "acl/acl.h" +#include "test_common.h" +#include +#include +#include +#include +#include +#include + +using namespace PtoTestCommon; + +// Kernel launch wrappers (defined in launch.cpp) +void LaunchTFILLPAD_f32_64x16_pad_64x7(float *src, float *dst, void *stream); +void LaunchTFILLPAD_f32_128x128_pad_128x127(float *src, float *dst, void *stream); +void LaunchTFILLPAD_f32_128x160_pad_128x127_v2(float *src, float *dst, void *stream); +void LaunchTFILLPAD_f32_260x16_pad_260x7(float *src, float *dst, void *stream); +void LaunchTFILLPAD_u16_260x32_pad_260x7(uint16_t *src, uint16_t *dst, void *stream); +void LaunchTFILLPAD_s16_260x32_pad_260x7(int16_t *src, int16_t *dst, void *stream); +void LaunchTFILLPAD_f32_128x128_pad_128x64_neg1(float *src, float *dst, void *stream); + +enum class DataType { F32, U16, S8, S16, S32 }; + +struct TestCase { + const char *name; + DataType dtype; + void (*launch)(void *, void *, void *); + size_t rows; // dst tile rows (physical) + size_t cols; // dst tile cols (physical) + size_t validRows; // dst valid rows (output rows) + size_t validCols; // dst valid cols (output cols) - CHANGED: now = dst physical cols for full output + size_t srcRows; // src tensor rows (0 means same as rows) + size_t srcCols; // src tensor cols (0 means same as cols) + size_t elemSize; +}; + +template +void wrapLaunch(void *src, void *dst, void *stream, void (*fn)(T *, T *, void *)) { + fn((T *)src, (T *)dst, stream); +} + +static const TestCase kCases[] = { +{"f32_64x16_pad_64x7", DataType::F32, + [](void *src, void *dst, void *stream) { wrapLaunch(src, dst, stream, LaunchTFILLPAD_f32_64x16_pad_64x7); }, + 64, 16, 64, 16, 64, 7, sizeof(float)}, +{"f32_260x16_pad_260x7", DataType::F32, + [](void *src, void *dst, void *stream) { 
wrapLaunch(src, dst, stream, LaunchTFILLPAD_f32_260x16_pad_260x7); }, + 260, 16, 260, 16, 260, 7, sizeof(float)}, +}; +static constexpr size_t kNumCases = sizeof(kCases) / sizeof(kCases[0]); + +static int RunCase(const TestCase &tc, int deviceId, aclrtStream stream) { + int rc = 0; + size_t srcRows = (tc.srcRows > 0) ? tc.srcRows : tc.rows; + size_t srcCols = (tc.srcCols > 0) ? tc.srcCols : tc.cols; + size_t inputElemCount = srcRows * srcCols; + size_t outputElemCount = tc.validRows * tc.validCols; + size_t inputFileSize = inputElemCount * tc.elemSize; + size_t outputFileSize = outputElemCount * tc.elemSize; + + std::printf("[INFO] === case: %s (src=%zux%zu, dst=%zux%zu, output=%zux%zu) ===\n", + tc.name, srcRows, srcCols, tc.rows, tc.cols, tc.validRows, tc.validCols); + + std::string caseDir = std::string("./") + tc.name; + + void *srcHost = nullptr, *dstHost = nullptr; + void *srcDevice = nullptr, *dstDevice = nullptr; + + aclrtMallocHost(&srcHost, inputFileSize); + aclrtMallocHost(&dstHost, outputFileSize); + + aclrtMalloc(&srcDevice, inputFileSize, ACL_MEM_MALLOC_HUGE_FIRST); + aclrtMalloc(&dstDevice, outputFileSize, ACL_MEM_MALLOC_HUGE_FIRST); + + if (!ReadFile((caseDir + "/input.bin").c_str(), inputFileSize, srcHost, inputFileSize)) { + std::fprintf(stderr, "[ERROR] failed to read %s/input.bin\n", caseDir.c_str()); + rc = 1; + } + + if (rc == 0) { + aclrtMemcpy(srcDevice, inputFileSize, srcHost, inputFileSize, ACL_MEMCPY_HOST_TO_DEVICE); + + tc.launch(srcDevice, dstDevice, stream); + + aclrtSynchronizeStream(stream); + aclrtMemcpy(dstHost, outputFileSize, dstDevice, outputFileSize, ACL_MEMCPY_DEVICE_TO_HOST); + } + + if (rc == 0 && !WriteFile((caseDir + "/output.bin").c_str(), dstHost, outputFileSize)) { + std::fprintf(stderr, "[ERROR] failed to write %s/output.bin\n", caseDir.c_str()); + rc = 1; + } + + if (srcDevice != nullptr) + aclrtFree(srcDevice); + if (dstDevice != nullptr) + aclrtFree(dstDevice); + if (srcHost != nullptr) + aclrtFreeHost(srcHost); + 
if (dstHost != nullptr) + aclrtFreeHost(dstHost); + + if (rc == 0) + std::printf("[INFO] case %s done\n", tc.name); + return rc; +} + +int main(int argc, char *argv[]) { + const char *caseFilter = (argc > 1) ? argv[1] : nullptr; + + int rc = 0; + int deviceId = 0; + aclrtStream stream = nullptr; + + aclInit(nullptr); + if (const char *envDevice = std::getenv("ACL_DEVICE_ID")) { + deviceId = std::atoi(envDevice); + } + aclrtSetDevice(deviceId); + aclrtCreateStream(&stream); + + for (size_t i = 0; i < kNumCases; ++i) { + if (caseFilter != nullptr && std::strcmp(kCases[i].name, caseFilter) != 0) { + continue; + } + int ret = RunCase(kCases[i], deviceId, stream); + if (ret != 0) { + std::fprintf(stderr, "[ERROR] case %s failed\n", kCases[i].name); + rc = 1; + break; + } + } + + if (stream != nullptr) + aclrtDestroyStream(stream); + aclrtResetDevice(deviceId); + aclFinalize(); + + return rc; +} diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tfillpad/tfillpad.pto b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tfillpad/tfillpad.pto new file mode 100644 index 000000000..635711274 --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tfillpad/tfillpad.pto @@ -0,0 +1,116 @@ +// Copyright (c) 2026 Huawei Technologies Co., Ltd. +// This program is free software, you can redistribute it and/or modify it under the terms and conditions of +// CANN Open Software License Agreement Version 2.0 (the "License"). +// Please refer to the License for details. You may not use this file except in compliance with the License. +// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +// See LICENSE in the root of the software repository for the full text of the License. + +// TileLang ST kernels for pto.tfillpad (non-inplace mode). 
+// Matches C++ reference test cases: Cases 1, 2, 3, 4, 6, 7, 10, 11, 12, 13 +// Compiled by ptoas --enable-insert-sync --enable-tile-op-expand --pto-backend=vpto +// +// PadValue encoding: 0=Null, 1=Zero, 2=Max, 3=Min +// Cases 12/13 use Custom(-1.0f) which cannot be encoded in PTO IR, +// template uses shape-based detection for these cases. +// +// C++ template params: shape3=src_rows, shape4=src_cols, kTRows_=dst_rows, kTCols_=dst_cols + +module attributes {pto.target_arch = "a5", pto.kernel_kind = #pto.kernel_kind} { + // ========== Smoke case: float, src=64x7, dst=64x16, LoadPad=Min, FillPad=Max ========== + + func.func @TFILLPAD_f32_64x16_pad_64x7(%src_ptr: !pto.ptr, %dst_ptr: !pto.ptr) attributes {pto.kernel} { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c7 = arith.constant 7 : index + %c16 = arith.constant 16 : index + %c64 = arith.constant 64 : index + %c448 = arith.constant 448 : index + %c1024 = arith.constant 1024 : index + + %src_view = pto.make_tensor_view %src_ptr, + shape = [%c1, %c1, %c1, %c64, %c7], + strides = [%c448, %c448, %c448, %c7, %c1] + : !pto.tensor_view<1x1x1x64x7xf32> + %dst_out_view = pto.make_tensor_view %dst_ptr, + shape = [%c1, %c1, %c1, %c64, %c16], + strides = [%c1024, %c1024, %c1024, %c16, %c1] + : !pto.tensor_view<1x1x1x64x16xf32> + + %src_part = pto.partition_view %src_view, + offsets = [%c0, %c0, %c0, %c0, %c0], + sizes = [%c1, %c1, %c1, %c64, %c7] + : !pto.tensor_view<1x1x1x64x7xf32> -> !pto.partition_tensor_view<1x1x1x64x7xf32> + %dst_out_part = pto.partition_view %dst_out_view, + offsets = [%c0, %c0, %c0, %c0, %c0], + sizes = [%c1, %c1, %c1, %c64, %c16] + : !pto.tensor_view<1x1x1x64x16xf32> -> !pto.partition_tensor_view<1x1x1x64x16xf32> + + %src = pto.alloc_tile + : !pto.tile_buf + %dst = pto.alloc_tile + : !pto.tile_buf + + pto.tload ins(%src_part : !pto.partition_tensor_view<1x1x1x64x7xf32>) + outs(%src : !pto.tile_buf) + + pto.tfillpad ins(%src : !pto.tile_buf) + outs(%dst : !pto.tile_buf) + + 
+ pto.tstore ins(%dst : !pto.tile_buf) + outs(%dst_out_part : !pto.partition_tensor_view<1x1x1x64x16xf32>) + return + } + + // ========== Case 4: float, src=260x7, dst=260x16, LoadPad=Min, FillPad=Max ========== + + + func.func @TFILLPAD_f32_260x16_pad_260x7(%src_ptr: !pto.ptr, %dst_ptr: !pto.ptr) attributes {pto.kernel} { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c7 = arith.constant 7 : index + %c16 = arith.constant 16 : index + %c260 = arith.constant 260 : index + %c1820 = arith.constant 1820 : index // 260*7 (src size) + %c4160 = arith.constant 4160 : index // 260*16 (dst size) + + // Src tensor_view: 260x7 + %src_view = pto.make_tensor_view %src_ptr, + shape = [%c1, %c1, %c1, %c260, %c7], + strides = [%c1820, %c1820, %c1820, %c7, %c1] + : !pto.tensor_view<1x1x1x260x7xf32> + // Dst tensor_view: 260x16 + %dst_out_view = pto.make_tensor_view %dst_ptr, + shape = [%c1, %c1, %c1, %c260, %c16], + strides = [%c4160, %c4160, %c4160, %c16, %c1] + : !pto.tensor_view<1x1x1x260x16xf32> + + %src_part = pto.partition_view %src_view, + offsets = [%c0, %c0, %c0, %c0, %c0], + sizes = [%c1, %c1, %c1, %c260, %c7] + : !pto.tensor_view<1x1x1x260x7xf32> -> !pto.partition_tensor_view<1x1x1x260x7xf32> + %dst_out_part = pto.partition_view %dst_out_view, + offsets = [%c0, %c0, %c0, %c0, %c0], + sizes = [%c1, %c1, %c1, %c260, %c16] + : !pto.tensor_view<1x1x1x260x16xf32> -> !pto.partition_tensor_view<1x1x1x260x16xf32> + + // Src tile: LoadPadVal=Min (pad=3), src physical=260x16, v_col=7 + %src = pto.alloc_tile + : !pto.tile_buf + // Dst tile: FillPadVal=Max (pad=2), dst physical=260x16, v_col=16 (full output) + %dst = pto.alloc_tile + : !pto.tile_buf + + pto.tload ins(%src_part : !pto.partition_tensor_view<1x1x1x260x7xf32>) + outs(%src : !pto.tile_buf) + + pto.tfillpad ins(%src : !pto.tile_buf) + outs(%dst : !pto.tile_buf) + + pto.tstore ins(%dst : !pto.tile_buf) + outs(%dst_out_part : !pto.partition_tensor_view<1x1x1x260x16xf32>) + return + } + + // ==========
Case 6: uint16, src=260x7, dst=260x32, LoadPad=Min, FillPad=Max ========== +} diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tfillpad_expand/CMakeLists.txt b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tfillpad_expand/CMakeLists.txt new file mode 100644 index 000000000..a4de40928 --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tfillpad_expand/CMakeLists.txt @@ -0,0 +1,9 @@ +# Copyright (c) 2026 Huawei Technologies Co., Ltd. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. + +pto_tilelang_vec_st(tfillpad_expand) diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tfillpad_expand/cases.py b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tfillpad_expand/cases.py new file mode 100644 index 000000000..93e7cc628 --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tfillpad_expand/cases.py @@ -0,0 +1,92 @@ +# Copyright (c) 2026 Huawei Technologies Co., Ltd. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. 
+# See LICENSE in the root of the software repository for the full text of the License. + +# coding=utf-8 + +"""Single source of truth for tfillpad_expand ST test cases. + +Matches C++ reference test cases: Cases 8, 9 + +C++ expand mode parameters: + - shape3: src physical rows + - shape4: src physical cols + - kTRows_: dst physical rows + - kTCols_: dst physical cols + - expand=true: TFILLPAD_EXPAND copies src valid data, fills expansion with FillPadVal + +Case 8: runTFILLPAD +Case 9: runTFILLPAD + +Each case defines: + - name: case identifier + - dtype: numpy dtype + - shape: (rows, cols) — src tile physical dimensions (input size) + - valid_shape: (valid_rows, valid_cols) — src valid region + - dst_shape: (rows, cols) — dst tile physical dimensions + - dst_valid_shape: (valid_rows, valid_cols) — dst valid region (output size) + - load_padval: PadValue for TLOAD (fill invalid columns in src tile) + - fill_padval: PadValue for TFILLPAD_EXPAND (fill expansion region in dst) + - eps: tolerance for numpy.allclose +""" + +import numpy as np + +# PadValue enum values matching C++ definition +PADVAL_MAX = "Max" # FLT_MAX for float, MAX for integers +PADVAL_MIN = "Min" # -FLT_MAX for float, MIN for integers +PADVAL_NULL = "Null" # no fill +PADVAL_ZERO = "Zero" # zero fill +PADVAL_NEG1 = "Neg1" # -1.0f for float, -1 for integers (Custom) + +CASES = [ + { + "name": "u16_64x16_src_63x7", + "dtype": np.uint16, + "shape": (63, 7), + "valid_shape": (63, 7), + "dst_shape": (64, 16), + "dst_valid_shape": (64, 16), + "load_padval": PADVAL_MIN, + "fill_padval": PADVAL_MAX, + "eps": 0, + }, + # ========== Case 1: uint16, src=259x7, dst=260x32, expand, LoadPad=Min, FillPad=Max ========== + + { + "name": "u16_260x32_src_259x7", + "dtype": np.uint16, + "shape": (259, 7), # src physical (C++ shape3=259, shape4=7) + "valid_shape": (259, 7), # src valid region (actual data) + "dst_shape": (260, 32), # dst physical + "dst_valid_shape": (260, 32), # dst valid (output size) + "load_padval": 
PADVAL_MIN, # TLOAD: fill cols 7-31 with MIN (uint16 MIN=0) + "fill_padval": PADVAL_MAX, # TFILLPAD_EXPAND: fill expansion region with MAX (uint16 MAX=65535) + "eps": 0, + }, + + # ========== Case 2: int8, src=259x7, dst=260x64, expand, LoadPad=Min, FillPad=Max ========== + + { + "name": "s8_260x64_src_259x7", + "dtype": np.int8, + "shape": (259, 7), # src physical (C++ shape3=259, shape4=7) + "valid_shape": (259, 7), # src valid region (actual data) + "dst_shape": (260, 64), # dst physical + "dst_valid_shape": (260, 64), # dst valid (output size) + "load_padval": PADVAL_MIN, # TLOAD: fill cols 7-63 with MIN (int8 MIN=-128) + "fill_padval": PADVAL_MAX, # TFILLPAD_EXPAND: fill expansion region with MAX (127) + "eps": 0, + }, +] + +_SMOKE_CASE_NAMES = ['u16_64x16_src_63x7', 'u16_260x32_src_259x7'] +_SMOKE_CASE_NAME_SET = set(_SMOKE_CASE_NAMES) +_missing = [name for name in _SMOKE_CASE_NAMES if name not in {case["name"] for case in CASES}] +if _missing: + raise RuntimeError("unknown smoke case(s): " + ", ".join(_missing)) +CASES = [case for case in CASES if case["name"] in _SMOKE_CASE_NAME_SET] diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tfillpad_expand/compare.py b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tfillpad_expand/compare.py new file mode 100644 index 000000000..160965d06 --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tfillpad_expand/compare.py @@ -0,0 +1,75 @@ +#!/usr/bin/python3 +# Copyright (c) 2026 Huawei Technologies Co., Ltd. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. 
+# See LICENSE in the root of the software repository for the full text of the License. + +# coding=utf-8 + +"""Compare output against golden for tfillpad_expand test cases.""" + +import os +import sys +import numpy as np + +from cases import CASES + + +def main(): + case_filter = sys.argv[1] if len(sys.argv) > 1 else None + + all_passed = True + for case in CASES: + if case_filter is not None and case["name"] != case_filter: + continue + + case_dir = case["name"] + dtype = case["dtype"] + dst_shape = case["dst_shape"] + eps = case["eps"] + + # Load golden and output (both stored with dst_shape) + golden = np.fromfile(os.path.join(case_dir, "golden.bin"), dtype=dtype).reshape(dst_shape) + output = np.fromfile(os.path.join(case_dir, "output.bin"), dtype=dtype).reshape(dst_shape) + + # For integer types, eps=0 means exact match + # For float types, use np.allclose with eps + if eps == 0: + # Integer comparison - exact match + if not np.array_equal(golden, output): + diff = golden - output + idx = int(np.argmax(np.abs(diff))) + print(f"[ERROR] {case['name']}: Mismatch at idx={idx} (golden={golden.flat[idx]}, output={output.flat[idx]})") + all_passed = False + else: + print(f"[INFO] {case['name']}: compare passed") + else: + # Float comparison - use allclose + # Convert to float64 for comparison (fp16 precision issues) + g = golden.astype(np.float64, copy=False) + o = output.astype(np.float64, copy=False) + + if g.shape != o.shape: + print(f"[ERROR] {case['name']}: Shape mismatch: golden {g.shape} vs output {o.shape}") + all_passed = False + continue + + if not np.allclose(g, o, atol=eps, rtol=eps, equal_nan=True): + abs_diff = np.abs(g - o) + idx = int(np.argmax(abs_diff)) + print(f"[ERROR] {case['name']}: Mismatch: max diff={float(abs_diff.flat[idx])} " + f"at idx={idx} (golden={g.flat[idx]}, output={o.flat[idx]})") + all_passed = False + else: + print(f"[INFO] {case['name']}: compare passed") + + if not all_passed: + sys.exit(2) + print("[INFO] all cases passed") + 
+ +if __name__ == "__main__": + main() diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tfillpad_expand/gen_data.py b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tfillpad_expand/gen_data.py new file mode 100644 index 000000000..12dd2528e --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tfillpad_expand/gen_data.py @@ -0,0 +1,114 @@ +# Copyright (c) 2026 Huawei Technologies Co., Ltd. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. + +# coding=utf-8 + +"""Generate golden data for tfillpad_expand test cases. + +TFILLPAD_EXPAND semantics: + 1. Copy src.valid_shape data to dst + 2. Fill cols from src.valid_cols to dst.valid_cols with FillPadVal + 3. Fill rows from src.rows to dst.rows with FillPadVal + +Note: LoadPadVal is used by TLOAD only, TFILLPAD_EXPAND uses FillPadVal for expansion. +""" + +import os +import numpy as np +import struct + +from cases import CASES, PADVAL_MAX, PADVAL_MIN, PADVAL_NEG1, PADVAL_ZERO + + +# FLT_MAX and -FLT_MAX (matching DSL PadValue.MAX/MIN) +def _float32_from_bits(bits: int) -> float: + return struct.unpack(">f", bits.to_bytes(4, byteorder="big", signed=False))[0] + +_FLT_MAX = _float32_from_bits(0x7F7FFFFF) # ~3.4028235e+38 +_FLT_MIN = _float32_from_bits(0xFF7FFFFF) # ~-3.4028235e+38 + + +def get_pad_value(dtype, padval_name): + """Get the actual pad value for a dtype based on PadValue enum. 
+ + Matches DSL PadValue.materialize_scalar behavior: + - MAX: FLT_MAX for float (not inf), max for integers + - MIN: -FLT_MAX for float (not -inf), min for integers + - NEG1: -1.0 for float, -1 for integers + - NULL/ZERO: 0 + """ + if padval_name == PADVAL_MAX: + if np.issubdtype(dtype, np.floating): + return np.float32(_FLT_MAX) + else: + return np.iinfo(dtype).max + elif padval_name == PADVAL_MIN: + if np.issubdtype(dtype, np.floating): + return np.float32(_FLT_MIN) + else: + return np.iinfo(dtype).min + elif padval_name == PADVAL_NEG1: + if np.issubdtype(dtype, np.floating): + return np.float32(-1.0) + else: + return dtype(-1) + else: # PADVAL_NULL or PADVAL_ZERO + return dtype(0) + + +def setup_case_rng(case): + """Set a per-case deterministic random seed.""" + np.random.seed(hash(case["name"]) & 0xFFFFFFFF) + + +def save_case_data(case_name, data_dict): + """Create case directory and write {name}.bin for each entry.""" + os.makedirs(case_name, exist_ok=True) + for name, arr in data_dict.items(): + arr.tofile(os.path.join(case_name, f"{name}.bin")) + + +for case in CASES: + setup_case_rng(case) + + dtype = case["dtype"] + src_shape = case["shape"] # src physical (input size, matching tensor_view) + src_valid = case["valid_shape"] # src valid region (actual data in input) + dst_shape = case["dst_shape"] # dst physical + dst_valid = case["dst_valid_shape"] # dst valid (output size) + fill_padval = case.get("fill_padval", PADVAL_ZERO) + + src_vr, src_vc = src_valid + dst_vr, dst_vc = dst_valid + + # Generate input: random values in src valid region, zeros elsewhere + # Input size = src_shape (matching tensor_view and C++ input) + input_data = np.zeros(src_shape, dtype=dtype) + input_data[:src_vr, :src_vc] = np.random.randint(1, 10, size=(src_vr, src_vc)).astype(dtype) + + # Generate golden: dst valid region (output size) + golden = np.zeros(dst_valid, dtype=dtype) + + # Step 1: Copy src valid data to dst + copy_vr = min(src_vr, dst_vr) + copy_vc = min(src_vc, 
dst_vc) + golden[:copy_vr, :copy_vc] = input_data[:copy_vr, :copy_vc] + + # Step 2: Fill column expansion region (cols from src_vc to dst_vc) + if dst_vc > src_vc: + fill_val = get_pad_value(dtype, fill_padval) + golden[:dst_vr, src_vc:dst_vc] = fill_val + + # Step 3: Fill row expansion region (rows from src_vr to dst_vr) + if dst_vr > src_vr: + fill_val = get_pad_value(dtype, fill_padval) + golden[src_vr:dst_vr, :dst_vc] = fill_val + + save_case_data(case["name"], {"input": input_data, "golden": golden}) + print(f"[INFO] gen_data: {case['name']} src={src_shape} valid={src_valid} -> dst={dst_shape} " + f"fill_pad={fill_padval} dtype={dtype.__name__}") diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tfillpad_expand/launch.cpp b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tfillpad_expand/launch.cpp new file mode 100644 index 000000000..f6d4a56b8 --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tfillpad_expand/launch.cpp @@ -0,0 +1,28 @@ +// Copyright (c) 2026 Huawei Technologies Co., Ltd. +// This program is free software, you can redistribute it and/or modify it under the terms and conditions of +// CANN Open Software License Agreement Version 2.0 (the "License"). +// Please refer to the License for details. You may not use this file except in compliance with the License. +// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +// See LICENSE in the root of the software repository for the full text of the License. 
+ +#include + +#ifndef AICORE +#define AICORE [aicore] +#endif + +// ========== Smoke uint16 kernel ========== + +extern "C" __global__ AICORE void TFILLPAD_EXPAND_u16_64x16_src_63x7(__gm__ uint16_t *src, __gm__ uint16_t *dst); +extern "C" __global__ AICORE void TFILLPAD_EXPAND_u16_260x32_src_259x7(__gm__ uint16_t *src, __gm__ uint16_t *dst); + +void LaunchTFILLPAD_EXPAND_u16_64x16_src_63x7(uint16_t *src, uint16_t *dst, void *stream) { + TFILLPAD_EXPAND_u16_64x16_src_63x7<<<1, nullptr, stream>>>((__gm__ uint16_t *)src, (__gm__ uint16_t *)dst); +} + + + +void LaunchTFILLPAD_EXPAND_u16_260x32_src_259x7(uint16_t *src, uint16_t *dst, void *stream) { + TFILLPAD_EXPAND_u16_260x32_src_259x7<<<1, nullptr, stream>>>((__gm__ uint16_t *)src, (__gm__ uint16_t *)dst); +} diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tfillpad_expand/main.cpp b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tfillpad_expand/main.cpp new file mode 100644 index 000000000..90537df1b --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tfillpad_expand/main.cpp @@ -0,0 +1,148 @@ +// Copyright (c) 2026 Huawei Technologies Co., Ltd. +// This program is free software, you can redistribute it and/or modify it under the terms and conditions of +// CANN Open Software License Agreement Version 2.0 (the "License"). +// Please refer to the License for details. You may not use this file except in compliance with the License. +// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +// See LICENSE in the root of the software repository for the full text of the License. + +// Host driver for TileLang tfillpad_expand ST — case-table driven. +// Each case launches a different kernel variant, reads/writes from per-case subdirectory. +// Numerical comparison is done externally by compare.py. 
+ +#include "acl/acl.h" +#include "test_common.h" +#include +#include +#include +#include +#include +#include + +using namespace PtoTestCommon; + +// Kernel launch wrappers (defined in launch.cpp) +void LaunchTFILLPAD_EXPAND_u16_64x16_src_63x7(uint16_t *src, uint16_t *dst, void *stream); +void LaunchTFILLPAD_EXPAND_u16_260x32_src_259x7(uint16_t *src, uint16_t *dst, void *stream); + +enum class DataType { U16, S8 }; + +struct TestCase { + const char *name; + DataType dtype; + void (*launch)(void *, void *, void *); // Generic launch function pointer + size_t srcRows; + size_t srcCols; + size_t srcValidRows; + size_t srcValidCols; + size_t dstRows; + size_t dstCols; + size_t dstValidRows; + size_t dstValidCols; + size_t elemSize; +}; + +// Helper to wrap type-specific launch functions +template +void wrapLaunch(void *src, void *dst, void *stream, void (*fn)(T *, T *, void *)) { + fn((T *)src, (T *)dst, stream); +} + +static const TestCase kCases[] = { +{"u16_64x16_src_63x7", DataType::U16, + [](void *src, void *dst, void *stream) { wrapLaunch(src, dst, stream, LaunchTFILLPAD_EXPAND_u16_64x16_src_63x7); }, + 64, 16, 63, 7, 64, 16, 64, 16, sizeof(uint16_t)}, +{"u16_260x32_src_259x7", DataType::U16, + [](void *src, void *dst, void *stream) { wrapLaunch(src, dst, stream, LaunchTFILLPAD_EXPAND_u16_260x32_src_259x7); }, + 260, 32, 259, 7, 260, 32, 260, 32, sizeof(uint16_t)}, +}; +static constexpr size_t kNumCases = sizeof(kCases) / sizeof(kCases[0]); + +static int RunCase(const TestCase &tc, int deviceId, aclrtStream stream) { + int rc = 0; + size_t srcElemCount = tc.srcRows * tc.srcCols; + size_t dstElemCount = tc.dstRows * tc.dstCols; + size_t srcFileSize = srcElemCount * tc.elemSize; + size_t dstFileSize = dstElemCount * tc.elemSize; + + std::printf("[INFO] === case: %s (src=%zux%zu valid=%zux%zu -> dst=%zux%zu) ===\n", + tc.name, tc.srcRows, tc.srcCols, tc.srcValidRows, tc.srcValidCols, tc.dstRows, tc.dstCols); + + std::string caseDir = std::string("./") + tc.name; + 
size_t inputFileSize = srcFileSize; + + void *srcHost = nullptr, *dstHost = nullptr; + void *srcDevice = nullptr, *dstDevice = nullptr; + + aclrtMallocHost(&srcHost, srcFileSize); + aclrtMallocHost(&dstHost, dstFileSize); + + aclrtMalloc(&srcDevice, srcFileSize, ACL_MEM_MALLOC_HUGE_FIRST); + aclrtMalloc(&dstDevice, dstFileSize, ACL_MEM_MALLOC_HUGE_FIRST); + + if (!ReadFile((caseDir + "/input.bin").c_str(), inputFileSize, srcHost, srcFileSize)) { + std::fprintf(stderr, "[ERROR] failed to read %s/input.bin\n", caseDir.c_str()); + rc = 1; + } + + if (rc == 0) { + aclrtMemcpy(srcDevice, srcFileSize, srcHost, srcFileSize, ACL_MEMCPY_HOST_TO_DEVICE); + + tc.launch(srcDevice, dstDevice, stream); + + aclrtSynchronizeStream(stream); + aclrtMemcpy(dstHost, dstFileSize, dstDevice, dstFileSize, ACL_MEMCPY_DEVICE_TO_HOST); + } + + if (rc == 0 && !WriteFile((caseDir + "/output.bin").c_str(), dstHost, dstFileSize)) { + std::fprintf(stderr, "[ERROR] failed to write %s/output.bin\n", caseDir.c_str()); + rc = 1; + } + + if (srcDevice != nullptr) + aclrtFree(srcDevice); + if (dstDevice != nullptr) + aclrtFree(dstDevice); + if (srcHost != nullptr) + aclrtFreeHost(srcHost); + if (dstHost != nullptr) + aclrtFreeHost(dstHost); + + if (rc == 0) + std::printf("[INFO] case %s done\n", tc.name); + return rc; +} + +int main(int argc, char *argv[]) { + const char *caseFilter = (argc > 1) ? 
argv[1] : nullptr; + + int rc = 0; + int deviceId = 0; + aclrtStream stream = nullptr; + + aclInit(nullptr); + if (const char *envDevice = std::getenv("ACL_DEVICE_ID")) { + deviceId = std::atoi(envDevice); + } + aclrtSetDevice(deviceId); + aclrtCreateStream(&stream); + + for (size_t i = 0; i < kNumCases; ++i) { + if (caseFilter != nullptr && std::strcmp(kCases[i].name, caseFilter) != 0) { + continue; + } + int ret = RunCase(kCases[i], deviceId, stream); + if (ret != 0) { + std::fprintf(stderr, "[ERROR] case %s failed\n", kCases[i].name); + rc = 1; + break; + } + } + + if (stream != nullptr) + aclrtDestroyStream(stream); + aclrtResetDevice(deviceId); + aclFinalize(); + + return rc; +} diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tfillpad_expand/tfillpad_expand.pto b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tfillpad_expand/tfillpad_expand.pto new file mode 100644 index 000000000..cb03e4aed --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tfillpad_expand/tfillpad_expand.pto @@ -0,0 +1,120 @@ +// Copyright (c) 2026 Huawei Technologies Co., Ltd. +// This program is free software, you can redistribute it and/or modify it under the terms and conditions of +// CANN Open Software License Agreement Version 2.0 (the "License"). +// Please refer to the License for details. You may not use this file except in compliance with the License. +// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +// See LICENSE in the root of the software repository for the full text of the License. + +// TileLang ST kernels for pto.tfillpad_expand: copy src to dst and fill padding. 
+// Matches C++ test cases: case 8, 9 +// Compiled by ptoas --enable-insert-sync --enable-tile-op-expand --pto-backend=vpto +// +// PadValue encoding: 0=Null, 1=Zero, 2=Max, 3=Min +// Case 8: uint16, LoadPad=Min(pad=3), FillPad=Max(pad=2) +// Case 9: int8, LoadPad=Min(pad=3), FillPad=Max(pad=2) +// +// C++ template params: shape3=src_rows, shape4=src_cols, kTRows_=dst_rows, kTCols_=dst_cols + +module attributes {pto.target_arch = "a5", pto.kernel_kind = #pto.kernel_kind} { + // ========== Smoke case: uint16, src=63x7, dst=64x16, LoadPad=Min, FillPad=Max ========== + + func.func @TFILLPAD_EXPAND_u16_64x16_src_63x7(%src_ptr: !pto.ptr, %dst_ptr: !pto.ptr) attributes {pto.kernel} { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c7 = arith.constant 7 : index + %c16 = arith.constant 16 : index + %c63 = arith.constant 63 : index + %c64 = arith.constant 64 : index + %c441 = arith.constant 441 : index + %c1024 = arith.constant 1024 : index + + %src_view = pto.make_tensor_view %src_ptr, + shape = [%c1, %c1, %c1, %c63, %c7], + strides = [%c441, %c441, %c441, %c7, %c1] + : !pto.tensor_view<1x1x1x63x7xui16> + + %dst_view = pto.make_tensor_view %dst_ptr, + shape = [%c1, %c1, %c1, %c64, %c16], + strides = [%c1024, %c1024, %c1024, %c16, %c1] + : !pto.tensor_view<1x1x1x64x16xui16> + + %src_part = pto.partition_view %src_view, + offsets = [%c0, %c0, %c0, %c0, %c0], + sizes = [%c1, %c1, %c1, %c63, %c7] + : !pto.tensor_view<1x1x1x63x7xui16> -> !pto.partition_tensor_view<1x1x1x63x7xui16> + %dst_part = pto.partition_view %dst_view, + offsets = [%c0, %c0, %c0, %c0, %c0], + sizes = [%c1, %c1, %c1, %c64, %c16] + : !pto.tensor_view<1x1x1x64x16xui16> -> !pto.partition_tensor_view<1x1x1x64x16xui16> + + %src = pto.alloc_tile + : !pto.tile_buf + %dst = pto.alloc_tile + : !pto.tile_buf + + pto.tload ins(%src_part : !pto.partition_tensor_view<1x1x1x63x7xui16>) + outs(%src : !pto.tile_buf) + + pto.tfillpad_expand ins(%src : !pto.tile_buf) + outs(%dst : !pto.tile_buf) + + 
pto.tstore ins(%dst : !pto.tile_buf) + outs(%dst_part : !pto.partition_tensor_view<1x1x1x64x16xui16>) + return + } + + // ========== Case 8: uint16, src=259x7, dst=260x32, LoadPad=Min, FillPad=Max ========== + + + func.func @TFILLPAD_EXPAND_u16_260x32_src_259x7(%src_ptr: !pto.ptr, %dst_ptr: !pto.ptr) attributes {pto.kernel} { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c7 = arith.constant 7 : index + %c32 = arith.constant 32 : index + %c259 = arith.constant 259 : index + %c260 = arith.constant 260 : index + %c1813 = arith.constant 1813 : index // 259*7 (src size) + %c8320 = arith.constant 8320 : index // 260*32 (dst size) + + // Src tensor_view: 259x7 + %src_view = pto.make_tensor_view %src_ptr, + shape = [%c1, %c1, %c1, %c259, %c7], + strides = [%c1813, %c1813, %c1813, %c7, %c1] + : !pto.tensor_view<1x1x1x259x7xui16> + + // Dst tensor_view: 260x32 + %dst_view = pto.make_tensor_view %dst_ptr, + shape = [%c1, %c1, %c1, %c260, %c32], + strides = [%c8320, %c8320, %c8320, %c32, %c1] + : !pto.tensor_view<1x1x1x260x32xui16> + + %src_part = pto.partition_view %src_view, + offsets = [%c0, %c0, %c0, %c0, %c0], + sizes = [%c1, %c1, %c1, %c259, %c7] + : !pto.tensor_view<1x1x1x259x7xui16> -> !pto.partition_tensor_view<1x1x1x259x7xui16> + %dst_part = pto.partition_view %dst_view, + offsets = [%c0, %c0, %c0, %c0, %c0], + sizes = [%c1, %c1, %c1, %c260, %c32] + : !pto.tensor_view<1x1x1x260x32xui16> -> !pto.partition_tensor_view<1x1x1x260x32xui16> + + // Src tile: LoadPadVal=Min (pad=3), src physical=260x32, v_row=259, v_col=7 + %src = pto.alloc_tile + : !pto.tile_buf + // Dst tile: FillPadVal=Max (pad=2), dst physical=260x32, v_row=260, v_col=32 (full output) + %dst = pto.alloc_tile + : !pto.tile_buf + + pto.tload ins(%src_part : !pto.partition_tensor_view<1x1x1x259x7xui16>) + outs(%src : !pto.tile_buf) + + pto.tfillpad_expand ins(%src : !pto.tile_buf) + outs(%dst : !pto.tile_buf) + + pto.tstore ins(%dst : !pto.tile_buf) + outs(%dst_part : 
!pto.partition_tensor_view<1x1x1x260x32xui16>) + return + } + + // ========== Case 9: int8, src=259x7, dst=260x64, LoadPad=Min, FillPad=Max ========== +} diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tfillpad_inplace/CMakeLists.txt b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tfillpad_inplace/CMakeLists.txt new file mode 100644 index 000000000..22c7fe7e7 --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tfillpad_inplace/CMakeLists.txt @@ -0,0 +1,9 @@ +# Copyright (c) 2026 Huawei Technologies Co., Ltd. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. + +pto_tilelang_vec_st(tfillpad_inplace PTO_LEVEL level3) diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tfillpad_inplace/cases.py b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tfillpad_inplace/cases.py new file mode 100644 index 000000000..60e45f45c --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tfillpad_inplace/cases.py @@ -0,0 +1,55 @@ +# Copyright (c) 2026 Huawei Technologies Co., Ltd. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. 
+# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. + +# coding=utf-8 + +"""Single source of truth for tfillpad_inplace ST test cases. + +Matches C++ reference test case: Case 5 + +Each case defines: + - name: case identifier + - dtype: numpy dtype + - src_shape / src_valid: (rows, cols) of the src tile buffer / of its valid region + - dst_shape / dst_valid: (rows, cols) of the dst tile buffer / of its valid region + - fill_padval / eps: pad value name used for any expansion / tolerance for numpy.allclose +""" + +import numpy as np + +CASES = [ + { + "name": "f32_64x16_noexpand", + "dtype": np.float32, + "src_shape": (64, 16), + "src_valid": (64, 16), + "dst_shape": (64, 16), + "dst_valid": (64, 16), + "fill_padval": "Max", + "eps": 1e-6, + }, + # ========== Case: float, src_valid == dst_valid (no expansion) ========== + + { + "name": "f32_260x16_noexpand", + "dtype": np.float32, + "src_shape": (260, 16), # src physical + "src_valid": (260, 16), # src valid = dst valid (no expansion) + "dst_shape": (260, 16), # dst physical + "dst_valid": (260, 16), # dst valid = full output + "fill_padval": "Max", # FillPadVal (not used since no expansion) + "eps": 1e-6, + }, +] + +_SMOKE_CASE_NAMES = ['f32_64x16_noexpand', 'f32_260x16_noexpand'] +_SMOKE_CASE_NAME_SET = set(_SMOKE_CASE_NAMES) +_missing = [name for name in _SMOKE_CASE_NAMES if name not in {case["name"] for case in CASES}] +if _missing: + raise RuntimeError("unknown smoke case(s): " + ", ".join(_missing)) +CASES = [case for case in CASES if case["name"] in _SMOKE_CASE_NAME_SET] diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tfillpad_inplace/compare.py b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tfillpad_inplace/compare.py new file mode 100644 index 000000000..af501160d --- /dev/null +++
b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tfillpad_inplace/compare.py @@ -0,0 +1,80 @@ +# Copyright (c) 2026 Huawei Technologies Co., Ltd. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. + +# coding=utf-8 + +"""Compare output against golden for tfillpad_inplace test cases. + +For tfillpad_inplace: + - Input: full tile shape (rows x cols) + - Output: full tile shape (rows x cols) after inplace fill + - Golden: full tile shape +""" + +import os +import sys +import numpy as np + +from cases import CASES + + +def main(): + case_filter = sys.argv[1] if len(sys.argv) > 1 else None + + all_passed = True + for case in CASES: + if case_filter is not None and case["name"] != case_filter: + continue + + case_dir = case["name"] + dtype = case["dtype"] + dst_shape = case["dst_shape"] + eps = case["eps"] + + # Load golden and output (both stored with dst_shape) + golden = np.fromfile(os.path.join(case_dir, "golden.bin"), dtype=dtype).reshape(dst_shape) + output = np.fromfile(os.path.join(case_dir, "output.bin"), dtype=dtype).reshape(dst_shape) + + # For integer types, eps=0 means exact match + # For float types, use np.allclose with eps + if eps == 0: + # Integer comparison - exact match + if not np.array_equal(golden, output): + diff = golden - output + idx = int(np.argmax(np.abs(diff))) + print(f"[ERROR] {case['name']}: Mismatch at idx={idx} (golden={golden.flat[idx]}, output={output.flat[idx]})") + all_passed = False + else: + 
print(f"[INFO] {case['name']}: compare passed") + else: + # Float comparison - use allclose + # Convert to float64 for comparison (fp16 precision issues) + g = golden.astype(np.float64, copy=False) + o = output.astype(np.float64, copy=False) + + if g.shape != o.shape: + print(f"[ERROR] {case['name']}: Shape mismatch: golden {g.shape} vs output {o.shape}") + all_passed = False + continue + + if not np.allclose(g, o, atol=eps, rtol=eps, equal_nan=True): + abs_diff = np.abs(g - o) + idx = int(np.argmax(abs_diff)) + print(f"[ERROR] {case['name']}: Mismatch: max diff={float(abs_diff.flat[idx])} " + f"at idx={idx} (golden={g.flat[idx]}, output={o.flat[idx]})") + all_passed = False + else: + print(f"[INFO] {case['name']}: compare passed") + + if not all_passed: + sys.exit(2) + print("[INFO] all cases passed") + + +if __name__ == "__main__": + main() diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tfillpad_inplace/gen_data.py b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tfillpad_inplace/gen_data.py new file mode 100644 index 000000000..5dc09477a --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tfillpad_inplace/gen_data.py @@ -0,0 +1,99 @@ +# Copyright (c) 2026 Huawei Technologies Co., Ltd. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. + +# coding=utf-8 + +"""Generate golden data for tfillpad_inplace test cases. 
+ +For tfillpad_inplace: + - Only one tile, valid_shape smaller than tile shape + - Input: full tile shape (rows x cols), random values in valid region, zeros in padding + - Golden: full tile shape with valid region copied and padding filled with MAX (PadValue.Max) +""" + +import os +import numpy as np +import struct + +from cases import CASES + +# FLT_MAX for float (matching DSL PadValue.MAX) +def _float32_from_bits(bits: int) -> float: + return struct.unpack(">f", bits.to_bytes(4, byteorder="big", signed=False))[0] + +_FLT_MAX = _float32_from_bits(0x7F7FFFFF) # ~3.4028235e+38 + + +def get_pad_value(dtype, padval_name): + """Get the actual pad value for a dtype based on PadValue enum.""" + if padval_name == "Max": + if np.issubdtype(dtype, np.floating): + return np.float32(_FLT_MAX) + else: + return np.iinfo(dtype).max + elif padval_name == "Min": + if np.issubdtype(dtype, np.floating): + return np.float32(-_FLT_MAX) + else: + return np.iinfo(dtype).min + elif padval_name == "Zero": + return dtype(0) + else: + return dtype(0) + + +def setup_case_rng(case): + """Set a per-case deterministic random seed.""" + np.random.seed(hash(case["name"]) & 0xFFFFFFFF) + + +def save_case_data(case_name, data_dict): + """Create case directory and write {name}.bin for each entry.""" + os.makedirs(case_name, exist_ok=True) + for name, arr in data_dict.items(): + arr.tofile(os.path.join(case_name, f"{name}.bin")) + + +for case in CASES: + setup_case_rng(case) + + dtype = case["dtype"] + src_shape = case["src_shape"] + src_valid = case["src_valid"] + dst_shape = case["dst_shape"] + dst_valid = case["dst_valid"] + fill_padval = case.get("fill_padval", "Max") + + src_vr, src_vc = src_valid + dst_r, dst_c = dst_shape + dst_vr, dst_vc = dst_valid + + # Input: src valid region data (random values) + input_data = np.random.uniform(1.0, 10.0, size=(src_vr, src_vc)).astype(dtype) + + # Golden: dst full region + # Copy src.valid region to dst[:src_vr, :src_vc] + # Fill cols src_vc to dst_vc 
with FillPadVal + # Fill rows src_vr to dst_vr with FillPadVal (row expansion, if any) + golden = np.zeros(dst_shape, dtype=dtype) + golden[:src_vr, :src_vc] = input_data + + # Fill column padding (cols src_vc to dst_vc) + if dst_vc > src_vc: + fill_val = get_pad_value(dtype, fill_padval) + golden[:dst_vr, src_vc:dst_vc] = fill_val + + # Fill row padding (rows src_vr to dst_vr) + if dst_vr > src_vr: + fill_val = get_pad_value(dtype, fill_padval) + golden[src_vr:dst_vr, :dst_vc] = fill_val + + save_case_data(case["name"], {"input": input_data, "golden": golden}) + print(f"[INFO] gen_data: {case['name']} " + f"src_valid={src_valid} dst_shape={dst_shape} " + f"fill_pad={fill_padval} dtype={dtype.__name__}") diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tfillpad_inplace/launch.cpp b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tfillpad_inplace/launch.cpp new file mode 100644 index 000000000..42a9ff25f --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tfillpad_inplace/launch.cpp @@ -0,0 +1,30 @@ +// Copyright (c) 2026 Huawei Technologies Co., Ltd. +// This program is free software, you can redistribute it and/or modify it under the terms and conditions of +// CANN Open Software License Agreement Version 2.0 (the "License"). +// Please refer to the License for details. You may not use this file except in compliance with the License. +// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +// See LICENSE in the root of the software repository for the full text of the License. 
+ +#include + +#ifndef AICORE +#define AICORE [aicore] +#endif + +// ========== Smoke case: float, 64x16, no expansion (inplace single buffer) ========== + +extern "C" __global__ AICORE void TFILLPAD_INPLACE_f32_64x16_noexpand(__gm__ float *buf); +extern "C" __global__ AICORE void TFILLPAD_INPLACE_f32_260x16_noexpand(__gm__ float *buf); + +void LaunchTFILLPAD_INPLACE_f32_260x16_noexpand(float *buf, float *dummy, void *stream) { + // Inplace kernel: single buffer, src == dst physically + // dummy parameter ignored, only buf is used + TFILLPAD_INPLACE_f32_260x16_noexpand<<<1, nullptr, stream>>>((__gm__ float *)buf); +} + + +void LaunchTFILLPAD_INPLACE_f32_64x16_noexpand(float *buf, float *dummy, void *stream) { + (void)dummy; + TFILLPAD_INPLACE_f32_64x16_noexpand<<<1, nullptr, stream>>>((__gm__ float *)buf); +} diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tfillpad_inplace/main.cpp b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tfillpad_inplace/main.cpp new file mode 100644 index 000000000..d42917186 --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tfillpad_inplace/main.cpp @@ -0,0 +1,134 @@ +// Copyright (c) 2026 Huawei Technologies Co., Ltd. +// This program is free software, you can redistribute it and/or modify it under the terms and conditions of +// CANN Open Software License Agreement Version 2.0 (the "License"). +// Please refer to the License for details. You may not use this file except in compliance with the License. +// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +// See LICENSE in the root of the software repository for the full text of the License. + +// Host driver for TileLang tfillpad_inplace ST. 
+// Matches C++ reference test case: Case 5 + +#include "acl/acl.h" +#include "test_common.h" +#include +#include +#include +#include +#include +#include + +using namespace PtoTestCommon; + +// Kernel launch wrapper (defined in launch.cpp) +// Inplace kernel takes single buffer pointer +void LaunchTFILLPAD_INPLACE_f32_64x16_noexpand(float *buf, float *dummy, void *stream); +void LaunchTFILLPAD_INPLACE_f32_260x16_noexpand(float *buf, float *dummy, void *stream); + +enum class DataType { F32 }; + +struct TestCase { + const char *name; + DataType dtype; + void (*launch)(float *, float *, void *); + size_t rows; + size_t cols; + size_t validRows; + size_t validCols; + size_t elemSize; +}; + +static const TestCase kCases[] = { +{"f32_64x16_noexpand", DataType::F32, + LaunchTFILLPAD_INPLACE_f32_64x16_noexpand, + 64, 16, 64, 16, sizeof(float)}, +{"f32_260x16_noexpand", DataType::F32, + LaunchTFILLPAD_INPLACE_f32_260x16_noexpand, + 260, 16, 260, 16, sizeof(float)}, +}; +static constexpr size_t kNumCases = sizeof(kCases) / sizeof(kCases[0]); + +static int RunCase(const TestCase &tc, int deviceId, aclrtStream stream) { + int rc = 0; + size_t elemCount = tc.rows * tc.cols; + size_t fileSize = elemCount * tc.elemSize; + + std::printf("[INFO] === case: %s (%zux%zu, inplace) ===\n", + tc.name, tc.validRows, tc.validCols); + + std::string caseDir = std::string("./") + tc.name; + + // Single buffer for inplace operation + void *bufHost = nullptr; + void *bufDevice = nullptr; + + aclrtMallocHost(&bufHost, fileSize); + aclrtMalloc(&bufDevice, fileSize, ACL_MEM_MALLOC_HUGE_FIRST); + + // Load input data into the single buffer + if (!ReadFile((caseDir + "/input.bin").c_str(), fileSize, bufHost, fileSize)) { + std::fprintf(stderr, "[ERROR] failed to read %s/input.bin\n", caseDir.c_str()); + rc = 1; + } + + if (rc == 0) { + // Copy input to device buffer + aclrtMemcpy(bufDevice, fileSize, bufHost, fileSize, ACL_MEMCPY_HOST_TO_DEVICE); + + // Run inplace kernel (src == dst = bufDevice) + 
// Note: launch wrapper takes two args but inplace kernel uses same physical address + tc.launch((float *)bufDevice, (float *)bufDevice, stream); + + aclrtSynchronizeStream(stream); + // Copy result back (same buffer contains output) + aclrtMemcpy(bufHost, fileSize, bufDevice, fileSize, ACL_MEMCPY_DEVICE_TO_HOST); + } + + if (rc == 0 && !WriteFile((caseDir + "/output.bin").c_str(), bufHost, fileSize)) { + std::fprintf(stderr, "[ERROR] failed to write %s/output.bin\n", caseDir.c_str()); + rc = 1; + } + + if (bufDevice != nullptr) + aclrtFree(bufDevice); + if (bufHost != nullptr) + aclrtFreeHost(bufHost); + + if (rc == 0) + std::printf("[INFO] case %s done\n", tc.name); + return rc; +} + +int main(int argc, char *argv[]) { + const char *caseFilter = (argc > 1) ? argv[1] : nullptr; + + int rc = 0; + int deviceId = 0; + aclrtStream stream = nullptr; + + aclInit(nullptr); + if (const char *envDevice = std::getenv("ACL_DEVICE_ID")) { + deviceId = std::atoi(envDevice); + } + aclrtSetDevice(deviceId); + aclrtCreateStream(&stream); + + for (size_t i = 0; i < kNumCases; ++i) { + if (caseFilter != nullptr && std::strcmp(kCases[i].name, caseFilter) != 0) { + continue; + } + int ret = RunCase(kCases[i], deviceId, stream); + if (ret != 0) { + std::fprintf(stderr, "[ERROR] case %s failed\n", kCases[i].name); + rc = 1; + break; + } + } + + if (stream != nullptr) + aclrtDestroyStream(stream); + aclrtResetDevice(deviceId); + aclFinalize(); + + return rc; +} diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tfillpad_inplace/tfillpad_inplace.pto b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tfillpad_inplace/tfillpad_inplace.pto new file mode 100644 index 000000000..0d80ee559 --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tfillpad_inplace/tfillpad_inplace.pto @@ -0,0 +1,115 @@ +// Copyright (c) 2026 Huawei Technologies Co., Ltd. 
+// This program is free software, you can redistribute it and/or modify it under the terms and conditions of +// CANN Open Software License Agreement Version 2.0 (the "License"). +// Please refer to the License for details. You may not use this file except in compliance with the License. +// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +// See LICENSE in the root of the software repository for the full text of the License. + +// TileLang ST kernels for pto.tfillpad (inplace mode). +// Matches C++ reference test case: Case 5 +// Compiled by ptoas --enable-insert-sync --enable-tile-op-expand --pto-backend=vpto +// +// PadValue encoding: 0=Null, 1=Zero, 2=Max, 3=Min +// Case 5: float, 260x16, valid=260x7, FillPad=Max (pad=2) +// +// Note: PTOAS tstore requires dst size to match src valid_shape. +// For outputting full buffer after inplace fill, we use two tiles: +// - src tile: holds input data (valid=260x7) +// - dst tile: receives filled data (valid=260x16 for output) + +module attributes {pto.target_arch = "a5", pto.kernel_kind = #pto.kernel_kind} { + // ========== Smoke case: float, 64x16 physical, src_valid == dst_valid ========== + + func.func @TFILLPAD_INPLACE_f32_64x16_noexpand(%tile_ptr: !pto.ptr) attributes {pto.kernel} { + %c0 = arith.constant 0 : index + %c0_i64 = arith.constant 0 : i64 + %c1 = arith.constant 1 : index + %c16 = arith.constant 16 : index + %c64 = arith.constant 64 : index + %c1024 = arith.constant 1024 : index + + %src_view = pto.make_tensor_view %tile_ptr, + shape = [%c1, %c1, %c1, %c64, %c16], + strides = [%c1024, %c1024, %c1024, %c16, %c1] + : !pto.tensor_view<1x1x1x64x16xf32> + + %dst_view = pto.make_tensor_view %tile_ptr, + shape = [%c1, %c1, %c1, %c64, %c16], + strides = [%c1024, %c1024, %c1024, %c16, %c1] + : !pto.tensor_view<1x1x1x64x16xf32> + + %src_part = pto.partition_view
%src_view, + offsets = [%c0, %c0, %c0, %c0, %c0], + sizes = [%c1, %c1, %c1, %c64, %c16] + : !pto.tensor_view<1x1x1x64x16xf32> -> !pto.partition_tensor_view<1x1x1x64x16xf32> + %dst_part = pto.partition_view %dst_view, + offsets = [%c0, %c0, %c0, %c0, %c0], + sizes = [%c1, %c1, %c1, %c64, %c16] + : !pto.tensor_view<1x1x1x64x16xf32> -> !pto.partition_tensor_view<1x1x1x64x16xf32> + + %tile_buf = pto.alloc_tile addr = %c0_i64 + : !pto.tile_buf + + pto.tload ins(%src_part : !pto.partition_tensor_view<1x1x1x64x16xf32>) + outs(%tile_buf : !pto.tile_buf) + + pto.tfillpad_inplace ins(%tile_buf : !pto.tile_buf) + outs(%tile_buf : !pto.tile_buf) + + pto.tstore ins(%tile_buf : !pto.tile_buf) + outs(%dst_part : !pto.partition_tensor_view<1x1x1x64x16xf32>) + return + } + + // ========== No expansion: float, 260x16 physical, src_valid == dst_valid ========== + + + func.func @TFILLPAD_INPLACE_f32_260x16_noexpand(%tile_ptr: !pto.ptr) attributes {pto.kernel} { + %c0 = arith.constant 0 : index + %c0_i64 = arith.constant 0 : i64 + %c1 = arith.constant 1 : index + %c16 = arith.constant 16 : index + %c260 = arith.constant 260 : index + %c4160 = arith.constant 4160 : index // 260*16 (full tile size) + + // Input tensor_view: 260x16 + %src_view = pto.make_tensor_view %tile_ptr, + shape = [%c1, %c1, %c1, %c260, %c16], + strides = [%c4160, %c4160, %c4160, %c16, %c1] + : !pto.tensor_view<1x1x1x260x16xf32> + + // Output tensor_view: 260x16 (same as input) + %dst_view = pto.make_tensor_view %tile_ptr, + shape = [%c1, %c1, %c1, %c260, %c16], + strides = [%c4160, %c4160, %c4160, %c16, %c1] + : !pto.tensor_view<1x1x1x260x16xf32> + + %src_part = pto.partition_view %src_view, + offsets = [%c0, %c0, %c0, %c0, %c0], + sizes = [%c1, %c1, %c1, %c260, %c16] + : !pto.tensor_view<1x1x1x260x16xf32> -> !pto.partition_tensor_view<1x1x1x260x16xf32> + %dst_part = pto.partition_view %dst_view, + offsets = [%c0, %c0, %c0, %c0, %c0], + sizes = [%c1, %c1, %c1, %c260, %c16] + : !pto.tensor_view<1x1x1x260x16xf32> -> 
!pto.partition_tensor_view<1x1x1x260x16xf32> + + // Single tile buffer in UB space at address 0 + // src_valid = dst_valid = 260x16, so no expansion needed + %tile_buf = pto.alloc_tile addr = %c0_i64 + : !pto.tile_buf + + // Load full tile (260x16) + pto.tload ins(%src_part : !pto.partition_tensor_view<1x1x1x260x16xf32>) + outs(%tile_buf : !pto.tile_buf) + + // tfillpad_inplace: src_valid == dst_valid, no expansion + pto.tfillpad_inplace ins(%tile_buf : !pto.tile_buf) + outs(%tile_buf : !pto.tile_buf) + + // Store full tile + pto.tstore ins(%tile_buf : !pto.tile_buf) + outs(%dst_part : !pto.partition_tensor_view<1x1x1x260x16xf32>) + return + } +} diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tfmod/CMakeLists.txt b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tfmod/CMakeLists.txt new file mode 100644 index 000000000..4ba86b47c --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tfmod/CMakeLists.txt @@ -0,0 +1,9 @@ +# Copyright (c) 2026 Huawei Technologies Co., Ltd. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. + +pto_tilelang_vec_st(tfmod) diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tfmod/cases.py b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tfmod/cases.py new file mode 100644 index 000000000..6e61a42f2 --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tfmod/cases.py @@ -0,0 +1,47 @@ +# Copyright (c) 2026 Huawei Technologies Co., Ltd. 
+# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. + +# coding=utf-8 + +"""Single source of truth for tfmod ST test cases. + +Each case defines: + - name: case identifier, used as subdirectory name and by main.cpp kCases[]. + - dtype: numpy dtype (e.g. np.float32). + - shape: (rows, cols) — allocated tile dimensions. + - valid_shape: (valid_rows, valid_cols) — effective computation region. + - eps: tolerance for numpy.allclose (atol and rtol). + +gen_data.py and compare.py both import this list to avoid redundant definitions. 
+""" + +import numpy as np + +CASES = [ + { + "name": "f32_16x64", + "dtype": np.float32, + "shape": (16, 64), + "valid_shape": (16, 64), + "eps": 1e-6, + }, + { + "name": "f32_32x32", + "dtype": np.float32, + "shape": (32, 32), + "valid_shape": (32, 32), + "eps": 1e-6, + }, +] + +_SMOKE_CASE_NAMES = ['f32_16x64', 'f32_32x32'] +_SMOKE_CASE_NAME_SET = set(_SMOKE_CASE_NAMES) +_missing = [name for name in _SMOKE_CASE_NAMES if name not in {case["name"] for case in CASES}] +if _missing: + raise RuntimeError("unknown smoke case(s): " + ", ".join(_missing)) +CASES = [case for case in CASES if case["name"] in _SMOKE_CASE_NAME_SET] diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tfmod/compare.py b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tfmod/compare.py new file mode 100644 index 000000000..6a4d5d1aa --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tfmod/compare.py @@ -0,0 +1,48 @@ +# Copyright (c) 2026 Huawei Technologies Co., Ltd. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. 
+ +# coding=utf-8 + +import os +import sys +import numpy as np + +from cases import CASES +from st_common import result_cmp, style_fail, style_pass, validate_cases + + +def main(): + validate_cases(CASES) + case_filter = sys.argv[1] if len(sys.argv) > 1 else None + + all_passed = True + for case in CASES: + if case_filter is not None and case["name"] != case_filter: + continue + + case_dir = case["name"] + shape = case["shape"] + vr, vc = case["valid_shape"] + + golden = np.fromfile(os.path.join(case_dir, "golden.bin"), dtype=case["dtype"]).reshape(shape) + output = np.fromfile(os.path.join(case_dir, "output.bin"), dtype=case["dtype"]).reshape(shape) + + ok = result_cmp(golden[:vr, :vc], output[:vr, :vc], case["eps"]) + if ok: + print(style_pass(f"[INFO] {case['name']}: compare passed")) + else: + print(style_fail(f"[ERROR] {case['name']}: compare failed")) + all_passed = False + + if not all_passed: + sys.exit(2) + print(style_pass("[INFO] all cases passed")) + + +if __name__ == "__main__": + main() diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tfmod/gen_data.py b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tfmod/gen_data.py new file mode 100644 index 000000000..12725e041 --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tfmod/gen_data.py @@ -0,0 +1,32 @@ +# Copyright (c) 2026 Huawei Technologies Co., Ltd. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. 
+ +# coding=utf-8 + +import numpy as np +from cases import CASES +from st_common import validate_cases, setup_case_rng, save_case_data + +validate_cases(CASES) + +for case in CASES: + setup_case_rng(case) + + dtype = case["dtype"] + shape = case["shape"] + valid_shape = case["valid_shape"] + + input1 = np.random.randint(1, 10, size=shape).astype(dtype) + input2 = np.random.randint(1, 10, size=shape).astype(dtype) + + golden = np.zeros(shape, dtype=dtype) + vr, vc = valid_shape + golden[:vr, :vc] = np.fmod(input1[:vr, :vc], input2[:vr, :vc]) + + save_case_data(case["name"], {"input1": input1, "input2": input2, "golden": golden}) + print(f"[INFO] gen_data: {case['name']} shape={shape} valid_shape={valid_shape} dtype={dtype.__name__}") diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tfmod/launch.cpp b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tfmod/launch.cpp new file mode 100644 index 000000000..801b7521e --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tfmod/launch.cpp @@ -0,0 +1,28 @@ +// Copyright (c) 2026 Huawei Technologies Co., Ltd. +// This program is free software, you can redistribute it and/or modify it under the terms and conditions of +// CANN Open Software License Agreement Version 2.0 (the "License"). +// Please refer to the License for details. You may not use this file except in compliance with the License. +// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +// See LICENSE in the root of the software repository for the full text of the License. 
+ +#include + +#ifndef AICORE +#define AICORE [aicore] +#endif + +// Case 0: f32 16x64 + +extern "C" __global__ AICORE void TFMOD_f32_16x64(__gm__ float *a, __gm__ float *b, __gm__ float *c); +extern "C" __global__ AICORE void TFMOD_f32_32x32(__gm__ float *a, __gm__ float *b, __gm__ float *c); + +void LaunchTFMOD_f32_16x64(float *a, float *b, float *c, void *stream) { + TFMOD_f32_16x64<<<1, nullptr, stream>>>((__gm__ float *)a, (__gm__ float *)b, (__gm__ float *)c); +} + + + +void LaunchTFMOD_f32_32x32(float *a, float *b, float *c, void *stream) { + TFMOD_f32_32x32<<<1, nullptr, stream>>>((__gm__ float *)a, (__gm__ float *)b, (__gm__ float *)c); +} diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tfmod/main.cpp b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tfmod/main.cpp new file mode 100644 index 000000000..8659e8dc9 --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tfmod/main.cpp @@ -0,0 +1,145 @@ +// Copyright (c) 2026 Huawei Technologies Co., Ltd. +// This program is free software, you can redistribute it and/or modify it under the terms and conditions of +// CANN Open Software License Agreement Version 2.0 (the "License"). +// Please refer to the License for details. You may not use this file except in compliance with the License. +// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +// See LICENSE in the root of the software repository for the full text of the License. + +// Host driver for TileLang tfmod ST — case-table driven. +// Each case launches a different kernel variant, reads/writes from per-case subdirectory. +// Numerical comparison is done externally by compare.py.
+ +#include "acl/acl.h" +#include "test_common.h" +#include +#include +#include +#include +#include +#include + +using namespace PtoTestCommon; + +// Kernel launch wrappers (defined in launch.cpp) +void LaunchTFMOD_f32_16x64(float *a, float *b, float *c, void *stream); +void LaunchTFMOD_f32_32x32(float *a, float *b, float *c, void *stream); + +using LaunchFn = void (*)(float *, float *, float *, void *); + +struct TestCase { + const char *name; + LaunchFn launch; + size_t rows; // allocated tile rows + size_t cols; // allocated tile cols + size_t validRows; // effective computation rows (<= rows) + size_t validCols; // effective computation cols (<= cols) + size_t elemSize; // bytes per element +}; + +static const TestCase kCases[] = { +{"f32_16x64", LaunchTFMOD_f32_16x64, 16, 64, 16, 64, sizeof(float)}, +{"f32_32x32", LaunchTFMOD_f32_32x32, 32, 32, 32, 32, sizeof(float)}, +}; +static constexpr size_t kNumCases = sizeof(kCases) / sizeof(kCases[0]); + +static int RunCase(const TestCase &tc, int deviceId, aclrtStream stream) { + int rc = 0; + const size_t elemCount = tc.rows * tc.cols; + const size_t fileSize = elemCount * tc.elemSize; + + std::printf("[INFO] === case: %s (shape=%zux%zu, valid=%zux%zu) ===\n", + tc.name, tc.rows, tc.cols, tc.validRows, tc.validCols); + + // Per-case data directory + std::string caseDir = std::string("./") + tc.name; + size_t src0FileSize = fileSize; + size_t src1FileSize = fileSize; + + float *src0Host = nullptr, *src1Host = nullptr, *dstHost = nullptr; + float *src0Device = nullptr, *src1Device = nullptr, *dstDevice = nullptr; + + aclrtMallocHost((void **)(&src0Host), fileSize); + aclrtMallocHost((void **)(&src1Host), fileSize); + aclrtMallocHost((void **)(&dstHost), fileSize); + + aclrtMalloc((void **)&src0Device, fileSize, ACL_MEM_MALLOC_HUGE_FIRST); + aclrtMalloc((void **)&src1Device, fileSize, ACL_MEM_MALLOC_HUGE_FIRST); + aclrtMalloc((void **)&dstDevice, fileSize, ACL_MEM_MALLOC_HUGE_FIRST); + + if (!ReadFile((caseDir + 
"/input1.bin").c_str(), src0FileSize, src0Host, fileSize)) { + std::fprintf(stderr, "[ERROR] failed to read %s/input1.bin\n", caseDir.c_str()); + rc = 1; + } + if (rc == 0 && !ReadFile((caseDir + "/input2.bin").c_str(), src1FileSize, src1Host, fileSize)) { + std::fprintf(stderr, "[ERROR] failed to read %s/input2.bin\n", caseDir.c_str()); + rc = 1; + } + + if (rc == 0) { + aclrtMemcpy(src0Device, fileSize, src0Host, fileSize, ACL_MEMCPY_HOST_TO_DEVICE); + aclrtMemcpy(src1Device, fileSize, src1Host, fileSize, ACL_MEMCPY_HOST_TO_DEVICE); + + tc.launch(src0Device, src1Device, dstDevice, stream); + + aclrtSynchronizeStream(stream); + aclrtMemcpy(dstHost, fileSize, dstDevice, fileSize, ACL_MEMCPY_DEVICE_TO_HOST); + } + + if (rc == 0 && !WriteFile((caseDir + "/output.bin").c_str(), dstHost, fileSize)) { + std::fprintf(stderr, "[ERROR] failed to write %s/output.bin\n", caseDir.c_str()); + rc = 1; + } + + if (src0Device != nullptr) + aclrtFree(src0Device); + if (src1Device != nullptr) + aclrtFree(src1Device); + if (dstDevice != nullptr) + aclrtFree(dstDevice); + if (src0Host != nullptr) + aclrtFreeHost(src0Host); + if (src1Host != nullptr) + aclrtFreeHost(src1Host); + if (dstHost != nullptr) + aclrtFreeHost(dstHost); + + if (rc == 0) + std::printf("[INFO] case %s done\n", tc.name); + return rc; +} + +int main(int argc, char *argv[]) { + // Optional case filter: ./tfmod [case_name] + const char *caseFilter = (argc > 1) ? 
argv[1] : nullptr; + + int rc = 0; + int deviceId = 0; + aclrtStream stream = nullptr; + + aclInit(nullptr); + if (const char *envDevice = std::getenv("ACL_DEVICE_ID")) { + deviceId = std::atoi(envDevice); + } + aclrtSetDevice(deviceId); + aclrtCreateStream(&stream); + + for (size_t i = 0; i < kNumCases; ++i) { + if (caseFilter != nullptr && std::strcmp(kCases[i].name, caseFilter) != 0) { + continue; + } + int ret = RunCase(kCases[i], deviceId, stream); + if (ret != 0) { + std::fprintf(stderr, "[ERROR] case %s failed\n", kCases[i].name); + rc = 1; + break; + } + } + + if (stream != nullptr) + aclrtDestroyStream(stream); + aclrtResetDevice(deviceId); + aclFinalize(); + + return rc; +} diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tfmod/tfmod.pto b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tfmod/tfmod.pto new file mode 100644 index 000000000..c3b3260d8 --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tfmod/tfmod.pto @@ -0,0 +1,142 @@ +// Copyright (c) 2026 Huawei Technologies Co., Ltd. +// This program is free software, you can redistribute it and/or modify it under the terms and conditions of +// CANN Open Software License Agreement Version 2.0 (the "License"). +// Please refer to the License for details. You may not use this file except in compliance with the License. +// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +// See LICENSE in the root of the software repository for the full text of the License. + +// TileLang ST kernels for pto.tfmod: tload(a) + tload(b) + tfmod(a,b)->c + tstore(c). +// Multiple cases with different shapes in a single module. +// Compiled by ptoas --enable-insert-sync --enable-tile-op-expand --pto-backend=vpto +// to produce a fatobj object.
+ +module attributes {pto.target_arch = "a5", pto.kernel_kind = #pto.kernel_kind} { + // Case 0: f32 16x64 (1024 elements) + func.func @TFMOD_f32_16x64(%a_ptr: !pto.ptr, %b_ptr: !pto.ptr, %c_ptr: !pto.ptr) attributes {pto.kernel} { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c16 = arith.constant 16 : index + %c64 = arith.constant 64 : index + %c1024 = arith.constant 1024 : index + + %a_view = pto.make_tensor_view %a_ptr, + shape = [%c1, %c1, %c1, %c16, %c64], + strides = [%c1024, %c1024, %c1024, %c64, %c1] + : !pto.tensor_view<1x1x1x16x64xf32> + %b_view = pto.make_tensor_view %b_ptr, + shape = [%c1, %c1, %c1, %c16, %c64], + strides = [%c1024, %c1024, %c1024, %c64, %c1] + : !pto.tensor_view<1x1x1x16x64xf32> + %c_view = pto.make_tensor_view %c_ptr, + shape = [%c1, %c1, %c1, %c16, %c64], + strides = [%c1024, %c1024, %c1024, %c64, %c1] + : !pto.tensor_view<1x1x1x16x64xf32> + + %a_part = pto.partition_view %a_view, + offsets = [%c0, %c0, %c0, %c0, %c0], + sizes = [%c1, %c1, %c1, %c16, %c64] + : !pto.tensor_view<1x1x1x16x64xf32> -> !pto.partition_tensor_view<1x1x1x16x64xf32> + %b_part = pto.partition_view %b_view, + offsets = [%c0, %c0, %c0, %c0, %c0], + sizes = [%c1, %c1, %c1, %c16, %c64] + : !pto.tensor_view<1x1x1x16x64xf32> -> !pto.partition_tensor_view<1x1x1x16x64xf32> + %c_part = pto.partition_view %c_view, + offsets = [%c0, %c0, %c0, %c0, %c0], + sizes = [%c1, %c1, %c1, %c16, %c64] + : !pto.tensor_view<1x1x1x16x64xf32> -> !pto.partition_tensor_view<1x1x1x16x64xf32> + + %a = pto.alloc_tile + : !pto.tile_buf + %b = pto.alloc_tile + : !pto.tile_buf + %c = pto.alloc_tile + : !pto.tile_buf + + pto.tload ins(%a_part : !pto.partition_tensor_view<1x1x1x16x64xf32>) + outs(%a : !pto.tile_buf) + pto.tload ins(%b_part : !pto.partition_tensor_view<1x1x1x16x64xf32>) + outs(%b : !pto.tile_buf) + + pto.tfmod ins(%a, %b : !pto.tile_buf, + !pto.tile_buf) + outs(%c : !pto.tile_buf) + + pto.tstore ins(%c : !pto.tile_buf) + outs(%c_part : 
!pto.partition_tensor_view<1x1x1x16x64xf32>) + return + } + + // Case 1: f32 32x32 (1024 elements) + + func.func @TFMOD_f32_32x32(%a_ptr: !pto.ptr, %b_ptr: !pto.ptr, %c_ptr: !pto.ptr) attributes {pto.kernel} { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c32 = arith.constant 32 : index + %c1024 = arith.constant 1024 : index + + %a_view = pto.make_tensor_view %a_ptr, + shape = [%c1, %c1, %c1, %c32, %c32], + strides = [%c1024, %c1024, %c1024, %c32, %c1] + : !pto.tensor_view<1x1x1x32x32xf32> + %b_view = pto.make_tensor_view %b_ptr, + shape = [%c1, %c1, %c1, %c32, %c32], + strides = [%c1024, %c1024, %c1024, %c32, %c1] + : !pto.tensor_view<1x1x1x32x32xf32> + %c_view = pto.make_tensor_view %c_ptr, + shape = [%c1, %c1, %c1, %c32, %c32], + strides = [%c1024, %c1024, %c1024, %c32, %c1] + : !pto.tensor_view<1x1x1x32x32xf32> + + %a_part = pto.partition_view %a_view, + offsets = [%c0, %c0, %c0, %c0, %c0], + sizes = [%c1, %c1, %c1, %c32, %c32] + : !pto.tensor_view<1x1x1x32x32xf32> -> !pto.partition_tensor_view<1x1x1x32x32xf32> + %b_part = pto.partition_view %b_view, + offsets = [%c0, %c0, %c0, %c0, %c0], + sizes = [%c1, %c1, %c1, %c32, %c32] + : !pto.tensor_view<1x1x1x32x32xf32> -> !pto.partition_tensor_view<1x1x1x32x32xf32> + %c_part = pto.partition_view %c_view, + offsets = [%c0, %c0, %c0, %c0, %c0], + sizes = [%c1, %c1, %c1, %c32, %c32] + : !pto.tensor_view<1x1x1x32x32xf32> -> !pto.partition_tensor_view<1x1x1x32x32xf32> + + %a = pto.alloc_tile + : !pto.tile_buf + %b = pto.alloc_tile + : !pto.tile_buf + %c = pto.alloc_tile + : !pto.tile_buf + + pto.tload ins(%a_part : !pto.partition_tensor_view<1x1x1x32x32xf32>) + outs(%a : !pto.tile_buf) + pto.tload ins(%b_part : !pto.partition_tensor_view<1x1x1x32x32xf32>) + outs(%b : !pto.tile_buf) + + pto.tfmod ins(%a, %b : !pto.tile_buf, + !pto.tile_buf) + outs(%c : !pto.tile_buf) + + pto.tstore ins(%c : !pto.tile_buf) + outs(%c_part : !pto.partition_tensor_view<1x1x1x32x32xf32>) + return + } +} diff --git 
a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tfmods/CMakeLists.txt b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tfmods/CMakeLists.txt new file mode 100644 index 000000000..0d47eae66 --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tfmods/CMakeLists.txt @@ -0,0 +1,9 @@ +# Copyright (c) 2026 Huawei Technologies Co., Ltd. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. + +pto_tilelang_vec_st(tfmods) diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tfmods/cases.py b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tfmods/cases.py new file mode 100644 index 000000000..8633b5888 --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tfmods/cases.py @@ -0,0 +1,63 @@ +#!/usr/bin/python3 +# Copyright (c) 2026 Huawei Technologies Co., Ltd. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. +# Copyright (c) 2026 Huawei Technologies Co., Ltd. 
+# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. + +# coding=utf-8 + +"""Single source of truth for tfmods ST test cases. + +tfmods: floating-point modulo, dst = src - trunc(src/scalar) * scalar +Only f32 and f16 types. +""" + +import numpy as np + +CASES = [ + { + "name": "f32_32x64", + "dtype": np.float32, + "shape": (32, 64), + "valid_shape": (32, 64), + "eps": 1e-6, + }, + { + "name": "f16_63x64", + "dtype": np.float16, + "shape": (63, 64), + "valid_shape": (63, 64), + "eps": 1e-3, + }, + { + "name": "f32_7x448", + "dtype": np.float32, + "shape": (7, 448), + "valid_shape": (7, 448), + "eps": 1e-6, + }, + { + "name": "f32_256x16", + "dtype": np.float32, + "shape": (256, 16), + "valid_shape": (256, 16), + "eps": 1e-6, + }, +] + +_SMOKE_CASE_NAMES = ['f32_32x64', 'f16_63x64'] +_SMOKE_CASE_NAME_SET = set(_SMOKE_CASE_NAMES) +_missing = [name for name in _SMOKE_CASE_NAMES if name not in {case["name"] for case in CASES}] +if _missing: + raise RuntimeError("unknown smoke case(s): " + ", ".join(_missing)) +CASES = [case for case in CASES if case["name"] in _SMOKE_CASE_NAME_SET] diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tfmods/compare.py b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tfmods/compare.py new file mode 100644 index 000000000..18835ae9f --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tfmods/compare.py @@ -0,0 +1,56 @@ +#!/usr/bin/python3 +# Copyright (c) 2026 Huawei Technologies Co., Ltd. 
+# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. +# Copyright (c) 2026 Huawei Technologies Co., Ltd. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. 
+ +# coding=utf-8 + +import os +import sys +import numpy as np + +from cases import CASES +from st_common import result_cmp, style_fail, style_pass, validate_cases + + +def main(): + validate_cases(CASES) + case_filter = sys.argv[1] if len(sys.argv) > 1 else None + + all_passed = True + for case in CASES: + if case_filter is not None and case["name"] != case_filter: + continue + + case_dir = case["name"] + shape = case["shape"] + vr, vc = case["valid_shape"] + + golden = np.fromfile(os.path.join(case_dir, "golden.bin"), dtype=case["dtype"]).reshape(shape) + output = np.fromfile(os.path.join(case_dir, "output.bin"), dtype=case["dtype"]).reshape(shape) + + ok = result_cmp(golden[:vr, :vc], output[:vr, :vc], case["eps"]) + if ok: + print(style_pass(f"[INFO] {case['name']}: compare passed")) + else: + print(style_fail(f"[ERROR] {case['name']}: compare failed")) + all_passed = False + + if not all_passed: + sys.exit(2) + print(style_pass("[INFO] all cases passed")) + + +if __name__ == "__main__": + main() diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tfmods/gen_data.py b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tfmods/gen_data.py new file mode 100644 index 000000000..8e72da579 --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tfmods/gen_data.py @@ -0,0 +1,43 @@ +#!/usr/bin/python3 +# Copyright (c) 2026 Huawei Technologies Co., Ltd. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. 
+# Copyright (c) 2026 Huawei Technologies Co., Ltd. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. + +# coding=utf-8 + +import numpy as np +from cases import CASES +from st_common import validate_cases, setup_case_rng, save_case_data + +# Scalar value for floating-point modulo (matches the scalar passed in launch.cpp) +SCALAR = 3.0 + +validate_cases(CASES) + +for case in CASES: + setup_case_rng(case) + + dtype = case["dtype"] + shape = case["shape"] + valid_shape = case["valid_shape"] + + input1 = np.random.randint(1, 10, size=shape).astype(dtype) + + golden = np.zeros(shape, dtype=dtype) + vr, vc = valid_shape + scalar_val = dtype(SCALAR) + golden[:vr, :vc] = np.fmod(input1[:vr, :vc], scalar_val).astype(dtype, copy=False) + + save_case_data(case["name"], {"input1": input1, "golden": golden}) + print(f"[INFO] gen_data: {case['name']} shape={shape} valid_shape={valid_shape} dtype={dtype.__name__} scalar={SCALAR}") diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tfmods/launch.cpp b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tfmods/launch.cpp new file mode 100644 index 000000000..a88a1b0b0 --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tfmods/launch.cpp @@ -0,0 +1,31 @@ +// Copyright (c) 2026 Huawei Technologies Co., Ltd. +// This program is free software, you can redistribute it and/or modify it under the terms and conditions of +// CANN Open Software License Agreement Version 2.0 (the "License"). 
+// Please refer to the License for details. You may not use this file except in compliance with the License. +// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +// See LICENSE in the root of the software repository for the full text of the License. + +#include + +#ifndef AICORE +#define AICORE [aicore] +#endif + +// Scalar value for floating-point modulo (must match gen_data.py SCALAR); the f16 launcher passes the same 3.0 as its raw IEEE-754 binary16 bit pattern 0x4200. +static constexpr float TFMODS_SCALAR_F32 = 3.0f; + +// Kernel entry points for the smoke cases: f32 32x64 and f16 63x64 + +extern "C" __global__ AICORE void TFMODS_f32_32x64(__gm__ float *src, __gm__ float *dst, float scalar); +extern "C" __global__ AICORE void TFMODS_f16_63x64(__gm__ unsigned short *src, __gm__ unsigned short *dst, unsigned short scalar); + +void LaunchTFMODS_f16_63x64(unsigned short *src, unsigned short *dst, void *stream) { + TFMODS_f16_63x64<<<1, nullptr, stream>>>((__gm__ unsigned short *)src, (__gm__ unsigned short *)dst, (unsigned short)0x4200); +} + + + +void LaunchTFMODS_f32_32x64(float *src, float *dst, void *stream) { + TFMODS_f32_32x64<<<1, nullptr, stream>>>((__gm__ float *)src, (__gm__ float *)dst, TFMODS_SCALAR_F32); +} diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tfmods/main.cpp b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tfmods/main.cpp new file mode 100644 index 000000000..6162c9abc --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tfmods/main.cpp @@ -0,0 +1,132 @@ +// Copyright (c) 2026 Huawei Technologies Co., Ltd. +// This program is free software, you can redistribute it and/or modify it under the terms and conditions of +// CANN Open Software License Agreement Version 2.0 (the "License"). +// Please refer to the License for details. You may not use this file except in compliance with the License. 
+// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +// See LICENSE in the root of the software repository for the full text of the License. + +// Host driver for TileLang tfmods ST — case-table driven. +// tfmods: dst = src - trunc(src/scalar) * scalar (floating-point modulo). +// Numerical comparison is done externally by compare.py. + +#include "acl/acl.h" +#include "test_common.h" +#include +#include +#include +#include +#include +#include + +using namespace PtoTestCommon; + +// Kernel launch wrappers (defined in launch.cpp) +void LaunchTFMODS_f32_32x64(float *src, float *dst, void *stream); +void LaunchTFMODS_f16_63x64(uint16_t *src, uint16_t *dst, void *stream); +void LaunchTFMODS_f32_256x16(float *src, float *dst, void *stream); + +struct TestCase { + const char *name; + void (*launch)(void *, void *, void *); // src, dst, stream + size_t rows; // allocated tile rows + size_t cols; // allocated tile cols + size_t validRows; // effective computation rows (<= rows) + size_t validCols; // effective computation cols (<= cols) + size_t elemSize; // bytes per element +}; + +static const TestCase kCases[] = { +{"f32_32x64", (void (*)(void*,void*,void*))LaunchTFMODS_f32_32x64, 32, 64, 32, 64, sizeof(float)}, +{"f16_63x64", (void (*)(void*,void*,void*))LaunchTFMODS_f16_63x64, 63, 64, 63, 64, sizeof(uint16_t)}, +}; +static constexpr size_t kNumCases = sizeof(kCases) / sizeof(kCases[0]); + +static int RunCase(const TestCase &tc, int deviceId, aclrtStream stream) { + int rc = 0; + const size_t elemCount = tc.rows * tc.cols; + const size_t fileSize = elemCount * tc.elemSize; + + std::printf("[INFO] === case: %s (shape=%zux%zu, valid=%zux%zu) ===\n", + tc.name, tc.rows, tc.cols, tc.validRows, tc.validCols); + + // Per-case data directory + std::string caseDir = std::string("./") + tc.name; + size_t srcFileSize = 
fileSize; + + void *srcHost = nullptr, *dstHost = nullptr; + void *srcDevice = nullptr, *dstDevice = nullptr; + + aclrtMallocHost(&srcHost, fileSize); + aclrtMallocHost(&dstHost, fileSize); + + aclrtMalloc(&srcDevice, fileSize, ACL_MEM_MALLOC_HUGE_FIRST); + aclrtMalloc(&dstDevice, fileSize, ACL_MEM_MALLOC_HUGE_FIRST); + + if (!ReadFile((caseDir + "/input1.bin").c_str(), srcFileSize, srcHost, fileSize)) { + std::fprintf(stderr, "[ERROR] failed to read %s/input1.bin\n", caseDir.c_str()); + rc = 1; + } + + if (rc == 0) { + aclrtMemcpy(srcDevice, fileSize, srcHost, fileSize, ACL_MEMCPY_HOST_TO_DEVICE); + + tc.launch(srcDevice, dstDevice, stream); + + aclrtSynchronizeStream(stream); + aclrtMemcpy(dstHost, fileSize, dstDevice, fileSize, ACL_MEMCPY_DEVICE_TO_HOST); + } + + if (rc == 0 && !WriteFile((caseDir + "/output.bin").c_str(), dstHost, fileSize)) { + std::fprintf(stderr, "[ERROR] failed to write %s/output.bin\n", caseDir.c_str()); + rc = 1; + } + + if (srcDevice != nullptr) + aclrtFree(srcDevice); + if (dstDevice != nullptr) + aclrtFree(dstDevice); + if (srcHost != nullptr) + aclrtFreeHost(srcHost); + if (dstHost != nullptr) + aclrtFreeHost(dstHost); + + if (rc == 0) + std::printf("[INFO] case %s done\n", tc.name); + return rc; +} + +int main(int argc, char *argv[]) { + // Optional case filter: ./tfmods [case_name] + const char *caseFilter = (argc > 1) ? 
argv[1] : nullptr; + + int rc = 0; + int deviceId = 0; + aclrtStream stream = nullptr; + + aclInit(nullptr); + if (const char *envDevice = std::getenv("ACL_DEVICE_ID")) { + deviceId = std::atoi(envDevice); + } + aclrtSetDevice(deviceId); + aclrtCreateStream(&stream); + + for (size_t i = 0; i < kNumCases; ++i) { + if (caseFilter != nullptr && std::strcmp(kCases[i].name, caseFilter) != 0) { + continue; + } + int ret = RunCase(kCases[i], deviceId, stream); + if (ret != 0) { + std::fprintf(stderr, "[ERROR] case %s failed\n", kCases[i].name); + rc = 1; + break; + } + } + + if (stream != nullptr) + aclrtDestroyStream(stream); + aclrtResetDevice(deviceId); + aclFinalize(); + + return rc; +} diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tfmods/tfmods.pto b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tfmods/tfmods.pto new file mode 100644 index 000000000..6153e4489 --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tfmods/tfmods.pto @@ -0,0 +1,99 @@ +// Copyright (c) 2026 Huawei Technologies Co., Ltd. +// This program is free software, you can redistribute it and/or modify it under the terms and conditions of +// CANN Open Software License Agreement Version 2.0 (the "License"). +// Please refer to the License for details. You may not use this file except in compliance with the License. +// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +// See LICENSE in the root of the software repository for the full text of the License. + +// TileLang ST kernels for pto.tfmods: tload(src) + tfmods(src, scalar)->dst + tstore(dst). +// Floating-point modulo: dst = src - trunc(src/scalar) * scalar. +// Only f32 and f16 types. +// Compiled by ptoas --enable-insert-sync --enable-tile-op-expand --pto-backend=vpto +// to produce a fatobj object. 
+ +module attributes {pto.target_arch = "a5", pto.kernel_kind = #pto.kernel_kind} { + + // Case 0: f32 32x64 (2048 elements) + func.func @TFMODS_f32_32x64(%src_ptr: !pto.ptr, %dst_ptr: !pto.ptr, %scalar: f32) attributes {pto.kernel} { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c32 = arith.constant 32 : index + %c64 = arith.constant 64 : index + %c2048 = arith.constant 2048 : index + + %src_view = pto.make_tensor_view %src_ptr, + shape = [%c1, %c1, %c1, %c32, %c64], + strides = [%c2048, %c2048, %c2048, %c64, %c1] + : !pto.tensor_view<1x1x1x32x64xf32> + %dst_view = pto.make_tensor_view %dst_ptr, + shape = [%c1, %c1, %c1, %c32, %c64], + strides = [%c2048, %c2048, %c2048, %c64, %c1] + : !pto.tensor_view<1x1x1x32x64xf32> + + %src_part = pto.partition_view %src_view, + offsets = [%c0, %c0, %c0, %c0, %c0], + sizes = [%c1, %c1, %c1, %c32, %c64] + : !pto.tensor_view<1x1x1x32x64xf32> -> !pto.partition_tensor_view<1x1x1x32x64xf32> + %dst_part = pto.partition_view %dst_view, + offsets = [%c0, %c0, %c0, %c0, %c0], + sizes = [%c1, %c1, %c1, %c32, %c64] + : !pto.tensor_view<1x1x1x32x64xf32> -> !pto.partition_tensor_view<1x1x1x32x64xf32> + + %src = pto.alloc_tile + : !pto.tile_buf + %dst = pto.alloc_tile + : !pto.tile_buf + + pto.tload ins(%src_part : !pto.partition_tensor_view<1x1x1x32x64xf32>) + outs(%src : !pto.tile_buf) + pto.tfmods ins(%src, %scalar : !pto.tile_buf, f32) + outs(%dst : !pto.tile_buf) + pto.tstore ins(%dst : !pto.tile_buf) + outs(%dst_part : !pto.partition_tensor_view<1x1x1x32x64xf32>) + return + } + + // Case 1: f16 63x64 (4032 elements) + + func.func @TFMODS_f16_63x64(%src_ptr: !pto.ptr, %dst_ptr: !pto.ptr, %scalar: f16) attributes {pto.kernel} { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c63 = arith.constant 63 : index + %c64 = arith.constant 64 : index + %c4032 = arith.constant 4032 : index + + %src_view = pto.make_tensor_view %src_ptr, + shape = [%c1, %c1, %c1, %c63, %c64], + strides = [%c4032, %c4032, 
%c4032, %c64, %c1] + : !pto.tensor_view<1x1x1x63x64xf16> + %dst_view = pto.make_tensor_view %dst_ptr, + shape = [%c1, %c1, %c1, %c63, %c64], + strides = [%c4032, %c4032, %c4032, %c64, %c1] + : !pto.tensor_view<1x1x1x63x64xf16> + + %src_part = pto.partition_view %src_view, + offsets = [%c0, %c0, %c0, %c0, %c0], + sizes = [%c1, %c1, %c1, %c63, %c64] + : !pto.tensor_view<1x1x1x63x64xf16> -> !pto.partition_tensor_view<1x1x1x63x64xf16> + %dst_part = pto.partition_view %dst_view, + offsets = [%c0, %c0, %c0, %c0, %c0], + sizes = [%c1, %c1, %c1, %c63, %c64] + : !pto.tensor_view<1x1x1x63x64xf16> -> !pto.partition_tensor_view<1x1x1x63x64xf16> + + %src = pto.alloc_tile + : !pto.tile_buf + %dst = pto.alloc_tile + : !pto.tile_buf + + pto.tload ins(%src_part : !pto.partition_tensor_view<1x1x1x63x64xf16>) + outs(%src : !pto.tile_buf) + pto.tfmods ins(%src, %scalar : !pto.tile_buf, f16) + outs(%dst : !pto.tile_buf) + pto.tstore ins(%dst : !pto.tile_buf) + outs(%dst_part : !pto.partition_tensor_view<1x1x1x63x64xf16>) + return + } + + // Case 2: f32 7x448 (3136 elements) +} diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tload/CMakeLists.txt b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tload/CMakeLists.txt new file mode 100644 index 000000000..a4ef685a5 --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tload/CMakeLists.txt @@ -0,0 +1,9 @@ +# Copyright (c) 2026 Huawei Technologies Co., Ltd. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. 
+# See LICENSE in the root of the software repository for the full text of the License. + +pto_tilelang_vec_st(tload) diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tload/cases.py b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tload/cases.py new file mode 100644 index 000000000..985e87616 --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tload/cases.py @@ -0,0 +1,130 @@ +#!/usr/bin/python3 +# Copyright (c) 2026 Huawei Technologies Co., Ltd. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. 
+ +import numpy as np + +CASES = [ + { + "name": "nd_f32_16x64", + "dtype": np.float32, + "shape": (16, 64), + "valid_shape": (16, 64), + "eps": 1e-6, + }, + { + "name": "dn_f32_16x64", + "dtype": np.float32, + "shape": (16, 64), + "valid_shape": (16, 64), + "eps": 1e-6, + }, + { + "name": "nz_f32_128x128", + "dtype": np.float32, + "shape": (128, 128), + "valid_shape": (128, 128), + "eps": 1e-6, + }, + { + "name": "nd_pad_zero_f32_16x64", + "dtype": np.float32, + "shape": (16, 64), + "valid_shape": (16, 63), + "eps": 1e-6, + "golden_fill": 0.0, + }, + { + "name": "dn_pad_max_f32_16x64", + "dtype": np.float32, + "shape": (16, 64), + "valid_shape": (15, 64), + "eps": 1e-6, + "golden_fill": np.finfo(np.float32).max, + }, + { + "name": "nz_pad_min_f32_128x128", + "dtype": np.float32, + "shape": (128, 128), + "valid_shape": (64, 128), + "eps": 1e-6, + "golden_fill": np.finfo(np.float32).min, + }, +] + + +def build_expected_output(case, input_arr): + shape = case["shape"] + vr, vc = case["valid_shape"] + dtype = case["dtype"] + + if "golden_fill" in case: + golden = np.full(shape, case["golden_fill"], dtype=dtype) + else: + golden = np.empty(shape, dtype=dtype) + + if case["name"].startswith("dn_pad_"): + flat_in = np.asarray(input_arr, dtype=dtype).reshape(-1) + flat_golden = golden.reshape(-1) + physical_rows = shape[0] + for col in range(vc): + start = physical_rows * col + flat_golden[start : start + vr] = flat_in[start : start + vr] + return golden + + if case["name"].startswith("nz_pad_"): + flat_in = np.asarray(input_arr, dtype=dtype).reshape(-1) + flat_golden = golden.reshape(-1) + block_rows = 8 + block_size = block_rows * shape[1] + num_blocks = shape[0] // block_rows + valid_rows_per_block = vr // num_blocks + for block in range(num_blocks): + base = block * block_size + valid_elems = valid_rows_per_block * shape[1] + flat_golden[base : base + valid_elems] = flat_in[base : base + valid_elems] + return golden + + if "golden_fill" in case: + golden[:vr, :vc] = 
input_arr[:vr, :vc] + return golden + + return np.asarray(input_arr, dtype=dtype).copy() + + +def select_compared_region(case, arr): + vr, vc = case["valid_shape"] + + if case["name"].startswith("dn_pad_"): + flat = np.asarray(arr).reshape(-1) + physical_rows = case["shape"][0] + pieces = [flat[physical_rows * col : physical_rows * col + vr] for col in range(vc)] + return np.concatenate(pieces) if pieces else flat[:0] + + if case["name"].startswith("nz_pad_"): + flat = np.asarray(arr).reshape(-1) + shape = case["shape"] + block_rows = 8 + block_size = block_rows * shape[1] + num_blocks = shape[0] // block_rows + valid_rows_per_block = vr // num_blocks + pieces = [] + for block in range(num_blocks): + base = block * block_size + valid_elems = valid_rows_per_block * shape[1] + pieces.append(flat[base : base + valid_elems]) + return np.concatenate(pieces) if pieces else flat[:0] + + return np.asarray(arr)[:vr, :vc] + +_SMOKE_CASE_NAMES = ['nd_f32_16x64', 'dn_pad_max_f32_16x64'] +_SMOKE_CASE_NAME_SET = set(_SMOKE_CASE_NAMES) +_missing = [name for name in _SMOKE_CASE_NAMES if name not in {case["name"] for case in CASES}] +if _missing: + raise RuntimeError("unknown smoke case(s): " + ", ".join(_missing)) +CASES = [case for case in CASES if case["name"] in _SMOKE_CASE_NAME_SET] diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tload/compare.py b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tload/compare.py new file mode 100644 index 000000000..6adc9c9fe --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tload/compare.py @@ -0,0 +1,50 @@ +#!/usr/bin/python3 +# Copyright (c) 2026 Huawei Technologies Co., Ltd. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. 
+# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. + +import os +import sys +import numpy as np + +from cases import CASES, select_compared_region +from st_common import result_cmp, style_fail, style_pass, validate_cases + + +def main(): + validate_cases(CASES) + case_filter = sys.argv[1] if len(sys.argv) > 1 else None + + all_passed = True + for case in CASES: + if case_filter is not None and case["name"] != case_filter: + continue + + case_dir = case["name"] + shape = case["shape"] + + golden = np.fromfile(os.path.join(case_dir, "golden.bin"), dtype=case["dtype"]).reshape(shape) + output = np.fromfile(os.path.join(case_dir, "output.bin"), dtype=case["dtype"]).reshape(shape) + + ok = result_cmp( + select_compared_region(case, golden), + select_compared_region(case, output), + case["eps"], + ) + if ok: + print(style_pass(f"[INFO] {case['name']}: compare passed")) + else: + print(style_fail(f"[ERROR] {case['name']}: compare failed")) + all_passed = False + + if not all_passed: + sys.exit(2) + print(style_pass("[INFO] all cases passed")) + + +if __name__ == "__main__": + main() diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tload/gen_data.py b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tload/gen_data.py new file mode 100644 index 000000000..449291f26 --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tload/gen_data.py @@ -0,0 +1,30 @@ +#!/usr/bin/python3 +# Copyright (c) 2026 Huawei Technologies Co., Ltd. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. 
+# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. + +import numpy as np +from cases import CASES, build_expected_output +from st_common import validate_cases, setup_case_rng, save_case_data + +validate_cases(CASES) + +for case in CASES: + setup_case_rng(case) + + dtype = case["dtype"] + shape = case["shape"] + vr, vc = case["valid_shape"] + + input_arr = np.random.randint(1, 17, size=shape).astype(dtype) + golden = build_expected_output(case, input_arr) + + save_case_data(case["name"], {"input": input_arr, "golden": golden}) + print( + f"[INFO] gen_data: {case['name']} shape={shape} " + f"valid_shape={(vr, vc)} dtype={dtype.__name__}" + ) diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tload/launch.cpp b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tload/launch.cpp new file mode 100644 index 000000000..bc7998c4a --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tload/launch.cpp @@ -0,0 +1,26 @@ +// Copyright (c) 2026 Huawei Technologies Co., Ltd. +// This program is free software, you can redistribute it and/or modify it under the terms and conditions of +// CANN Open Software License Agreement Version 2.0 (the "License"). +// Please refer to the License for details. You may not use this file except in compliance with the License. +// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +// See LICENSE in the root of the software repository for the full text of the License. 
+ +#include + +#ifndef AICORE +#define AICORE [aicore] +#endif + +extern "C" __global__ AICORE void TLOAD_ND_f32_16x64(__gm__ float *src, __gm__ float *dst); +extern "C" __global__ AICORE void TLOAD_DN_PAD_MAX_f32_16x64(__gm__ float *src, __gm__ float *dst); + +void LaunchTLOAD_ND_f32_16x64(float *src, float *dst, void *stream) { + TLOAD_ND_f32_16x64<<<1, nullptr, stream>>>((__gm__ float *)src, (__gm__ float *)dst); +} + + + +void LaunchTLOAD_DN_PAD_MAX_f32_16x64(float *src, float *dst, void *stream) { + TLOAD_DN_PAD_MAX_f32_16x64<<<1, nullptr, stream>>>((__gm__ float *)src, (__gm__ float *)dst); +} diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tload/main.cpp b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tload/main.cpp new file mode 100644 index 000000000..f721bcbc5 --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tload/main.cpp @@ -0,0 +1,140 @@ +// Copyright (c) 2026 Huawei Technologies Co., Ltd. +// This program is free software, you can redistribute it and/or modify it under the terms and conditions of +// CANN Open Software License Agreement Version 2.0 (the "License"). +// Please refer to the License for details. You may not use this file except in compliance with the License. +// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +// See LICENSE in the root of the software repository for the full text of the License. + +// Host driver for TileLang tload/tstore ST. +// Each case performs a GM -> Tile -> GM round trip and compare.py checks that +// output.bin matches golden.bin (within eps) over the valid region of the requested layout. 
+ +#include "acl/acl.h" +#include "test_common.h" +#include +#include +#include +#include +#include + +using namespace PtoTestCommon; + +void LaunchTLOAD_ND_f32_16x64(float *src, float *dst, void *stream); +void LaunchTLOAD_DN_f32_16x64(float *src, float *dst, void *stream); +void LaunchTLOAD_ND_PAD_ZERO_f32_16x64(float *src, float *dst, void *stream); +void LaunchTLOAD_DN_PAD_MAX_f32_16x64(float *src, float *dst, void *stream); +void LaunchTLOAD_NZ_PAD_MIN_f32_128x128(float *src, float *dst, void *stream); + +using LaunchFn = void (*)(float *, float *, void *); + +struct TestCase { + const char *name; + LaunchFn launch; + size_t rows; + size_t cols; + size_t elemSize; +}; + +static const TestCase kCases[] = { +{"nd_f32_16x64", LaunchTLOAD_ND_f32_16x64, 16, 64, sizeof(float)}, +{"dn_pad_max_f32_16x64", LaunchTLOAD_DN_PAD_MAX_f32_16x64, 16, 64, sizeof(float)}, +}; +static constexpr size_t kNumCases = sizeof(kCases) / sizeof(kCases[0]); + +static int RunCase(const TestCase &tc, aclrtStream stream) { + int rc = 0; + const size_t elemCount = tc.rows * tc.cols; + const size_t fileSize = elemCount * tc.elemSize; + + std::printf("[INFO] === case: %s (%zux%zu) ===\n", tc.name, tc.rows, tc.cols); + + std::string caseDir = std::string("./") + tc.name; + size_t inputFileSize = fileSize; + + float *srcHost = nullptr; + float *dstHost = nullptr; + float *srcDevice = nullptr; + float *dstDevice = nullptr; + + aclrtMallocHost((void **)(&srcHost), fileSize); + aclrtMallocHost((void **)(&dstHost), fileSize); + aclrtMalloc((void **)&srcDevice, fileSize, ACL_MEM_MALLOC_HUGE_FIRST); + aclrtMalloc((void **)&dstDevice, fileSize, ACL_MEM_MALLOC_HUGE_FIRST); + + if (!ReadFile((caseDir + "/input.bin").c_str(), inputFileSize, srcHost, fileSize)) { + std::fprintf(stderr, "[ERROR] failed to read %s/input.bin\n", caseDir.c_str()); + rc = 1; + } + + if (rc == 0) { + aclrtMemcpy(srcDevice, fileSize, srcHost, fileSize, ACL_MEMCPY_HOST_TO_DEVICE); + tc.launch(srcDevice, dstDevice, stream); + 
aclrtSynchronizeStream(stream); + aclrtMemcpy(dstHost, fileSize, dstDevice, fileSize, ACL_MEMCPY_DEVICE_TO_HOST); + } + + if (rc == 0 && !WriteFile((caseDir + "/output.bin").c_str(), dstHost, fileSize)) { + std::fprintf(stderr, "[ERROR] failed to write %s/output.bin\n", caseDir.c_str()); + rc = 1; + } + + if (srcDevice != nullptr) + aclrtFree(srcDevice); + if (dstDevice != nullptr) + aclrtFree(dstDevice); + if (srcHost != nullptr) + aclrtFreeHost(srcHost); + if (dstHost != nullptr) + aclrtFreeHost(dstHost); + + if (rc == 0) + std::printf("[INFO] case %s done\n", tc.name); + return rc; +} + +int main(int argc, char *argv[]) { + const char *caseFilter = (argc > 1) ? argv[1] : nullptr; + + int rc = 0; + bool matchedCase = (caseFilter == nullptr); + int deviceId = 0; + aclrtStream stream = nullptr; + + aclInit(nullptr); + if (const char *envDevice = std::getenv("ACL_DEVICE_ID")) { + deviceId = std::atoi(envDevice); + } + aclrtSetDevice(deviceId); + aclrtCreateStream(&stream); + + for (size_t i = 0; i < kNumCases; ++i) { + if (caseFilter != nullptr && std::strcmp(kCases[i].name, caseFilter) != 0) { + continue; + } + matchedCase = true; + int ret = RunCase(kCases[i], stream); + if (ret != 0) { + std::fprintf(stderr, "[ERROR] case %s failed\n", kCases[i].name); + rc = 1; + break; + } + } + + if (!matchedCase) { + std::fprintf(stderr, "[ERROR] unknown case filter: %s\n", caseFilter); + std::fprintf(stderr, "[ERROR] supported cases:"); + for (size_t i = 0; i < kNumCases; ++i) { + std::fprintf(stderr, " %s", kCases[i].name); + } + std::fprintf(stderr, "\n"); + rc = 1; + } + + if (stream != nullptr) + aclrtDestroyStream(stream); + aclrtResetDevice(deviceId); + aclFinalize(); + + return rc; +} diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tload/tload.pto b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tload/tload.pto new file mode 100644 index 000000000..65c93e4ae --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tload/tload.pto @@ -0,0 +1,85 @@ 
+// Copyright (c) 2026 Huawei Technologies Co., Ltd. +// This program is free software, you can redistribute it and/or modify it under the terms and conditions of +// CANN Open Software License Agreement Version 2.0 (the "License"). +// Please refer to the License for details. You may not use this file except in compliance with the License. +// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +// See LICENSE in the root of the software repository for the full text of the License. + +// TileLang ST kernels for pto.tload + pto.tstore round-trip coverage. +// Each kernel only performs GM -> Tile -> GM, so the testcase validates the +// DMA layout path directly for ND, DN, and NZ vector tiles. + +module attributes {pto.target_arch = "a5", pto.kernel_kind = #pto.kernel_kind} { + func.func @TLOAD_ND_f32_16x64(%src_ptr: !pto.ptr, %dst_ptr: !pto.ptr) attributes {pto.kernel} { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c16 = arith.constant 16 : index + %c64 = arith.constant 64 : index + %c1024 = arith.constant 1024 : index + + %src_view = pto.make_tensor_view %src_ptr, + shape = [%c1, %c1, %c1, %c16, %c64], + strides = [%c1024, %c1024, %c1024, %c64, %c1] + : !pto.tensor_view<1x1x1x16x64xf32> + %dst_view = pto.make_tensor_view %dst_ptr, + shape = [%c1, %c1, %c1, %c16, %c64], + strides = [%c1024, %c1024, %c1024, %c64, %c1] + : !pto.tensor_view<1x1x1x16x64xf32> + + %src_part = pto.partition_view %src_view, + offsets = [%c0, %c0, %c0, %c0, %c0], + sizes = [%c1, %c1, %c1, %c16, %c64] + : !pto.tensor_view<1x1x1x16x64xf32> -> !pto.partition_tensor_view<1x1x1x16x64xf32> + %dst_part = pto.partition_view %dst_view, + offsets = [%c0, %c0, %c0, %c0, %c0], + sizes = [%c1, %c1, %c1, %c16, %c64] + : !pto.tensor_view<1x1x1x16x64xf32> -> !pto.partition_tensor_view<1x1x1x16x64xf32> + + %tile = 
pto.alloc_tile + : !pto.tile_buf + + pto.tload ins(%src_part : !pto.partition_tensor_view<1x1x1x16x64xf32>) + outs(%tile : !pto.tile_buf) + pto.tstore ins(%tile : !pto.tile_buf) + outs(%dst_part : !pto.partition_tensor_view<1x1x1x16x64xf32>) + return + } + + + func.func @TLOAD_DN_PAD_MAX_f32_16x64(%src_ptr: !pto.ptr, %dst_ptr: !pto.ptr) attributes {pto.kernel} { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c15 = arith.constant 15 : index + %c16 = arith.constant 16 : index + %c64 = arith.constant 64 : index + %c1024 = arith.constant 1024 : index + + %src_view = pto.make_tensor_view %src_ptr, + shape = [%c1, %c1, %c1, %c15, %c64], + strides = [%c1024, %c1024, %c1024, %c1, %c16] + : !pto.tensor_view<1x1x1x15x64xf32> + %dst_view = pto.make_tensor_view %dst_ptr, + shape = [%c1, %c1, %c1, %c15, %c64], + strides = [%c1024, %c1024, %c1024, %c1, %c16] + : !pto.tensor_view<1x1x1x15x64xf32> + + %src_part = pto.partition_view %src_view, + offsets = [%c0, %c0, %c0, %c0, %c0], + sizes = [%c1, %c1, %c1, %c15, %c64] + : !pto.tensor_view<1x1x1x15x64xf32> -> !pto.partition_tensor_view<1x1x1x15x64xf32> + %dst_part = pto.partition_view %dst_view, + offsets = [%c0, %c0, %c0, %c0, %c0], + sizes = [%c1, %c1, %c1, %c15, %c64] + : !pto.tensor_view<1x1x1x15x64xf32> -> !pto.partition_tensor_view<1x1x1x15x64xf32> + + %tile = pto.alloc_tile + : !pto.tile_buf + + pto.tload ins(%src_part : !pto.partition_tensor_view<1x1x1x15x64xf32>) + outs(%tile : !pto.tile_buf) + pto.tstore ins(%tile : !pto.tile_buf) + outs(%dst_part : !pto.partition_tensor_view<1x1x1x15x64xf32>) + return + } +} diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tlog/CMakeLists.txt b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tlog/CMakeLists.txt new file mode 100644 index 000000000..f17ca9cf8 --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tlog/CMakeLists.txt @@ -0,0 +1,9 @@ +# Copyright (c) 2026 Huawei Technologies Co., Ltd. 
+# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. + +pto_tilelang_vec_st(tlog) diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tlog/cases.py b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tlog/cases.py new file mode 100644 index 000000000..5dd8c5ce1 --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tlog/cases.py @@ -0,0 +1,94 @@ +#!/usr/bin/python3 +# Copyright (c) 2026 Huawei Technologies Co., Ltd. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. + +# coding=utf-8 + +"""Single source of truth for tlog ST test cases. + +Each case defines: + - name: case identifier, used as subdirectory name and by main.cpp kCases[]. + - dtype: numpy dtype (e.g. np.float32). + - shape: (rows, cols) — allocated tile dimensions. + - valid_shape: (valid_rows, valid_cols) — effective computation region. + - eps: tolerance for numpy.allclose (atol and rtol). 
+ +gen_data.py and compare.py both import this list to avoid redundant definitions. +""" + +import numpy as np + +CASES = [ + { + "name": "f32_16x64", + "dtype": np.float32, + "shape": (16, 64), + "valid_shape": (16, 64), + "eps": 1e-6, + }, + { + "name": "f32_32x32", + "dtype": np.float32, + "shape": (32, 32), + "valid_shape": (32, 32), + "eps": 1e-6, + }, + { + "name": "f16_16x64", + "dtype": np.float16, + "shape": (16, 64), + "valid_shape": (16, 64), + "eps": 1e-3, + }, + { + "name": "f16_32x32", + "dtype": np.float16, + "shape": (32, 32), + "valid_shape": (32, 32), + "eps": 1e-3, + }, + { + "name": "f32_16x64_hp", + "dtype": np.float32, + "shape": (16, 64), + "valid_shape": (16, 64), + "eps": 1e-5, + "precision_type": "high_precision", + }, + { + "name": "f32_32x32_hp", + "dtype": np.float32, + "shape": (32, 32), + "valid_shape": (32, 32), + "eps": 1e-5, + "precision_type": "high_precision", + }, + { + "name": "f16_16x64_hp", + "dtype": np.float16, + "shape": (16, 64), + "valid_shape": (16, 64), + "eps": 1e-3, + "precision_type": "high_precision", + }, + { + "name": "f16_32x32_hp", + "dtype": np.float16, + "shape": (32, 32), + "valid_shape": (32, 32), + "eps": 1e-3, + "precision_type": "high_precision", + }, +] + +_SMOKE_CASE_NAMES = ['f32_16x64', 'f16_16x64_hp'] +_SMOKE_CASE_NAME_SET = set(_SMOKE_CASE_NAMES) +_missing = [name for name in _SMOKE_CASE_NAMES if name not in {case["name"] for case in CASES}] +if _missing: + raise RuntimeError("unknown smoke case(s): " + ", ".join(_missing)) +CASES = [case for case in CASES if case["name"] in _SMOKE_CASE_NAME_SET] diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tlog/compare.py b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tlog/compare.py new file mode 100644 index 000000000..428604929 --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tlog/compare.py @@ -0,0 +1,49 @@ +#!/usr/bin/python3 +# Copyright (c) 2026 Huawei Technologies Co., Ltd. 
+# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. + +# coding=utf-8 + +import os +import sys +import numpy as np + +from cases import CASES +from st_common import result_cmp, style_fail, style_pass, validate_cases + + +def main(): + validate_cases(CASES) + case_filter = sys.argv[1] if len(sys.argv) > 1 else None + + all_passed = True + for case in CASES: + if case_filter is not None and case["name"] != case_filter: + continue + + case_dir = case["name"] + shape = case["shape"] + vr, vc = case["valid_shape"] + + golden = np.fromfile(os.path.join(case_dir, "golden.bin"), dtype=case["dtype"]).reshape(shape) + output = np.fromfile(os.path.join(case_dir, "output.bin"), dtype=case["dtype"]).reshape(shape) + + ok = result_cmp(golden[:vr, :vc], output[:vr, :vc], case["eps"]) + if ok: + print(style_pass(f"[INFO] {case['name']}: compare passed")) + else: + print(style_fail(f"[ERROR] {case['name']}: compare failed")) + all_passed = False + + if not all_passed: + sys.exit(2) + print(style_pass("[INFO] all cases passed")) + + +if __name__ == "__main__": + main() diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tlog/gen_data.py b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tlog/gen_data.py new file mode 100644 index 000000000..459d8fb12 --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tlog/gen_data.py @@ -0,0 +1,33 @@ +#!/usr/bin/python3 +# Copyright (c) 2026 Huawei Technologies Co., Ltd. 
+# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. + +# coding=utf-8 + +import numpy as np +from cases import CASES +from st_common import validate_cases, setup_case_rng, save_case_data + +validate_cases(CASES) + +for case in CASES: + setup_case_rng(case) + + dtype = case["dtype"] + shape = case["shape"] + valid_shape = case["valid_shape"] + + # Generate positive random values for log (log requires positive inputs) + input = np.random.uniform(0.1, 10.0, size=shape).astype(dtype) + + golden = np.zeros(shape, dtype=dtype) + vr, vc = valid_shape + golden[:vr, :vc] = np.log(input[:vr, :vc]).astype(dtype, copy=False) + + save_case_data(case["name"], {"input": input, "golden": golden}) + print(f"[INFO] gen_data: {case['name']} shape={shape} valid_shape={valid_shape} dtype={dtype.__name__}") diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tlog/launch.cpp b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tlog/launch.cpp new file mode 100644 index 000000000..44e3e557e --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tlog/launch.cpp @@ -0,0 +1,28 @@ +// Copyright (c) 2026 Huawei Technologies Co., Ltd. +// This program is free software, you can redistribute it and/or modify it under the terms and conditions of +// CANN Open Software License Agreement Version 2.0 (the "License"). +// Please refer to the License for details. You may not use this file except in compliance with the License. 
+// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +// See LICENSE in the root of the software repository for the full text of the License. + +#include + +#ifndef AICORE +#define AICORE [aicore] +#endif + +// Case 0: f32 16x64 + +extern "C" __global__ AICORE void TLOG_f32_16x64(__gm__ float *a, __gm__ float *b); +extern "C" __global__ AICORE void TLOG_f16_16x64_hp(__gm__ uint16_t *a, __gm__ uint16_t *b); + +void LaunchTLOG_f32_16x64(void *a, void *b, void *stream) { + TLOG_f32_16x64<<<1, nullptr, stream>>>((__gm__ float *)a, (__gm__ float *)b); +} + + + +void LaunchTLOG_f16_16x64_hp(void *a, void *b, void *stream) { + TLOG_f16_16x64_hp<<<1, nullptr, stream>>>((__gm__ uint16_t *)a, (__gm__ uint16_t *)b); +} diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tlog/main.cpp b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tlog/main.cpp new file mode 100644 index 000000000..381cfa20e --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tlog/main.cpp @@ -0,0 +1,137 @@ +// Copyright (c) 2026 Huawei Technologies Co., Ltd. +// This program is free software, you can redistribute it and/or modify it under the terms and conditions of +// CANN Open Software License Agreement Version 2.0 (the "License"). +// Please refer to the License for details. You may not use this file except in compliance with the License. +// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +// See LICENSE in the root of the software repository for the full text of the License. + +// Host driver for TileLang tlog ST — case-table driven. +// Each case launches a different kernel variant, reads/writes from per-case subdirectory. 
+// Numerical comparison is done externally by compare.py. + +#include "acl/acl.h" +#include "test_common.h" +#include +#include +#include +#include +#include +#include + +using namespace PtoTestCommon; + +// Kernel launch wrappers (defined in launch.cpp) +void LaunchTLOG_f32_16x64(void *a, void *b, void *stream); +void LaunchTLOG_f32_32x32(void *a, void *b, void *stream); +void LaunchTLOG_f16_32x32(void *a, void *b, void *stream); +void LaunchTLOG_f32_32x32_hp(void *a, void *b, void *stream); +void LaunchTLOG_f16_16x64_hp(void *a, void *b, void *stream); +void LaunchTLOG_f16_32x32_hp(void *a, void *b, void *stream); + +using LaunchFn = void (*)(void *, void *, void *); + +struct TestCase { + const char *name; + LaunchFn launch; + size_t rows; // allocated tile rows + size_t cols; // allocated tile cols + size_t validRows; // effective computation rows (<= rows) + size_t validCols; // effective computation cols (<= cols) + size_t elemSize; // bytes per element +}; + +static const TestCase kCases[] = { +{"f32_16x64", LaunchTLOG_f32_16x64, 16, 64, 16, 64, sizeof(float)}, +{"f16_16x64_hp", LaunchTLOG_f16_16x64_hp, 16, 64, 16, 64, sizeof(uint16_t)}, +}; +static constexpr size_t kNumCases = sizeof(kCases) / sizeof(kCases[0]); + +static int RunCase(const TestCase &tc, int deviceId, aclrtStream stream) { + int rc = 0; + const size_t elemCount = tc.rows * tc.cols; + const size_t fileSize = elemCount * tc.elemSize; + + std::printf("[INFO] === case: %s (shape=%zux%zu, valid=%zux%zu) ===\n", + tc.name, tc.rows, tc.cols, tc.validRows, tc.validCols); + + // Per-case data directory + std::string caseDir = std::string("./") + tc.name; + size_t srcFileSize = fileSize; + + void *srcHost = nullptr, *dstHost = nullptr; + void *srcDevice = nullptr, *dstDevice = nullptr; + + aclrtMallocHost((void **)(&srcHost), fileSize); + aclrtMallocHost((void **)(&dstHost), fileSize); + + aclrtMalloc((void **)&srcDevice, fileSize, ACL_MEM_MALLOC_HUGE_FIRST); + aclrtMalloc((void **)&dstDevice, 
fileSize, ACL_MEM_MALLOC_HUGE_FIRST); + + if (!ReadFile((caseDir + "/input.bin").c_str(), srcFileSize, srcHost, fileSize)) { + std::fprintf(stderr, "[ERROR] failed to read %s/input.bin\n", caseDir.c_str()); + rc = 1; + } + + if (rc == 0) { + aclrtMemcpy(srcDevice, fileSize, srcHost, fileSize, ACL_MEMCPY_HOST_TO_DEVICE); + + tc.launch(srcDevice, dstDevice, stream); + + aclrtSynchronizeStream(stream); + aclrtMemcpy(dstHost, fileSize, dstDevice, fileSize, ACL_MEMCPY_DEVICE_TO_HOST); + } + + if (rc == 0 && !WriteFile((caseDir + "/output.bin").c_str(), dstHost, fileSize)) { + std::fprintf(stderr, "[ERROR] failed to write %s/output.bin\n", caseDir.c_str()); + rc = 1; + } + + if (srcDevice != nullptr) + aclrtFree(srcDevice); + if (dstDevice != nullptr) + aclrtFree(dstDevice); + if (srcHost != nullptr) + aclrtFreeHost(srcHost); + if (dstHost != nullptr) + aclrtFreeHost(dstHost); + + if (rc == 0) + std::printf("[INFO] case %s done\n", tc.name); + return rc; +} + +int main(int argc, char *argv[]) { + // Optional case filter: ./tlog [case_name] + const char *caseFilter = (argc > 1) ? 
argv[1] : nullptr; + + int rc = 0; + int deviceId = 0; + aclrtStream stream = nullptr; + + aclInit(nullptr); + if (const char *envDevice = std::getenv("ACL_DEVICE_ID")) { + deviceId = std::atoi(envDevice); + } + aclrtSetDevice(deviceId); + aclrtCreateStream(&stream); + + for (size_t i = 0; i < kNumCases; ++i) { + if (caseFilter != nullptr && std::strcmp(kCases[i].name, caseFilter) != 0) { + continue; + } + int ret = RunCase(kCases[i], deviceId, stream); + if (ret != 0) { + std::fprintf(stderr, "[ERROR] case %s failed\n", kCases[i].name); + rc = 1; + break; + } + } + + if (stream != nullptr) + aclrtDestroyStream(stream); + aclrtResetDevice(deviceId); + aclFinalize(); + + return rc; +} diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tlog/tlog.pto b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tlog/tlog.pto new file mode 100644 index 000000000..df491091a --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tlog/tlog.pto @@ -0,0 +1,102 @@ +// Copyright (c) 2026 Huawei Technologies Co., Ltd. +// This program is free software, you can redistribute it and/or modify it under the terms and conditions of +// CANN Open Software License Agreement Version 2.0 (the "License"). +// Please refer to the License for details. You may not use this file except in compliance with the License. +// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +// See LICENSE in the root of the software repository for the full text of the License. + +// TileLang ST kernels for pto.tlog: tload(a) + tlog(a)->b + tstore(b). +// Multiple cases with different shapes in a single module. +// Compiled by ptoas --enable-insert-sync --enable-tile-op-expand --pto-backend=vpto +// to produce a fatobj object. 
+ +module attributes {pto.target_arch = "a5", pto.kernel_kind = #pto.kernel_kind} { + // Case 0: f32 16x64 (1024 elements) + func.func @TLOG_f32_16x64(%a_ptr: !pto.ptr, %b_ptr: !pto.ptr) attributes {pto.kernel} { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c16 = arith.constant 16 : index + %c64 = arith.constant 64 : index + %c1024 = arith.constant 1024 : index + + %a_view = pto.make_tensor_view %a_ptr, + shape = [%c1, %c1, %c1, %c16, %c64], + strides = [%c1024, %c1024, %c1024, %c64, %c1] + : !pto.tensor_view<1x1x1x16x64xf32> + %b_view = pto.make_tensor_view %b_ptr, + shape = [%c1, %c1, %c1, %c16, %c64], + strides = [%c1024, %c1024, %c1024, %c64, %c1] + : !pto.tensor_view<1x1x1x16x64xf32> + + %a_part = pto.partition_view %a_view, + offsets = [%c0, %c0, %c0, %c0, %c0], + sizes = [%c1, %c1, %c1, %c16, %c64] + : !pto.tensor_view<1x1x1x16x64xf32> -> !pto.partition_tensor_view<1x1x1x16x64xf32> + %b_part = pto.partition_view %b_view, + offsets = [%c0, %c0, %c0, %c0, %c0], + sizes = [%c1, %c1, %c1, %c16, %c64] + : !pto.tensor_view<1x1x1x16x64xf32> -> !pto.partition_tensor_view<1x1x1x16x64xf32> + + %a = pto.alloc_tile + : !pto.tile_buf + %b = pto.alloc_tile + : !pto.tile_buf + + pto.tload ins(%a_part : !pto.partition_tensor_view<1x1x1x16x64xf32>) + outs(%a : !pto.tile_buf) + + pto.tlog ins(%a : !pto.tile_buf) + outs(%b : !pto.tile_buf) + + pto.tstore ins(%b : !pto.tile_buf) + outs(%b_part : !pto.partition_tensor_view<1x1x1x16x64xf32>) + return + } + + // Case 1: f32 32x32 (1024 elements) + + func.func @TLOG_f16_16x64_hp(%a_ptr: !pto.ptr, %b_ptr: !pto.ptr) attributes {pto.kernel} { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c16 = arith.constant 16 : index + %c64 = arith.constant 64 : index + %c1024 = arith.constant 1024 : index + + %a_view = pto.make_tensor_view %a_ptr, + shape = [%c1, %c1, %c1, %c16, %c64], + strides = [%c1024, %c1024, %c1024, %c64, %c1] + : !pto.tensor_view<1x1x1x16x64xf16> + %b_view = pto.make_tensor_view 
%b_ptr, + shape = [%c1, %c1, %c1, %c16, %c64], + strides = [%c1024, %c1024, %c1024, %c64, %c1] + : !pto.tensor_view<1x1x1x16x64xf16> + + %a_part = pto.partition_view %a_view, + offsets = [%c0, %c0, %c0, %c0, %c0], + sizes = [%c1, %c1, %c1, %c16, %c64] + : !pto.tensor_view<1x1x1x16x64xf16> -> !pto.partition_tensor_view<1x1x1x16x64xf16> + %b_part = pto.partition_view %b_view, + offsets = [%c0, %c0, %c0, %c0, %c0], + sizes = [%c1, %c1, %c1, %c16, %c64] + : !pto.tensor_view<1x1x1x16x64xf16> -> !pto.partition_tensor_view<1x1x1x16x64xf16> + + %a = pto.alloc_tile + : !pto.tile_buf + %b = pto.alloc_tile + : !pto.tile_buf + + pto.tload ins(%a_part : !pto.partition_tensor_view<1x1x1x16x64xf16>) + outs(%a : !pto.tile_buf) + + pto.tlog ins(%a : !pto.tile_buf) + outs(%b : !pto.tile_buf) + {precisionType = #pto} + + pto.tstore ins(%b : !pto.tile_buf) + outs(%b_part : !pto.partition_tensor_view<1x1x1x16x64xf16>) + return + } + + // Case 8: f16 32x32 high precision (1024 elements) +} diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tlrelu/CMakeLists.txt b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tlrelu/CMakeLists.txt new file mode 100644 index 000000000..885abd64b --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tlrelu/CMakeLists.txt @@ -0,0 +1,9 @@ +# Copyright (c) 2026 Huawei Technologies Co., Ltd. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. 
+ +pto_tilelang_vec_st(tlrelu) diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tlrelu/cases.py b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tlrelu/cases.py new file mode 100644 index 000000000..eb4f86680 --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tlrelu/cases.py @@ -0,0 +1,72 @@ +#!/usr/bin/python3 +# Copyright (c) 2026 Huawei Technologies Co., Ltd. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. + +# coding=utf-8 + +"""Single source of truth for tlrelu ST test cases. + +Each case defines: + - name: case identifier, used as subdirectory name and by main.cpp kCases[]. + - dtype: numpy dtype (e.g. np.float32). + - shape: (rows, cols) — src tile dimensions (UB allocation). + - valid_shape: (valid_rows, valid_cols) — effective computation region. + - dst_shape: (rows, cols) — dst tile physical dimensions (UB allocation, may have padding). + - dst_valid_shape: (valid_rows, valid_cols) — dst effective region (same as valid_shape). + - eps: tolerance for numpy.allclose (atol and rtol). + +gen_data.py and compare.py both import this list to avoid redundant definitions. 
+""" + +import numpy as np + +CASES = [ + { + "name": "f32_32x64_dst128", + "dtype": np.float32, + "shape": (32, 64), + "valid_shape": (32, 64), + "dst_shape": (32, 128), + "dst_valid_shape": (32, 64), + "eps": 1e-3, + }, + { + "name": "f16_63x64_dst128", + "dtype": np.float16, + "shape": (63, 64), + "valid_shape": (63, 64), + "dst_shape": (63, 128), + "dst_valid_shape": (63, 64), + "eps": 1e-3, + }, + { + "name": "f32_7x448_dst512", + "dtype": np.float32, + "shape": (7, 448), + "valid_shape": (7, 448), + "dst_shape": (7, 512), + "dst_valid_shape": (7, 448), + "eps": 1e-3, + }, + { + "name": "f32_256x16_dst32", + "dtype": np.float32, + "shape": (256, 16), + "valid_shape": (256, 16), + "dst_shape": (256, 32), + "dst_valid_shape": (256, 16), + "eps": 1e-3, + }, +] + +_SMOKE_CASE_NAMES = ['f32_32x64_dst128', 'f32_7x448_dst512'] +_SMOKE_CASE_NAME_SET = set(_SMOKE_CASE_NAMES) +_missing = [name for name in _SMOKE_CASE_NAMES if name not in {case["name"] for case in CASES}] +if _missing: + raise RuntimeError("unknown smoke case(s): " + ", ".join(_missing)) +CASES = [case for case in CASES if case["name"] in _SMOKE_CASE_NAME_SET] diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tlrelu/compare.py b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tlrelu/compare.py new file mode 100644 index 000000000..207326ea2 --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tlrelu/compare.py @@ -0,0 +1,49 @@ +#!/usr/bin/python3 +# Copyright (c) 2026 Huawei Technologies Co., Ltd. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. 
+# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. + +# coding=utf-8 + +import os +import sys +import numpy as np + +from cases import CASES +from st_common import result_cmp, style_fail, style_pass, validate_cases + + +def main(): + validate_cases(CASES) + case_filter = sys.argv[1] if len(sys.argv) > 1 else None + + all_passed = True + for case in CASES: + if case_filter is not None and case["name"] != case_filter: + continue + + case_dir = case["name"] + dst_shape = case["dst_shape"] + vr, vc = case["valid_shape"] + + golden = np.fromfile(os.path.join(case_dir, "golden.bin"), dtype=case["dtype"]).reshape(dst_shape) + output = np.fromfile(os.path.join(case_dir, "output.bin"), dtype=case["dtype"]).reshape(dst_shape) + + ok = result_cmp(golden[:vr, :vc], output[:vr, :vc], case["eps"]) + if ok: + print(style_pass(f"[INFO] {case['name']}: compare passed")) + else: + print(style_fail(f"[ERROR] {case['name']}: compare failed")) + all_passed = False + + if not all_passed: + sys.exit(2) + print(style_pass("[INFO] all cases passed")) + + +if __name__ == "__main__": + main() diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tlrelu/gen_data.py b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tlrelu/gen_data.py new file mode 100644 index 000000000..bf442ee74 --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tlrelu/gen_data.py @@ -0,0 +1,45 @@ +#!/usr/bin/python3 +# Copyright (c) 2026 Huawei Technologies Co., Ltd. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. 
+# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. + +# coding=utf-8 + +import numpy as np +import struct +from cases import CASES +from st_common import validate_cases, setup_case_rng, save_case_data + +validate_cases(CASES) + +for case in CASES: + setup_case_rng(case) + + dtype = case["dtype"] + shape = case["shape"] + dst_shape = case["dst_shape"] + valid_shape = case["valid_shape"] + + rows, cols = shape + dst_rows, dst_cols = dst_shape + vr, vc = valid_shape + + input_arr = np.random.uniform(low=-8, high=8, size=(rows, cols)).astype(dtype) + slope = np.random.uniform(low=-8, high=8, size=(1, 1)).astype(np.float32) + golden = np.zeros((dst_rows, dst_cols), dtype=dtype) + + for i in range(vr): + for j in range(vc): + if input_arr[i, j] > 0: + golden[i, j] = input_arr[i, j] + else: + golden[i, j] = dtype(input_arr[i, j] * slope[0, 0]) + + slope_arr = np.array([slope[0, 0]], dtype=np.float32) + + save_case_data(case["name"], {"input": input_arr, "slope": slope_arr, "golden": golden}) + print(f"[INFO] gen_data: {case['name']} shape={shape} dst_shape={dst_shape} valid_shape={valid_shape} dtype={dtype.__name__}") diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tlrelu/launch.cpp b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tlrelu/launch.cpp new file mode 100644 index 000000000..c8c23612f --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tlrelu/launch.cpp @@ -0,0 +1,28 @@ +// Copyright (c) 2026 Huawei Technologies Co., Ltd. +// This program is free software, you can redistribute it and/or modify it under the terms and conditions of +// CANN Open Software License Agreement Version 2.0 (the "License"). +// Please refer to the License for details. 
You may not use this file except in compliance with the License. +// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +// See LICENSE in the root of the software repository for the full text of the License. + +#include + +#ifndef AICORE +#define AICORE [aicore] +#endif + +// Case 0: f32 32x64 -> dst 32x128 (valid 32x64) + +extern "C" __global__ AICORE void TLRELU_f32_32x64_dst128(__gm__ float *src, __gm__ float *dst, float slope); +extern "C" __global__ AICORE void TLRELU_f32_7x448_dst512(__gm__ float *src, __gm__ float *dst, float slope); + +void LaunchTLRELU_f32_7x448_dst512(float *src, float *dst, float slope, void *stream) { + TLRELU_f32_7x448_dst512<<<1, nullptr, stream>>>((__gm__ float *)src, (__gm__ float *)dst, slope); +} + + + +void LaunchTLRELU_f32_32x64_dst128(float *src, float *dst, float slope, void *stream) { + TLRELU_f32_32x64_dst128<<<1, nullptr, stream>>>((__gm__ float *)src, (__gm__ float *)dst, slope); +} diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tlrelu/main.cpp b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tlrelu/main.cpp new file mode 100644 index 000000000..7bb1a23eb --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tlrelu/main.cpp @@ -0,0 +1,152 @@ +// Copyright (c) 2026 Huawei Technologies Co., Ltd. +// This program is free software, you can redistribute it and/or modify it under the terms and conditions of +// CANN Open Software License Agreement Version 2.0 (the "License"). +// Please refer to the License for details. You may not use this file except in compliance with the License. +// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. 
+// See LICENSE in the root of the software repository for the full text of the License. + +// Host driver for TileLang tlrelu ST — case-table driven. +// Each case launches a different kernel variant, reads/writes from per-case subdirectory. +// Numerical comparison is done externally by compare.py. + +#include "acl/acl.h" +#include "test_common.h" +#include +#include +#include +#include +#include +#include +#include + +using namespace PtoTestCommon; + +// Kernel launch wrappers (defined in launch.cpp) +void LaunchTLRELU_f32_32x64_dst128(float *src, float *dst, float slope, void *stream); +void LaunchTLRELU_f16_63x64_dst128(uint16_t *src, uint16_t *dst, float slope, void *stream); +void LaunchTLRELU_f32_7x448_dst512(float *src, float *dst, float slope, void *stream); +void LaunchTLRELU_f32_256x16_dst32(float *src, float *dst, float slope, void *stream); + +using LaunchFn = void (*)(void *, void *, float, void *); + +struct TestCase { + const char *name; + LaunchFn launch; + size_t srcRows; // src tile rows + size_t srcCols; // src tile cols + size_t dstRows; // dst tile rows (may have padding) + size_t dstCols; // dst tile cols (may have padding) + size_t validRows; // effective computation rows (<= srcRows, dstRows) + size_t validCols; // effective computation cols (<= srcCols, dstCols) + size_t elemSize; // bytes per element + bool isFp16; // true for float16 case +}; + +static const TestCase kCases[] = { +{"f32_32x64_dst128", (LaunchFn)LaunchTLRELU_f32_32x64_dst128, 32, 64, 32, 128, 32, 64, sizeof(float), false}, +{"f32_7x448_dst512", (LaunchFn)LaunchTLRELU_f32_7x448_dst512, 7, 448, 7, 512, 7, 448, sizeof(float), false}, +}; +static constexpr size_t kNumCases = sizeof(kCases) / sizeof(kCases[0]); + +static int RunCase(const TestCase &tc, int deviceId, aclrtStream stream) { + int rc = 0; + size_t srcFileSize = tc.srcRows * tc.srcCols * tc.elemSize; + size_t dstFileSize = tc.dstRows * tc.dstCols * tc.elemSize; + size_t actualSize = 0; + + std::printf("[INFO] === 
case: %s (src=%zux%zu, dst=%zux%zu, valid=%zux%zu) ===\n", + tc.name, tc.srcRows, tc.srcCols, tc.dstRows, tc.dstCols, tc.validRows, tc.validCols); + + // Per-case data directory + std::string caseDir = std::string("./") + tc.name; + + void *srcHost = nullptr, *dstHost = nullptr; + void *srcDevice = nullptr, *dstDevice = nullptr; + float slope = 0.0f; + + aclrtMallocHost(&srcHost, srcFileSize); + aclrtMallocHost(&dstHost, dstFileSize); + + aclrtMalloc(&srcDevice, srcFileSize, ACL_MEM_MALLOC_HUGE_FIRST); + aclrtMalloc(&dstDevice, dstFileSize, ACL_MEM_MALLOC_HUGE_FIRST); + + if (!ReadFile(caseDir + "/input.bin", actualSize, srcHost, srcFileSize)) { + std::fprintf(stderr, "[ERROR] failed to read %s/input.bin\n", caseDir.c_str()); + rc = 1; + } + + // Read slope (4 bytes float) + if (rc == 0) { + std::ifstream slopeFile(caseDir + "/slope.bin", std::ios::binary); + if (!slopeFile) { + std::fprintf(stderr, "[ERROR] failed to open %s/slope.bin\n", caseDir.c_str()); + rc = 1; + } else { + slopeFile.read(reinterpret_cast(&slope), sizeof(float)); + slopeFile.close(); + } + } + + if (rc == 0) { + aclrtMemcpy(srcDevice, srcFileSize, srcHost, srcFileSize, ACL_MEMCPY_HOST_TO_DEVICE); + + tc.launch(srcDevice, dstDevice, slope, stream); + + aclrtSynchronizeStream(stream); + aclrtMemcpy(dstHost, dstFileSize, dstDevice, dstFileSize, ACL_MEMCPY_DEVICE_TO_HOST); + } + + if (rc == 0 && !WriteFile((caseDir + "/output.bin").c_str(), dstHost, dstFileSize)) { + std::fprintf(stderr, "[ERROR] failed to write %s/output.bin\n", caseDir.c_str()); + rc = 1; + } + + if (srcDevice != nullptr) + aclrtFree(srcDevice); + if (dstDevice != nullptr) + aclrtFree(dstDevice); + if (srcHost != nullptr) + aclrtFreeHost(srcHost); + if (dstHost != nullptr) + aclrtFreeHost(dstHost); + + if (rc == 0) + std::printf("[INFO] case %s done\n", tc.name); + return rc; +} + +int main(int argc, char *argv[]) { + // Optional case filter: ./tlrelu [case_name] + const char *caseFilter = (argc > 1) ? 
argv[1] : nullptr; + + int rc = 0; + int deviceId = 0; + aclrtStream stream = nullptr; + + aclInit(nullptr); + if (const char *envDevice = std::getenv("ACL_DEVICE_ID")) { + deviceId = std::atoi(envDevice); + } + aclrtSetDevice(deviceId); + aclrtCreateStream(&stream); + + for (size_t i = 0; i < kNumCases; ++i) { + if (caseFilter != nullptr && std::strcmp(kCases[i].name, caseFilter) != 0) { + continue; + } + int ret = RunCase(kCases[i], deviceId, stream); + if (ret != 0) { + std::fprintf(stderr, "[ERROR] case %s failed\n", kCases[i].name); + rc = 1; + break; + } + } + + if (stream != nullptr) + aclrtDestroyStream(stream); + aclrtResetDevice(deviceId); + aclFinalize(); + + return rc; +} diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tlrelu/tlrelu.pto b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tlrelu/tlrelu.pto new file mode 100644 index 000000000..18683e522 --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tlrelu/tlrelu.pto @@ -0,0 +1,117 @@ +// Copyright (c) 2026 Huawei Technologies Co., Ltd. +// This program is free software, you can redistribute it and/or modify it under the terms and conditions of +// CANN Open Software License Agreement Version 2.0 (the "License"). +// Please refer to the License for details. You may not use this file except in compliance with the License. +// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +// See LICENSE in the root of the software repository for the full text of the License. + +// TileLang ST kernels for pto.tlrelu: tload(src) + tlrelu(src, slope)->dst + tstore(dst). +// Multiple cases with different shapes in a single module. +// Compiled by ptoas --enable-insert-sync --enable-tile-op-expand --pto-backend=vpto +// to produce a fatobj object. 
+ +module attributes {pto.target_arch = "a5", pto.kernel_kind = #pto.kernel_kind} { + // Case 0: f32 src 32x64 -> dst 32x128 (valid 32x64) + func.func @TLRELU_f32_32x64_dst128(%src_ptr: !pto.ptr, %dst_ptr: !pto.ptr, %slope: f32) attributes {pto.kernel} { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c32 = arith.constant 32 : index + %c64 = arith.constant 64 : index + %c128 = arith.constant 128 : index + %c2048 = arith.constant 2048 : index + %c4096 = arith.constant 4096 : index + + // Src GM view: 1x1x1x32x64 + %src_view = pto.make_tensor_view %src_ptr, + shape = [%c1, %c1, %c1, %c32, %c64], + strides = [%c2048, %c2048, %c2048, %c64, %c1] + : !pto.tensor_view<1x1x1x32x64xf32> + + // Dst GM view: shape=valid_shape (32x64), strides based on dst allocation (32x128) + %dst_view = pto.make_tensor_view %dst_ptr, + shape = [%c1, %c1, %c1, %c32, %c64], + strides = [%c4096, %c4096, %c4096, %c128, %c1] + : !pto.tensor_view<1x1x1x32x64xf32> + + %src_part = pto.partition_view %src_view, + offsets = [%c0, %c0, %c0, %c0, %c0], + sizes = [%c1, %c1, %c1, %c32, %c64] + : !pto.tensor_view<1x1x1x32x64xf32> -> !pto.partition_tensor_view<1x1x1x32x64xf32> + // Dst partition: sizes = valid_shape (32x64) + %dst_part = pto.partition_view %dst_view, + offsets = [%c0, %c0, %c0, %c0, %c0], + sizes = [%c1, %c1, %c1, %c32, %c64] + : !pto.tensor_view<1x1x1x32x64xf32> -> !pto.partition_tensor_view<1x1x1x32x64xf32> + + // Src UB tile: 32x64, valid 32x64 + %src_tile = pto.alloc_tile + : !pto.tile_buf + // Dst UB tile: 32x64, valid 32x64 + %dst_tile = pto.alloc_tile + : !pto.tile_buf + + pto.tload ins(%src_part : !pto.partition_tensor_view<1x1x1x32x64xf32>) + outs(%src_tile : !pto.tile_buf) + + pto.tlrelu ins(%src_tile, %slope : !pto.tile_buf, f32) + outs(%dst_tile : !pto.tile_buf) + + pto.tstore ins(%dst_tile : !pto.tile_buf) + outs(%dst_part : !pto.partition_tensor_view<1x1x1x32x64xf32>) + return + } + + // Case 1: f16 src 63x64 -> dst 63x128 (valid 63x64) + + func.func 
@TLRELU_f32_7x448_dst512(%src_ptr: !pto.ptr, %dst_ptr: !pto.ptr, %slope: f32) attributes {pto.kernel} { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c7 = arith.constant 7 : index + %c448 = arith.constant 448 : index + %c512 = arith.constant 512 : index + %c3136 = arith.constant 3136 : index + %c3584 = arith.constant 3584 : index + + // Src GM view: 1x1x1x7x448 + %src_view = pto.make_tensor_view %src_ptr, + shape = [%c1, %c1, %c1, %c7, %c448], + strides = [%c3136, %c3136, %c3136, %c448, %c1] + : !pto.tensor_view<1x1x1x7x448xf32> + + // Dst GM view: shape=valid_shape (7x448), strides based on dst allocation (7x512) + %dst_view = pto.make_tensor_view %dst_ptr, + shape = [%c1, %c1, %c1, %c7, %c448], + strides = [%c3584, %c3584, %c3584, %c512, %c1] + : !pto.tensor_view<1x1x1x7x448xf32> + + %src_part = pto.partition_view %src_view, + offsets = [%c0, %c0, %c0, %c0, %c0], + sizes = [%c1, %c1, %c1, %c7, %c448] + : !pto.tensor_view<1x1x1x7x448xf32> -> !pto.partition_tensor_view<1x1x1x7x448xf32> + // Dst partition: sizes = valid_shape (7x448) + %dst_part = pto.partition_view %dst_view, + offsets = [%c0, %c0, %c0, %c0, %c0], + sizes = [%c1, %c1, %c1, %c7, %c448] + : !pto.tensor_view<1x1x1x7x448xf32> -> !pto.partition_tensor_view<1x1x1x7x448xf32> + + // Src UB tile: 7x448, valid 7x448 + %src_tile = pto.alloc_tile + : !pto.tile_buf + // Dst UB tile: 7x448, valid 7x448 + %dst_tile = pto.alloc_tile + : !pto.tile_buf + + pto.tload ins(%src_part : !pto.partition_tensor_view<1x1x1x7x448xf32>) + outs(%src_tile : !pto.tile_buf) + + pto.tlrelu ins(%src_tile, %slope : !pto.tile_buf, f32) + outs(%dst_tile : !pto.tile_buf) + + pto.tstore ins(%dst_tile : !pto.tile_buf) + outs(%dst_part : !pto.partition_tensor_view<1x1x1x7x448xf32>) + return + } + + // Case 3: f32 src 256x16 -> dst 256x32 (valid 256x16) +} diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tmatmul/CMakeLists.txt b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tmatmul/CMakeLists.txt new file 
mode 100644 index 000000000..c109a82c7 --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tmatmul/CMakeLists.txt @@ -0,0 +1,9 @@ +# Copyright (c) 2026 Huawei Technologies Co., Ltd. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. + +pto_tilelang_cube_st(tmatmul PTO_LEVEL level3) diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tmatmul/cases.py b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tmatmul/cases.py new file mode 100644 index 000000000..4d2dfb5a3 --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tmatmul/cases.py @@ -0,0 +1,32 @@ +# Copyright (c) 2026 Huawei Technologies Co., Ltd. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. 
+ +# coding=utf-8 + +"""Single source of truth for tmatmul ST test cases.""" + +import numpy as np + + +CASES = [ + { + "name": "f16_16x16x16", + "dtype": np.float16, + "shape_a": (16, 16), + "shape_b": (16, 16), + "shape_c": (16, 16), + "eps": 1e-2, + }, +] + +_SMOKE_CASE_NAMES = ['f16_16x16x16'] +_SMOKE_CASE_NAME_SET = set(_SMOKE_CASE_NAMES) +_missing = [name for name in _SMOKE_CASE_NAMES if name not in {case["name"] for case in CASES}] +if _missing: + raise RuntimeError("unknown smoke case(s): " + ", ".join(_missing)) +CASES = [case for case in CASES if case["name"] in _SMOKE_CASE_NAME_SET] diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tmatmul/compare.py b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tmatmul/compare.py new file mode 100644 index 000000000..0074a8142 --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tmatmul/compare.py @@ -0,0 +1,45 @@ +# Copyright (c) 2026 Huawei Technologies Co., Ltd. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. 
+ +# coding=utf-8 + +import os +import sys +import numpy as np + +from cases import CASES +from st_common import result_cmp, style_fail, style_pass + + +def main(): + case_filter = sys.argv[1] if len(sys.argv) > 1 else None + + all_passed = True + for case in CASES: + if case_filter is not None and case["name"] != case_filter: + continue + + case_dir = case["name"] + shape_c = case["shape_c"] + golden = np.fromfile(os.path.join(case_dir, "golden.bin"), dtype=np.float32).reshape(shape_c) + output = np.fromfile(os.path.join(case_dir, "output.bin"), dtype=np.float32).reshape(shape_c) + + ok = result_cmp(golden, output, case["eps"]) + if ok: + print(style_pass(f"[INFO] {case['name']}: compare passed")) + else: + print(style_fail(f"[ERROR] {case['name']}: compare failed")) + all_passed = False + + if not all_passed: + sys.exit(2) + print(style_pass("[INFO] all cases passed")) + + +if __name__ == "__main__": + main() diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tmatmul/gen_data.py b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tmatmul/gen_data.py new file mode 100644 index 000000000..6835cda62 --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tmatmul/gen_data.py @@ -0,0 +1,32 @@ +# Copyright (c) 2026 Huawei Technologies Co., Ltd. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. 
+ +# coding=utf-8 + +import numpy as np + +from cases import CASES +from st_common import setup_case_rng, save_case_data + + +for case in CASES: + setup_case_rng(case) + + shape_a = case["shape_a"] + shape_b = case["shape_b"] + dtype = case["dtype"] + + lhs = np.random.uniform(-1.0, 1.0, size=shape_a).astype(dtype) + rhs = np.random.uniform(-1.0, 1.0, size=shape_b).astype(dtype) + golden = np.matmul(lhs.astype(np.float32), rhs.astype(np.float32)).astype(np.float32) + + save_case_data(case["name"], {"input1": lhs, "input2": rhs, "golden": golden}) + print( + f"[INFO] gen_data: {case['name']} " + f"lhs={shape_a} rhs={shape_b} out={case['shape_c']} dtype={dtype.__name__}" + ) diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tmatmul/launch.cpp b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tmatmul/launch.cpp new file mode 100644 index 000000000..ac4b3c48a --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tmatmul/launch.cpp @@ -0,0 +1,19 @@ +// Copyright (c) 2026 Huawei Technologies Co., Ltd. +// This program is free software, you can redistribute it and/or modify it under the terms and conditions of +// CANN Open Software License Agreement Version 2.0 (the "License"). +// Please refer to the License for details. You may not use this file except in compliance with the License. +// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +// See LICENSE in the root of the software repository for the full text of the License. 
+ +#include + +#ifndef AICORE +#define AICORE [aicore] +#endif + +extern "C" __global__ AICORE void TMATMUL_f16_16x16x16(__gm__ uint16_t *a, __gm__ uint16_t *b, __gm__ float *c); + +void LaunchTMATMUL_f16_16x16x16(uint16_t *a, uint16_t *b, float *c, void *stream) { + TMATMUL_f16_16x16x16<<<1, nullptr, stream>>>((__gm__ uint16_t *)a, (__gm__ uint16_t *)b, (__gm__ float *)c); +} diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tmatmul/main.cpp b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tmatmul/main.cpp new file mode 100644 index 000000000..50a57733b --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tmatmul/main.cpp @@ -0,0 +1,158 @@ +// Copyright (c) 2026 Huawei Technologies Co., Ltd. +// This program is free software, you can redistribute it and/or modify it under the terms and conditions of +// CANN Open Software License Agreement Version 2.0 (the "License"). +// Please refer to the License for details. You may not use this file except in compliance with the License. +// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +// See LICENSE in the root of the software repository for the full text of the License. 
+ +#include "acl/acl.h" +#include "test_common.h" +#include +#include +#include +#include +#include + +using namespace PtoTestCommon; + +void LaunchTMATMUL_f16_16x16x16(uint16_t *a, uint16_t *b, float *c, void *stream); + +using LaunchFn = void (*)(uint16_t *, uint16_t *, float *, void *); + +struct TestCase { + const char *name; + LaunchFn launch; + size_t lhsRows; + size_t lhsCols; + size_t rhsRows; + size_t rhsCols; + size_t outRows; + size_t outCols; +}; + +static const TestCase kCases[] = { +{"f16_16x16x16", LaunchTMATMUL_f16_16x16x16, 16, 16, 16, 16, 16, 16}, +}; +static constexpr size_t kNumCases = sizeof(kCases) / sizeof(kCases[0]); + +static int RunCase(const TestCase &tc, int deviceId, aclrtStream stream) { + (void)deviceId; + int rc = 0; + const size_t lhsElems = tc.lhsRows * tc.lhsCols; + const size_t rhsElems = tc.rhsRows * tc.rhsCols; + const size_t outElems = tc.outRows * tc.outCols; + const size_t lhsBytes = lhsElems * sizeof(uint16_t); + const size_t rhsBytes = rhsElems * sizeof(uint16_t); + const size_t outBytes = outElems * sizeof(float); + size_t lhsFileSize = lhsBytes; + size_t rhsFileSize = rhsBytes; + + std::printf( + "[INFO] === case: %s (lhs=%zux%zu, rhs=%zux%zu, out=%zux%zu) ===\n", + tc.name, + tc.lhsRows, + tc.lhsCols, + tc.rhsRows, + tc.rhsCols, + tc.outRows, + tc.outCols + ); + + std::string caseDir = std::string("./") + tc.name; + + void *lhsHost = nullptr; + void *rhsHost = nullptr; + void *outHost = nullptr; + void *lhsDevice = nullptr; + void *rhsDevice = nullptr; + void *outDevice = nullptr; + + aclrtMallocHost(&lhsHost, lhsBytes); + aclrtMallocHost(&rhsHost, rhsBytes); + aclrtMallocHost(&outHost, outBytes); + + aclrtMalloc(&lhsDevice, lhsBytes, ACL_MEM_MALLOC_HUGE_FIRST); + aclrtMalloc(&rhsDevice, rhsBytes, ACL_MEM_MALLOC_HUGE_FIRST); + aclrtMalloc(&outDevice, outBytes, ACL_MEM_MALLOC_HUGE_FIRST); + + if (!ReadFile((caseDir + "/input1.bin").c_str(), lhsFileSize, lhsHost, lhsBytes)) { + std::fprintf(stderr, "[ERROR] failed to read 
%s/input1.bin\n", caseDir.c_str()); + rc = 1; + } + if (rc == 0 && !ReadFile((caseDir + "/input2.bin").c_str(), rhsFileSize, rhsHost, rhsBytes)) { + std::fprintf(stderr, "[ERROR] failed to read %s/input2.bin\n", caseDir.c_str()); + rc = 1; + } + + if (rc == 0) { + aclrtMemcpy(lhsDevice, lhsBytes, lhsHost, lhsBytes, ACL_MEMCPY_HOST_TO_DEVICE); + aclrtMemcpy(rhsDevice, rhsBytes, rhsHost, rhsBytes, ACL_MEMCPY_HOST_TO_DEVICE); + + tc.launch( + static_cast(lhsDevice), + static_cast(rhsDevice), + static_cast(outDevice), + stream + ); + + aclrtSynchronizeStream(stream); + aclrtMemcpy(outHost, outBytes, outDevice, outBytes, ACL_MEMCPY_DEVICE_TO_HOST); + } + + if (rc == 0 && !WriteFile((caseDir + "/output.bin").c_str(), outHost, outBytes)) { + std::fprintf(stderr, "[ERROR] failed to write %s/output.bin\n", caseDir.c_str()); + rc = 1; + } + + if (lhsDevice != nullptr) + aclrtFree(lhsDevice); + if (rhsDevice != nullptr) + aclrtFree(rhsDevice); + if (outDevice != nullptr) + aclrtFree(outDevice); + if (lhsHost != nullptr) + aclrtFreeHost(lhsHost); + if (rhsHost != nullptr) + aclrtFreeHost(rhsHost); + if (outHost != nullptr) + aclrtFreeHost(outHost); + + if (rc == 0) + std::printf("[INFO] case %s done\n", tc.name); + return rc; +} + +int main(int argc, char *argv[]) { + const char *caseFilter = (argc > 1) ? 
argv[1] : nullptr; + + int rc = 0; + int deviceId = 0; + aclrtStream stream = nullptr; + + aclInit(nullptr); + if (const char *envDevice = std::getenv("ACL_DEVICE_ID")) { + deviceId = std::atoi(envDevice); + } + aclrtSetDevice(deviceId); + aclrtCreateStream(&stream); + + for (size_t i = 0; i < kNumCases; ++i) { + if (caseFilter != nullptr && std::strcmp(kCases[i].name, caseFilter) != 0) { + continue; + } + int ret = RunCase(kCases[i], deviceId, stream); + if (ret != 0) { + std::fprintf(stderr, "[ERROR] case %s failed\n", kCases[i].name); + rc = 1; + break; + } + } + + if (stream != nullptr) + aclrtDestroyStream(stream); + aclrtResetDevice(deviceId); + aclFinalize(); + + return rc; +} diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tmatmul/tmatmul.pto b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tmatmul/tmatmul.pto new file mode 100644 index 000000000..b688b745d --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tmatmul/tmatmul.pto @@ -0,0 +1,88 @@ +// Copyright (c) 2026 Huawei Technologies Co., Ltd. +// This program is free software, you can redistribute it and/or modify it under the terms and conditions of +// CANN Open Software License Agreement Version 2.0 (the "License"). +// Please refer to the License for details. You may not use this file except in compliance with the License. +// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +// See LICENSE in the root of the software repository for the full text of the License. + +// TileLang ST kernel for cube matmul. +// Keep pto.tmatmul on the TileOp expansion path while bridging the boundary +// ops through pto.tile_buf_addr on the level3/manual-address path. 
+ +module attributes {pto.target_arch = "a5", pto.kernel_kind = #pto.kernel_kind} { + func.func @TMATMUL_f16_16x16x16(%a_gm: !pto.ptr, %b_gm: !pto.ptr, %c_gm: !pto.ptr) attributes {pto.kernel} { + %c0_i64 = arith.constant 0 : i64 + %c1_i64 = arith.constant 1 : i64 + %c16_i64 = arith.constant 16 : i64 + %c32_i64 = arith.constant 32 : i64 + %c512_i64 = arith.constant 512 : i64 + %false = arith.constant false + + %l1_a_tile = pto.alloc_tile addr = %c0_i64 + : !pto.tile_buf + %l1_b_tile = pto.alloc_tile addr = %c512_i64 + : !pto.tile_buf + %l0a_tile = pto.alloc_tile addr = %c0_i64 + : !pto.tile_buf + %l0b_tile = pto.alloc_tile addr = %c0_i64 + : !pto.tile_buf + %l0c_tile = pto.alloc_tile addr = %c0_i64 + : !pto.tile_buf + + %l1_a = pto.tile_buf_addr %l1_a_tile + : !pto.tile_buf + -> !pto.ptr + %l1_b = pto.tile_buf_addr %l1_b_tile + : !pto.tile_buf + -> !pto.ptr + %l0a = pto.tile_buf_addr %l0a_tile + : !pto.tile_buf + -> !pto.ptr + %l0b = pto.tile_buf_addr %l0b_tile + : !pto.tile_buf + -> !pto.ptr + %l0c = pto.tile_buf_addr %l0c_tile + : !pto.tile_buf + -> !pto.ptr + pto.mte_gm_l1_frac %a_gm, %l1_a, nd2nz, + shape(%c16_i64, %c16_i64), + src_layout(%c32_i64), + dst_group(%c1_i64, %c1_i64, %c16_i64, %c0_i64), + ctrl(%c0_i64, %false) + : !pto.ptr, !pto.ptr, nd2nz, + shape i64, i64, src_layout(i64), + dst_group i64, i64, i64, i64, ctrl i64, i1 + pto.set_flag["PIPE_MTE2", "PIPE_MTE1", "EVENT_ID0"] + pto.wait_flag["PIPE_MTE2", "PIPE_MTE1", "EVENT_ID0"] + pto.mte_l1_l0a %l1_a, %l0a, %c16_i64, %c16_i64, %c0_i64, %c0_i64 + : !pto.ptr, !pto.ptr, i64, i64, i64, i64 + + pto.mte_gm_l1_frac %b_gm, %l1_b, nd2nz, + shape(%c16_i64, %c16_i64), + src_layout(%c32_i64), + dst_group(%c1_i64, %c1_i64, %c16_i64, %c0_i64), + ctrl(%c0_i64, %false) + : !pto.ptr, !pto.ptr, nd2nz, + shape i64, i64, src_layout(i64), + dst_group i64, i64, i64, i64, ctrl i64, i1 + pto.set_flag["PIPE_MTE2", "PIPE_MTE1", "EVENT_ID1"] + pto.wait_flag["PIPE_MTE2", "PIPE_MTE1", "EVENT_ID1"] + pto.mte_l1_l0b %l1_b, %l0b, 
%c16_i64, %c16_i64, %c0_i64, %c0_i64 {transpose = true} + : !pto.ptr, !pto.ptr, i64, i64, i64, i64 + + pto.set_flag["PIPE_MTE1", "PIPE_M", "EVENT_ID0"] + pto.wait_flag["PIPE_MTE1", "PIPE_M", "EVENT_ID0"] + pto.tmatmul ins(%l0a_tile, %l0b_tile : !pto.tile_buf, + !pto.tile_buf) + outs(%l0c_tile : !pto.tile_buf) + + pto.set_flag["PIPE_M", "PIPE_FIX", "EVENT_ID1"] + pto.wait_flag["PIPE_M", "PIPE_FIX", "EVENT_ID1"] + pto.mte_l0c_gm %l0c, %c_gm, %c16_i64, %c16_i64, %c16_i64, %c16_i64, %c0_i64, %c0_i64, + nz2nd + : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64, i64 + pto.barrier #pto.pipe + return + } +} diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tmax/CMakeLists.txt b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tmax/CMakeLists.txt new file mode 100644 index 000000000..6e0dd0872 --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tmax/CMakeLists.txt @@ -0,0 +1,9 @@ +# Copyright (c) 2026 Huawei Technologies Co., Ltd. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. + +pto_tilelang_vec_st(tmax) diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tmax/cases.py b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tmax/cases.py new file mode 100644 index 000000000..52120b579 --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tmax/cases.py @@ -0,0 +1,47 @@ +# Copyright (c) 2026 Huawei Technologies Co., Ltd. 
+# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. + +# coding=utf-8 + +"""Single source of truth for tmax ST test cases. + +Each case defines: + - name: case identifier, used as subdirectory name and by main.cpp kCases[]. + - dtype: numpy dtype (e.g. np.float32). + - shape: (rows, cols) — allocated tile dimensions. + - valid_shape: (valid_rows, valid_cols) — effective computation region. + - eps: tolerance for numpy.allclose (atol and rtol). + +gen_data.py and compare.py both import this list to avoid redundant definitions. 
+""" + +import numpy as np + +CASES = [ + { + "name": "f32_16x64", + "dtype": np.float32, + "shape": (16, 64), + "valid_shape": (16, 64), + "eps": 1e-6, + }, + { + "name": "f32_32x32", + "dtype": np.float32, + "shape": (32, 32), + "valid_shape": (32, 32), + "eps": 1e-6, + }, +] + +_SMOKE_CASE_NAMES = ['f32_16x64', 'f32_32x32'] +_SMOKE_CASE_NAME_SET = set(_SMOKE_CASE_NAMES) +_missing = [name for name in _SMOKE_CASE_NAMES if name not in {case["name"] for case in CASES}] +if _missing: + raise RuntimeError("unknown smoke case(s): " + ", ".join(_missing)) +CASES = [case for case in CASES if case["name"] in _SMOKE_CASE_NAME_SET] diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tmax/compare.py b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tmax/compare.py new file mode 100644 index 000000000..6a4d5d1aa --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tmax/compare.py @@ -0,0 +1,48 @@ +# Copyright (c) 2026 Huawei Technologies Co., Ltd. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. 
+ +# coding=utf-8 + +import os +import sys +import numpy as np + +from cases import CASES +from st_common import result_cmp, style_fail, style_pass, validate_cases + + +def main(): + validate_cases(CASES) + case_filter = sys.argv[1] if len(sys.argv) > 1 else None + + all_passed = True + for case in CASES: + if case_filter is not None and case["name"] != case_filter: + continue + + case_dir = case["name"] + shape = case["shape"] + vr, vc = case["valid_shape"] + + golden = np.fromfile(os.path.join(case_dir, "golden.bin"), dtype=case["dtype"]).reshape(shape) + output = np.fromfile(os.path.join(case_dir, "output.bin"), dtype=case["dtype"]).reshape(shape) + + ok = result_cmp(golden[:vr, :vc], output[:vr, :vc], case["eps"]) + if ok: + print(style_pass(f"[INFO] {case['name']}: compare passed")) + else: + print(style_fail(f"[ERROR] {case['name']}: compare failed")) + all_passed = False + + if not all_passed: + sys.exit(2) + print(style_pass("[INFO] all cases passed")) + + +if __name__ == "__main__": + main() diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tmax/gen_data.py b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tmax/gen_data.py new file mode 100644 index 000000000..de80ac271 --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tmax/gen_data.py @@ -0,0 +1,32 @@ +# Copyright (c) 2026 Huawei Technologies Co., Ltd. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. 
+ +# coding=utf-8 + +import numpy as np +from cases import CASES +from st_common import validate_cases, setup_case_rng, save_case_data + +validate_cases(CASES) + +for case in CASES: + setup_case_rng(case) + + dtype = case["dtype"] + shape = case["shape"] + valid_shape = case["valid_shape"] + + input1 = np.random.randint(1, 10, size=shape).astype(dtype) + input2 = np.random.randint(1, 10, size=shape).astype(dtype) + + golden = np.zeros(shape, dtype=dtype) + vr, vc = valid_shape + golden[:vr, :vc] = np.maximum(input1[:vr, :vc], input2[:vr, :vc]).astype(dtype, copy=False) + + save_case_data(case["name"], {"input1": input1, "input2": input2, "golden": golden}) + print(f"[INFO] gen_data: {case['name']} shape={shape} valid_shape={valid_shape} dtype={dtype.__name__}") diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tmax/launch.cpp b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tmax/launch.cpp new file mode 100644 index 000000000..5e7dca908 --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tmax/launch.cpp @@ -0,0 +1,26 @@ +// Copyright (c) 2026 Huawei Technologies Co., Ltd. +// This program is free software, you can redistribute it and/or modify it under the terms and conditions of +// CANN Open Software License Agreement Version 2.0 (the "License"). +// Please refer to the License for details. You may not use this file except in compliance with the License. +// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +// See LICENSE in the root of the software repository for the full text of the License. 
+ +#include + +#ifndef AICORE +#define AICORE [aicore] +#endif + +// Case 0: f32 16x64 + +extern "C" __global__ AICORE void TMAX_f32_16x64(__gm__ float *a, __gm__ float *b, __gm__ float *c); +extern "C" __global__ AICORE void TMAX_f32_32x32(__gm__ float *a, __gm__ float *b, __gm__ float *c); + +void LaunchTMAX_f32_32x32(float *a, float *b, float *c, void *stream) { + TMAX_f32_32x32<<<1, nullptr, stream>>>((__gm__ float *)a, (__gm__ float *)b, (__gm__ float *)c); +} + +void LaunchTMAX_f32_16x64(float *a, float *b, float *c, void *stream) { + TMAX_f32_16x64<<<1, nullptr, stream>>>((__gm__ float *)a, (__gm__ float *)b, (__gm__ float *)c); +} diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tmax/main.cpp b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tmax/main.cpp new file mode 100644 index 000000000..34e5a2311 --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tmax/main.cpp @@ -0,0 +1,145 @@ +// Copyright (c) 2026 Huawei Technologies Co., Ltd. +// This program is free software, you can redistribute it and/or modify it under the terms and conditions of +// CANN Open Software License Agreement Version 2.0 (the "License"). +// Please refer to the License for details. You may not use this file except in compliance with the License. +// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +// See LICENSE in the root of the software repository for the full text of the License. + +// Host driver for TileLang tmax ST — case-table driven. +// Each case launches a different kernel variant, reads/writes from per-case subdirectory. +// Numerical comparison is done externally by compare.py. 
+ +#include "acl/acl.h" +#include "test_common.h" +#include +#include +#include +#include +#include +#include + +using namespace PtoTestCommon; + +// Kernel launch wrappers (defined in launch.cpp) +void LaunchTMAX_f32_16x64(float *a, float *b, float *c, void *stream); +void LaunchTMAX_f32_32x32(float *a, float *b, float *c, void *stream); + +using LaunchFn = void (*)(float *, float *, float *, void *); + +struct TestCase { + const char *name; + LaunchFn launch; + size_t rows; // allocated tile rows + size_t cols; // allocated tile cols + size_t validRows; // effective computation rows (<= rows) + size_t validCols; // effective computation cols (<= cols) + size_t elemSize; // bytes per element +}; + +static const TestCase kCases[] = { +{"f32_16x64", LaunchTMAX_f32_16x64, 16, 64, 16, 64, sizeof(float)}, +{"f32_32x32", LaunchTMAX_f32_32x32, 32, 32, 32, 32, sizeof(float)}, +}; +static constexpr size_t kNumCases = sizeof(kCases) / sizeof(kCases[0]); + +static int RunCase(const TestCase &tc, int deviceId, aclrtStream stream) { + int rc = 0; + const size_t elemCount = tc.rows * tc.cols; + const size_t fileSize = elemCount * tc.elemSize; + + std::printf("[INFO] === case: %s (shape=%zux%zu, valid=%zux%zu) ===\n", + tc.name, tc.rows, tc.cols, tc.validRows, tc.validCols); + + // Per-case data directory + std::string caseDir = std::string("./") + tc.name; + size_t src0FileSize = fileSize; + size_t src1FileSize = fileSize; + + float *src0Host = nullptr, *src1Host = nullptr, *dstHost = nullptr; + float *src0Device = nullptr, *src1Device = nullptr, *dstDevice = nullptr; + + aclrtMallocHost((void **)(&src0Host), fileSize); + aclrtMallocHost((void **)(&src1Host), fileSize); + aclrtMallocHost((void **)(&dstHost), fileSize); + + aclrtMalloc((void **)&src0Device, fileSize, ACL_MEM_MALLOC_HUGE_FIRST); + aclrtMalloc((void **)&src1Device, fileSize, ACL_MEM_MALLOC_HUGE_FIRST); + aclrtMalloc((void **)&dstDevice, fileSize, ACL_MEM_MALLOC_HUGE_FIRST); + + if (!ReadFile((caseDir + 
"/input1.bin").c_str(), src0FileSize, src0Host, fileSize)) { + std::fprintf(stderr, "[ERROR] failed to read %s/input1.bin\n", caseDir.c_str()); + rc = 1; + } + if (rc == 0 && !ReadFile((caseDir + "/input2.bin").c_str(), src1FileSize, src1Host, fileSize)) { + std::fprintf(stderr, "[ERROR] failed to read %s/input2.bin\n", caseDir.c_str()); + rc = 1; + } + + if (rc == 0) { + aclrtMemcpy(src0Device, fileSize, src0Host, fileSize, ACL_MEMCPY_HOST_TO_DEVICE); + aclrtMemcpy(src1Device, fileSize, src1Host, fileSize, ACL_MEMCPY_HOST_TO_DEVICE); + + tc.launch(src0Device, src1Device, dstDevice, stream); + + aclrtSynchronizeStream(stream); + aclrtMemcpy(dstHost, fileSize, dstDevice, fileSize, ACL_MEMCPY_DEVICE_TO_HOST); + } + + if (rc == 0 && !WriteFile((caseDir + "/output.bin").c_str(), dstHost, fileSize)) { + std::fprintf(stderr, "[ERROR] failed to write %s/output.bin\n", caseDir.c_str()); + rc = 1; + } + + if (src0Device != nullptr) + aclrtFree(src0Device); + if (src1Device != nullptr) + aclrtFree(src1Device); + if (dstDevice != nullptr) + aclrtFree(dstDevice); + if (src0Host != nullptr) + aclrtFreeHost(src0Host); + if (src1Host != nullptr) + aclrtFreeHost(src1Host); + if (dstHost != nullptr) + aclrtFreeHost(dstHost); + + if (rc == 0) + std::printf("[INFO] case %s done\n", tc.name); + return rc; +} + +int main(int argc, char *argv[]) { + // Optional case filter: ./tmax [case_name] + const char *caseFilter = (argc > 1) ? 
argv[1] : nullptr; + + int rc = 0; + int deviceId = 0; + aclrtStream stream = nullptr; + + aclInit(nullptr); + if (const char *envDevice = std::getenv("ACL_DEVICE_ID")) { + deviceId = std::atoi(envDevice); + } + aclrtSetDevice(deviceId); + aclrtCreateStream(&stream); + + for (size_t i = 0; i < kNumCases; ++i) { + if (caseFilter != nullptr && std::strcmp(kCases[i].name, caseFilter) != 0) { + continue; + } + int ret = RunCase(kCases[i], deviceId, stream); + if (ret != 0) { + std::fprintf(stderr, "[ERROR] case %s failed\n", kCases[i].name); + rc = 1; + break; + } + } + + if (stream != nullptr) + aclrtDestroyStream(stream); + aclrtResetDevice(deviceId); + aclFinalize(); + + return rc; +} diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tmax/tmax.pto b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tmax/tmax.pto new file mode 100644 index 000000000..4aaa24761 --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tmax/tmax.pto @@ -0,0 +1,124 @@ +// Copyright (c) 2026 Huawei Technologies Co., Ltd. +// This program is free software, you can redistribute it and/or modify it under the terms and conditions of +// CANN Open Software License Agreement Version 2.0 (the "License"). +// Please refer to the License for details. You may not use this file except in compliance with the License. +// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +// See LICENSE in the root of the software repository for the full text of the License. + +// TileLang ST kernels for pto.tmax: tload(a) + tload(b) + tmax(a,b)->c + tstore(c). +// Multiple cases with different shapes in a single module. +// Compiled by ptoas --enable-insert-sync --enable-tile-op-expand --pto-backend=vpto +// to produce a fatobj object. 
+ +module attributes {pto.target_arch = "a5", pto.kernel_kind = #pto.kernel_kind} { + // Case 0: f32 16x64 (1024 elements) + func.func @TMAX_f32_16x64(%a_ptr: !pto.ptr, %b_ptr: !pto.ptr, %c_ptr: !pto.ptr) attributes {pto.kernel} { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c16 = arith.constant 16 : index + %c64 = arith.constant 64 : index + %c1024 = arith.constant 1024 : index + + %a_view = pto.make_tensor_view %a_ptr, + shape = [%c1, %c1, %c1, %c16, %c64], + strides = [%c1024, %c1024, %c1024, %c64, %c1] + : !pto.tensor_view<1x1x1x16x64xf32> + %b_view = pto.make_tensor_view %b_ptr, + shape = [%c1, %c1, %c1, %c16, %c64], + strides = [%c1024, %c1024, %c1024, %c64, %c1] + : !pto.tensor_view<1x1x1x16x64xf32> + %c_view = pto.make_tensor_view %c_ptr, + shape = [%c1, %c1, %c1, %c16, %c64], + strides = [%c1024, %c1024, %c1024, %c64, %c1] + : !pto.tensor_view<1x1x1x16x64xf32> + + %a_part = pto.partition_view %a_view, + offsets = [%c0, %c0, %c0, %c0, %c0], + sizes = [%c1, %c1, %c1, %c16, %c64] + : !pto.tensor_view<1x1x1x16x64xf32> -> !pto.partition_tensor_view<1x1x1x16x64xf32> + %b_part = pto.partition_view %b_view, + offsets = [%c0, %c0, %c0, %c0, %c0], + sizes = [%c1, %c1, %c1, %c16, %c64] + : !pto.tensor_view<1x1x1x16x64xf32> -> !pto.partition_tensor_view<1x1x1x16x64xf32> + %c_part = pto.partition_view %c_view, + offsets = [%c0, %c0, %c0, %c0, %c0], + sizes = [%c1, %c1, %c1, %c16, %c64] + : !pto.tensor_view<1x1x1x16x64xf32> -> !pto.partition_tensor_view<1x1x1x16x64xf32> + + %a = pto.alloc_tile + : !pto.tile_buf + %b = pto.alloc_tile + : !pto.tile_buf + %c = pto.alloc_tile + : !pto.tile_buf + + pto.tload ins(%a_part : !pto.partition_tensor_view<1x1x1x16x64xf32>) + outs(%a : !pto.tile_buf) + pto.tload ins(%b_part : !pto.partition_tensor_view<1x1x1x16x64xf32>) + outs(%b : !pto.tile_buf) + + pto.tmax ins(%a, %b : !pto.tile_buf, + !pto.tile_buf) + outs(%c : !pto.tile_buf) + + pto.tstore ins(%c : !pto.tile_buf) + outs(%c_part : 
!pto.partition_tensor_view<1x1x1x16x64xf32>) + return + } + + // Case 1: f32 32x32 (1024 elements) + + func.func @TMAX_f32_32x32(%a_ptr: !pto.ptr, %b_ptr: !pto.ptr, %c_ptr: !pto.ptr) attributes {pto.kernel} { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c32 = arith.constant 32 : index + %c1024 = arith.constant 1024 : index + + %a_view = pto.make_tensor_view %a_ptr, + shape = [%c1, %c1, %c1, %c32, %c32], + strides = [%c1024, %c1024, %c1024, %c32, %c1] + : !pto.tensor_view<1x1x1x32x32xf32> + %b_view = pto.make_tensor_view %b_ptr, + shape = [%c1, %c1, %c1, %c32, %c32], + strides = [%c1024, %c1024, %c1024, %c32, %c1] + : !pto.tensor_view<1x1x1x32x32xf32> + %c_view = pto.make_tensor_view %c_ptr, + shape = [%c1, %c1, %c1, %c32, %c32], + strides = [%c1024, %c1024, %c1024, %c32, %c1] + : !pto.tensor_view<1x1x1x32x32xf32> + + %a_part = pto.partition_view %a_view, + offsets = [%c0, %c0, %c0, %c0, %c0], + sizes = [%c1, %c1, %c1, %c32, %c32] + : !pto.tensor_view<1x1x1x32x32xf32> -> !pto.partition_tensor_view<1x1x1x32x32xf32> + %b_part = pto.partition_view %b_view, + offsets = [%c0, %c0, %c0, %c0, %c0], + sizes = [%c1, %c1, %c1, %c32, %c32] + : !pto.tensor_view<1x1x1x32x32xf32> -> !pto.partition_tensor_view<1x1x1x32x32xf32> + %c_part = pto.partition_view %c_view, + offsets = [%c0, %c0, %c0, %c0, %c0], + sizes = [%c1, %c1, %c1, %c32, %c32] + : !pto.tensor_view<1x1x1x32x32xf32> -> !pto.partition_tensor_view<1x1x1x32x32xf32> + + %a = pto.alloc_tile + : !pto.tile_buf + %b = pto.alloc_tile + : !pto.tile_buf + %c = pto.alloc_tile + : !pto.tile_buf + + pto.tload ins(%a_part : !pto.partition_tensor_view<1x1x1x32x32xf32>) + outs(%a : !pto.tile_buf) + pto.tload ins(%b_part : !pto.partition_tensor_view<1x1x1x32x32xf32>) + outs(%b : !pto.tile_buf) + + pto.tmax ins(%a, %b : !pto.tile_buf, + !pto.tile_buf) + outs(%c : !pto.tile_buf) + + pto.tstore ins(%c : !pto.tile_buf) + outs(%c_part : !pto.partition_tensor_view<1x1x1x32x32xf32>) + return + } +} diff --git 
a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tmaxs/CMakeLists.txt b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tmaxs/CMakeLists.txt new file mode 100644 index 000000000..a540c4c13 --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tmaxs/CMakeLists.txt @@ -0,0 +1,9 @@ +# Copyright (c) 2026 Huawei Technologies Co., Ltd. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. + +pto_tilelang_vec_st(tmaxs) diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tmaxs/cases.py b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tmaxs/cases.py new file mode 100644 index 000000000..036dc8bdb --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tmaxs/cases.py @@ -0,0 +1,29 @@ +# Copyright (c) 2026 Huawei Technologies Co., Ltd. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. 
+ +# coding=utf-8 + +"""Single source of truth for tmaxs ST test cases.""" + +import numpy as np + +CASES = [ + {"name": "f32_32x64", "dtype": np.float32, "shape": (32, 64), "valid_shape": (32, 64), "eps": 1e-6}, + {"name": "f16_63x64", "dtype": np.float16, "shape": (63, 64), "valid_shape": (63, 64), "eps": 1e-3}, + {"name": "i32_31x128", "dtype": np.int32, "shape": (31, 128), "valid_shape": (31, 128), "eps": 0}, + {"name": "i16_15x192", "dtype": np.int16, "shape": (15, 192), "valid_shape": (15, 192), "eps": 0}, + {"name": "f32_7x448", "dtype": np.float32, "shape": (7, 448), "valid_shape": (7, 448), "eps": 1e-6}, + {"name": "f32_256x16", "dtype": np.float32, "shape": (256, 16), "valid_shape": (256, 16), "eps": 1e-6}, +] + +_SMOKE_CASE_NAMES = ['f32_32x64', 'i16_15x192'] +_SMOKE_CASE_NAME_SET = set(_SMOKE_CASE_NAMES) +_missing = [name for name in _SMOKE_CASE_NAMES if name not in {case["name"] for case in CASES}] +if _missing: + raise RuntimeError("unknown smoke case(s): " + ", ".join(_missing)) +CASES = [case for case in CASES if case["name"] in _SMOKE_CASE_NAME_SET] diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tmaxs/compare.py b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tmaxs/compare.py new file mode 100644 index 000000000..50186777e --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tmaxs/compare.py @@ -0,0 +1,46 @@ +# Copyright (c) 2026 Huawei Technologies Co., Ltd. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. 
+# See LICENSE in the root of the software repository for the full text of the License. + +# coding=utf-8 + +import os +import sys +import numpy as np + +from cases import CASES +from st_common import result_cmp, style_fail, style_pass, validate_cases + +def main(): + validate_cases(CASES) + case_filter = sys.argv[1] if len(sys.argv) > 1 else None + + all_passed = True + for case in CASES: + if case_filter is not None and case["name"] != case_filter: + continue + + case_dir = case["name"] + shape = case["shape"] + vr, vc = case["valid_shape"] + + golden = np.fromfile(os.path.join(case_dir, "golden.bin"), dtype=case["dtype"]).reshape(shape) + output = np.fromfile(os.path.join(case_dir, "output.bin"), dtype=case["dtype"]).reshape(shape) + + ok = result_cmp(golden[:vr, :vc], output[:vr, :vc], case["eps"]) + if ok: + print(style_pass(f"[INFO] {case['name']}: compare passed")) + else: + print(style_fail(f"[ERROR] {case['name']}: compare failed")) + all_passed = False + + if not all_passed: + sys.exit(2) + print(style_pass("[INFO] all cases passed")) + +if __name__ == "__main__": + main() diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tmaxs/gen_data.py b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tmaxs/gen_data.py new file mode 100644 index 000000000..10520c68b --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tmaxs/gen_data.py @@ -0,0 +1,35 @@ +# Copyright (c) 2026 Huawei Technologies Co., Ltd. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. 
+# See LICENSE in the root of the software repository for the full text of the License. + +# coding=utf-8 + +import numpy as np +from cases import CASES +from st_common import validate_cases, setup_case_rng, save_case_data + +# Scalar value used for element-wise maximum (matches the scalar passed in launch.cpp) +SCALAR = 5.0 + +validate_cases(CASES) + +for case in CASES: + setup_case_rng(case) + + dtype = case["dtype"] + shape = case["shape"] + valid_shape = case["valid_shape"] + + input1 = np.random.randint(1, 10, size=shape).astype(dtype) + + golden = np.zeros(shape, dtype=dtype) + vr, vc = valid_shape + scalar_val = dtype(SCALAR) + golden[:vr, :vc] = np.maximum(input1[:vr, :vc], scalar_val).astype(dtype, copy=False) + + save_case_data(case["name"], {"input1": input1, "golden": golden}) + print(f"[INFO] gen_data: {case['name']} shape={shape} valid_shape={valid_shape} dtype={dtype.__name__} scalar={SCALAR}") diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tmaxs/launch.cpp b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tmaxs/launch.cpp new file mode 100644 index 000000000..5445ed673 --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tmaxs/launch.cpp @@ -0,0 +1,31 @@ +// Copyright (c) 2026 Huawei Technologies Co., Ltd. +// This program is free software, you can redistribute it and/or modify it under the terms and conditions of +// CANN Open Software License Agreement Version 2.0 (the "License"). +// Please refer to the License for details. You may not use this file except in compliance with the License. +// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +// See LICENSE in the root of the software repository for the full text of the License. 
+ +#include + +#ifndef AICORE +#define AICORE [aicore] +#endif + +// Scalar value used for element-wise maximum (must match gen_data.py SCALAR) +static constexpr float TMAXS_SCALAR_F32 = 5.0f; + +// Case 0: f32 32x64 + +extern "C" __global__ AICORE void TMAXS_f32_32x64(__gm__ float *src, __gm__ float *dst, float scalar); +extern "C" __global__ AICORE void TMAXS_i16_15x192(__gm__ int16_t *src, __gm__ int16_t *dst, int16_t scalar); + +void LaunchTMAXS_f32_32x64(float *src, float *dst, void *stream) { + TMAXS_f32_32x64<<<1, nullptr, stream>>>((__gm__ float *)src, (__gm__ float *)dst, TMAXS_SCALAR_F32); +} + + + +void LaunchTMAXS_i16_15x192(int16_t *src, int16_t *dst, void *stream) { + TMAXS_i16_15x192<<<1, nullptr, stream>>>((__gm__ int16_t *)src, (__gm__ int16_t *)dst, (int16_t)5); +} diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tmaxs/main.cpp b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tmaxs/main.cpp new file mode 100644 index 000000000..8fff55c75 --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tmaxs/main.cpp @@ -0,0 +1,133 @@ +// Copyright (c) 2026 Huawei Technologies Co., Ltd. +// This program is free software, you can redistribute it and/or modify it under the terms and conditions of +// CANN Open Software License Agreement Version 2.0 (the "License"). +// Please refer to the License for details. You may not use this file except in compliance with the License. +// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +// See LICENSE in the root of the software repository for the full text of the License. + +// Host driver for TileLang tmaxs ST — case-table driven. +// tmaxs: dst = max(src, scalar) (single input + scalar). +// Numerical comparison is done externally by compare.py. 
+ +#include "acl/acl.h" +#include "test_common.h" +#include +#include +#include +#include +#include +#include + +using namespace PtoTestCommon; + +// Kernel launch wrappers (defined in launch.cpp) +void LaunchTMAXS_f32_32x64(float *src, float *dst, void *stream); +void LaunchTMAXS_f16_63x64(uint16_t *src, uint16_t *dst, void *stream); +void LaunchTMAXS_i16_15x192(int16_t *src, int16_t *dst, void *stream); +void LaunchTMAXS_f32_256x16(float *src, float *dst, void *stream); + +struct TestCase { + const char *name; + void (*launch)(void *, void *, void *); // src, dst, stream + size_t rows; // allocated tile rows + size_t cols; // allocated tile cols + size_t validRows; // effective computation rows (<= rows) + size_t validCols; // effective computation cols (<= cols) + size_t elemSize; // bytes per element +}; + +static const TestCase kCases[] = { +{"f32_32x64", (void (*)(void*,void*,void*))LaunchTMAXS_f32_32x64, 32, 64, 32, 64, sizeof(float)}, +{"i16_15x192", (void (*)(void*,void*,void*))LaunchTMAXS_i16_15x192, 15, 192, 15, 192, sizeof(int16_t)}, +}; +static constexpr size_t kNumCases = sizeof(kCases) / sizeof(kCases[0]); + +static int RunCase(const TestCase &tc, int deviceId, aclrtStream stream) { + int rc = 0; + const size_t elemCount = tc.rows * tc.cols; + const size_t fileSize = elemCount * tc.elemSize; + + std::printf("[INFO] === case: %s (shape=%zux%zu, valid=%zux%zu) ===\n", + tc.name, tc.rows, tc.cols, tc.validRows, tc.validCols); + + // Per-case data directory + std::string caseDir = std::string("./") + tc.name; + size_t srcFileSize = fileSize; + + void *srcHost = nullptr, *dstHost = nullptr; + void *srcDevice = nullptr, *dstDevice = nullptr; + + aclrtMallocHost(&srcHost, fileSize); + aclrtMallocHost(&dstHost, fileSize); + + aclrtMalloc(&srcDevice, fileSize, ACL_MEM_MALLOC_HUGE_FIRST); + aclrtMalloc(&dstDevice, fileSize, ACL_MEM_MALLOC_HUGE_FIRST); + + if (!ReadFile((caseDir + "/input1.bin").c_str(), srcFileSize, srcHost, fileSize)) { + std::fprintf(stderr, 
"[ERROR] failed to read %s/input1.bin\n", caseDir.c_str()); + rc = 1; + } + + if (rc == 0) { + aclrtMemcpy(srcDevice, fileSize, srcHost, fileSize, ACL_MEMCPY_HOST_TO_DEVICE); + + tc.launch(srcDevice, dstDevice, stream); + + aclrtSynchronizeStream(stream); + aclrtMemcpy(dstHost, fileSize, dstDevice, fileSize, ACL_MEMCPY_DEVICE_TO_HOST); + } + + if (rc == 0 && !WriteFile((caseDir + "/output.bin").c_str(), dstHost, fileSize)) { + std::fprintf(stderr, "[ERROR] failed to write %s/output.bin\n", caseDir.c_str()); + rc = 1; + } + + if (srcDevice != nullptr) + aclrtFree(srcDevice); + if (dstDevice != nullptr) + aclrtFree(dstDevice); + if (srcHost != nullptr) + aclrtFreeHost(srcHost); + if (dstHost != nullptr) + aclrtFreeHost(dstHost); + + if (rc == 0) + std::printf("[INFO] case %s done\n", tc.name); + return rc; +} + +int main(int argc, char *argv[]) { + // Optional case filter: ./tmaxs [case_name] + const char *caseFilter = (argc > 1) ? argv[1] : nullptr; + + int rc = 0; + int deviceId = 0; + aclrtStream stream = nullptr; + + aclInit(nullptr); + if (const char *envDevice = std::getenv("ACL_DEVICE_ID")) { + deviceId = std::atoi(envDevice); + } + aclrtSetDevice(deviceId); + aclrtCreateStream(&stream); + + for (size_t i = 0; i < kNumCases; ++i) { + if (caseFilter != nullptr && std::strcmp(kCases[i].name, caseFilter) != 0) { + continue; + } + int ret = RunCase(kCases[i], deviceId, stream); + if (ret != 0) { + std::fprintf(stderr, "[ERROR] case %s failed\n", kCases[i].name); + rc = 1; + break; + } + } + + if (stream != nullptr) + aclrtDestroyStream(stream); + aclrtResetDevice(deviceId); + aclFinalize(); + + return rc; +} diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tmaxs/tmaxs.pto b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tmaxs/tmaxs.pto new file mode 100644 index 000000000..c20df4669 --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tmaxs/tmaxs.pto @@ -0,0 +1,98 @@ +// Copyright (c) 2026 Huawei Technologies Co., Ltd. 
+// This program is free software, you can redistribute it and/or modify it under the terms and conditions of +// CANN Open Software License Agreement Version 2.0 (the "License"). +// Please refer to the License for details. You may not use this file except in compliance with the License. +// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +// See LICENSE in the root of the software repository for the full text of the License. + +// TileLang ST kernels for pto.tmaxs: tload(src) + tmaxs(src, scalar)->dst + tstore(dst). +// Multiple cases with different shapes/dtypes in a single module. +// Compiled by ptoas --enable-insert-sync --enable-tile-op-expand --pto-backend=vpto +// to produce a fatobj object. + +module attributes {pto.target_arch = "a5", pto.kernel_kind = #pto.kernel_kind} { + + // Case 0: f32 32x64 (2048 elements) + func.func @TMAXS_f32_32x64(%src_ptr: !pto.ptr, %dst_ptr: !pto.ptr, %scalar: f32) attributes {pto.kernel} { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c32 = arith.constant 32 : index + %c64 = arith.constant 64 : index + %c2048 = arith.constant 2048 : index + + %src_view = pto.make_tensor_view %src_ptr, + shape = [%c1, %c1, %c1, %c32, %c64], + strides = [%c2048, %c2048, %c2048, %c64, %c1] + : !pto.tensor_view<1x1x1x32x64xf32> + %dst_view = pto.make_tensor_view %dst_ptr, + shape = [%c1, %c1, %c1, %c32, %c64], + strides = [%c2048, %c2048, %c2048, %c64, %c1] + : !pto.tensor_view<1x1x1x32x64xf32> + + %src_part = pto.partition_view %src_view, + offsets = [%c0, %c0, %c0, %c0, %c0], + sizes = [%c1, %c1, %c1, %c32, %c64] + : !pto.tensor_view<1x1x1x32x64xf32> -> !pto.partition_tensor_view<1x1x1x32x64xf32> + %dst_part = pto.partition_view %dst_view, + offsets = [%c0, %c0, %c0, %c0, %c0], + sizes = [%c1, %c1, %c1, %c32, %c64] + : !pto.tensor_view<1x1x1x32x64xf32> -> 
!pto.partition_tensor_view<1x1x1x32x64xf32> + + %src = pto.alloc_tile + : !pto.tile_buf + %dst = pto.alloc_tile + : !pto.tile_buf + + pto.tload ins(%src_part : !pto.partition_tensor_view<1x1x1x32x64xf32>) + outs(%src : !pto.tile_buf) + pto.tmaxs ins(%src, %scalar : !pto.tile_buf, f32) + outs(%dst : !pto.tile_buf) + pto.tstore ins(%dst : !pto.tile_buf) + outs(%dst_part : !pto.partition_tensor_view<1x1x1x32x64xf32>) + return + } + + // Case 1: i16 15x192 (2880 elements) + + func.func @TMAXS_i16_15x192(%src_ptr: !pto.ptr, %dst_ptr: !pto.ptr, %scalar: i16) attributes {pto.kernel} { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c15 = arith.constant 15 : index + %c192 = arith.constant 192 : index + %c2880 = arith.constant 2880 : index + + %src_view = pto.make_tensor_view %src_ptr, + shape = [%c1, %c1, %c1, %c15, %c192], + strides = [%c2880, %c2880, %c2880, %c192, %c1] + : !pto.tensor_view<1x1x1x15x192xi16> + %dst_view = pto.make_tensor_view %dst_ptr, + shape = [%c1, %c1, %c1, %c15, %c192], + strides = [%c2880, %c2880, %c2880, %c192, %c1] + : !pto.tensor_view<1x1x1x15x192xi16> + + %src_part = pto.partition_view %src_view, + offsets = [%c0, %c0, %c0, %c0, %c0], + sizes = [%c1, %c1, %c1, %c15, %c192] + : !pto.tensor_view<1x1x1x15x192xi16> -> !pto.partition_tensor_view<1x1x1x15x192xi16> + %dst_part = pto.partition_view %dst_view, + offsets = [%c0, %c0, %c0, %c0, %c0], + sizes = [%c1, %c1, %c1, %c15, %c192] + : !pto.tensor_view<1x1x1x15x192xi16> -> !pto.partition_tensor_view<1x1x1x15x192xi16> + + %src = pto.alloc_tile + : !pto.tile_buf + %dst = pto.alloc_tile + : !pto.tile_buf + + pto.tload ins(%src_part : !pto.partition_tensor_view<1x1x1x15x192xi16>) + outs(%src : !pto.tile_buf) + pto.tmaxs ins(%src, %scalar : !pto.tile_buf, i16) + outs(%dst : !pto.tile_buf) + pto.tstore ins(%dst : !pto.tile_buf) + outs(%dst_part : !pto.partition_tensor_view<1x1x1x15x192xi16>) + return + } + + // Case 4: f32 7x448 (3136 elements) +} diff --git 
a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tmin/CMakeLists.txt b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tmin/CMakeLists.txt new file mode 100644 index 000000000..92d8cd83a --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tmin/CMakeLists.txt @@ -0,0 +1,9 @@ +# Copyright (c) 2026 Huawei Technologies Co., Ltd. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. + +pto_tilelang_vec_st(tmin) diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tmin/cases.py b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tmin/cases.py new file mode 100644 index 000000000..bbad7d338 --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tmin/cases.py @@ -0,0 +1,96 @@ +# Copyright (c) 2026 Huawei Technologies Co., Ltd. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. + +# coding=utf-8 + +"""Single source of truth for tmin ST test cases. 
+ +Each case defines: + - name: case identifier, used as subdirectory name and by main.cpp kCases[]. + - dtype: numpy dtype (e.g. np.float32). + - shape: (rows, cols) — allocated tile dimensions. + - valid_shape: (valid_rows, valid_cols) — effective computation region. + - eps: tolerance for numpy.allclose (atol and rtol). + +gen_data.py and compare.py both import this list to avoid redundant definitions. +""" + +import numpy as np + +CASES = [ + { + "name": "f32_32x32", + "dtype": np.float32, + "shape": (32, 32), + "valid_shape": (32, 32), + "eps": 1e-6, + }, + { + "name": "f32_64x64", + "dtype": np.float32, + "shape": (64, 64), + "valid_shape": (64, 64), + "eps": 1e-6, + }, + { + "name": "i32_64x64", + "dtype": np.int32, + "shape": (64, 64), + "valid_shape": (64, 64), + "eps": 0, + }, + { + "name": "i16_64x64", + "dtype": np.int16, + "shape": (64, 64), + "valid_shape": (64, 64), + "eps": 0, + }, + { + "name": "f16_64x64", + "dtype": np.float16, + "shape": (64, 64), + "valid_shape": (64, 64), + "eps": 1e-3, + }, + { + "name": "f32_64x64_v60x60", + "dtype": np.float32, + "shape": (64, 64), + "valid_shape": (60, 60), + "eps": 1e-6, + }, + { + "name": "i32_64x64_v60x60", + "dtype": np.int32, + "shape": (64, 64), + "valid_shape": (60, 60), + "eps": 0, + }, + { + "name": "f16_2x4096_v1x3600", + "dtype": np.float16, + "shape": (2, 4096), + "valid_shape": (1, 3600), + "eps": 1e-3, + }, + { + "name": "i16_20x512_v16x200", + "dtype": np.int16, + "shape": (20, 512), + "valid_shape": (16, 200), + "eps": 0, + }, +] + +_SMOKE_CASE_NAMES = ['f32_32x32', 'i32_64x64_v60x60'] +_SMOKE_CASE_NAME_SET = set(_SMOKE_CASE_NAMES) +_missing = [name for name in _SMOKE_CASE_NAMES if name not in {case["name"] for case in CASES}] +if _missing: + raise RuntimeError("unknown smoke case(s): " + ", ".join(_missing)) +CASES = [case for case in CASES if case["name"] in _SMOKE_CASE_NAME_SET] diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tmin/compare.py 
b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tmin/compare.py new file mode 100644 index 000000000..6a4d5d1aa --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tmin/compare.py @@ -0,0 +1,48 @@ +# Copyright (c) 2026 Huawei Technologies Co., Ltd. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. + +# coding=utf-8 + +import os +import sys +import numpy as np + +from cases import CASES +from st_common import result_cmp, style_fail, style_pass, validate_cases + + +def main(): + validate_cases(CASES) + case_filter = sys.argv[1] if len(sys.argv) > 1 else None + + all_passed = True + for case in CASES: + if case_filter is not None and case["name"] != case_filter: + continue + + case_dir = case["name"] + shape = case["shape"] + vr, vc = case["valid_shape"] + + golden = np.fromfile(os.path.join(case_dir, "golden.bin"), dtype=case["dtype"]).reshape(shape) + output = np.fromfile(os.path.join(case_dir, "output.bin"), dtype=case["dtype"]).reshape(shape) + + ok = result_cmp(golden[:vr, :vc], output[:vr, :vc], case["eps"]) + if ok: + print(style_pass(f"[INFO] {case['name']}: compare passed")) + else: + print(style_fail(f"[ERROR] {case['name']}: compare failed")) + all_passed = False + + if not all_passed: + sys.exit(2) + print(style_pass("[INFO] all cases passed")) + + +if __name__ == "__main__": + main() diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tmin/gen_data.py 
b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tmin/gen_data.py new file mode 100644 index 000000000..d03def900 --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tmin/gen_data.py @@ -0,0 +1,32 @@ +# Copyright (c) 2026 Huawei Technologies Co., Ltd. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. + +# coding=utf-8 + +import numpy as np +from cases import CASES +from st_common import validate_cases, setup_case_rng, save_case_data + +validate_cases(CASES) + +for case in CASES: + setup_case_rng(case) + + dtype = case["dtype"] + shape = case["shape"] + valid_shape = case["valid_shape"] + + input1 = np.random.randint(1, 10, size=shape).astype(dtype) + input2 = np.random.randint(1, 10, size=shape).astype(dtype) + + golden = np.zeros(shape, dtype=dtype) + vr, vc = valid_shape + golden[:vr, :vc] = np.minimum(input1[:vr, :vc], input2[:vr, :vc]).astype(dtype, copy=False) + + save_case_data(case["name"], {"input1": input1, "input2": input2, "golden": golden}) + print(f"[INFO] gen_data: {case['name']} shape={shape} valid_shape={valid_shape} dtype={dtype.__name__}") diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tmin/launch.cpp b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tmin/launch.cpp new file mode 100644 index 000000000..55ee157a7 --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tmin/launch.cpp @@ -0,0 +1,28 @@ +// Copyright (c) 2026 Huawei Technologies Co., Ltd. 
+// This program is free software, you can redistribute it and/or modify it under the terms and conditions of +// CANN Open Software License Agreement Version 2.0 (the "License"). +// Please refer to the License for details. You may not use this file except in compliance with the License. +// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +// See LICENSE in the root of the software repository for the full text of the License. + +#include + +#ifndef AICORE +#define AICORE [aicore] +#endif + +// Smoke case: f32 32x32 + +extern "C" __global__ AICORE void TMIN_f32_32x32(__gm__ float *a, __gm__ float *b, __gm__ float *c); +extern "C" __global__ AICORE void TMIN_i32_64x64_v60x60(__gm__ int32_t *a, __gm__ int32_t *b, __gm__ int32_t *c); + +void LaunchTMIN_i32_64x64_v60x60(void *a, void *b, void *c, void *stream) { + TMIN_i32_64x64_v60x60<<<1, nullptr, stream>>>((__gm__ int32_t *)a, (__gm__ int32_t *)b, (__gm__ int32_t *)c); +} + + + +void LaunchTMIN_f32_32x32(void *a, void *b, void *c, void *stream) { + TMIN_f32_32x32<<<1, nullptr, stream>>>((__gm__ float *)a, (__gm__ float *)b, (__gm__ float *)c); +} diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tmin/main.cpp b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tmin/main.cpp new file mode 100644 index 000000000..34ea815f7 --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tmin/main.cpp @@ -0,0 +1,150 @@ +// Copyright (c) 2026 Huawei Technologies Co., Ltd. +// This program is free software, you can redistribute it and/or modify it under the terms and conditions of +// CANN Open Software License Agreement Version 2.0 (the "License"). +// Please refer to the License for details. You may not use this file except in compliance with the License. 
+// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +// See LICENSE in the root of the software repository for the full text of the License. + +// Host driver for TileLang tmin ST — case-table driven. +// Each case launches a different kernel variant, reads/writes from per-case subdirectory. +// Numerical comparison is done externally by compare.py. + +#include "acl/acl.h" +#include "test_common.h" +#include +#include +#include +#include +#include +#include + +using namespace PtoTestCommon; + +// Kernel launch wrappers (defined in launch.cpp) +void LaunchTMIN_f32_32x32(void *a, void *b, void *c, void *stream); +void LaunchTMIN_f32_64x64(void *a, void *b, void *c, void *stream); +void LaunchTMIN_i16_64x64(void *a, void *b, void *c, void *stream); +void LaunchTMIN_f32_64x64_v60x60(void *a, void *b, void *c, void *stream); +void LaunchTMIN_i32_64x64_v60x60(void *a, void *b, void *c, void *stream); +void LaunchTMIN_f16_2x4096_v1x3600(void *a, void *b, void *c, void *stream); + +using LaunchFn = void (*)(void *, void *, void *, void *); + +struct TestCase { + const char *name; + LaunchFn launch; + size_t rows; // allocated tile rows + size_t cols; // allocated tile cols + size_t validRows; // effective computation rows (<= rows) + size_t validCols; // effective computation cols (<= cols) + size_t elemSize; // bytes per element +}; + +static const TestCase kCases[] = { +{"f32_32x32", LaunchTMIN_f32_32x32, 32, 32, 32, 32, sizeof(float)}, +{"i32_64x64_v60x60", LaunchTMIN_i32_64x64_v60x60, 64, 64, 60, 60, sizeof(int32_t)}, +}; +static constexpr size_t kNumCases = sizeof(kCases) / sizeof(kCases[0]); + +static int RunCase(const TestCase &tc, int deviceId, aclrtStream stream) { + (void)deviceId; + int rc = 0; + const size_t elemCount = tc.rows * tc.cols; + const size_t fileSize = elemCount * tc.elemSize; + + 
std::printf("[INFO] === case: %s (shape=%zux%zu, valid=%zux%zu) ===\n", + tc.name, tc.rows, tc.cols, tc.validRows, tc.validCols); + + // Per-case data directory + std::string caseDir = std::string("./") + tc.name; + size_t src0FileSize = fileSize; + size_t src1FileSize = fileSize; + + void *src0Host = nullptr, *src1Host = nullptr, *dstHost = nullptr; + void *src0Device = nullptr, *src1Device = nullptr, *dstDevice = nullptr; + + aclrtMallocHost(&src0Host, fileSize); + aclrtMallocHost(&src1Host, fileSize); + aclrtMallocHost(&dstHost, fileSize); + + aclrtMalloc(&src0Device, fileSize, ACL_MEM_MALLOC_HUGE_FIRST); + aclrtMalloc(&src1Device, fileSize, ACL_MEM_MALLOC_HUGE_FIRST); + aclrtMalloc(&dstDevice, fileSize, ACL_MEM_MALLOC_HUGE_FIRST); + + if (!ReadFile((caseDir + "/input1.bin").c_str(), src0FileSize, src0Host, fileSize)) { + std::fprintf(stderr, "[ERROR] failed to read %s/input1.bin\n", caseDir.c_str()); + rc = 1; + } + if (rc == 0 && !ReadFile((caseDir + "/input2.bin").c_str(), src1FileSize, src1Host, fileSize)) { + std::fprintf(stderr, "[ERROR] failed to read %s/input2.bin\n", caseDir.c_str()); + rc = 1; + } + + if (rc == 0) { + aclrtMemcpy(src0Device, fileSize, src0Host, fileSize, ACL_MEMCPY_HOST_TO_DEVICE); + aclrtMemcpy(src1Device, fileSize, src1Host, fileSize, ACL_MEMCPY_HOST_TO_DEVICE); + + tc.launch(src0Device, src1Device, dstDevice, stream); + + aclrtSynchronizeStream(stream); + aclrtMemcpy(dstHost, fileSize, dstDevice, fileSize, ACL_MEMCPY_DEVICE_TO_HOST); + } + + if (rc == 0 && !WriteFile((caseDir + "/output.bin").c_str(), dstHost, fileSize)) { + std::fprintf(stderr, "[ERROR] failed to write %s/output.bin\n", caseDir.c_str()); + rc = 1; + } + + if (src0Device != nullptr) + aclrtFree(src0Device); + if (src1Device != nullptr) + aclrtFree(src1Device); + if (dstDevice != nullptr) + aclrtFree(dstDevice); + if (src0Host != nullptr) + aclrtFreeHost(src0Host); + if (src1Host != nullptr) + aclrtFreeHost(src1Host); + if (dstHost != nullptr) + 
aclrtFreeHost(dstHost); + + if (rc == 0) + std::printf("[INFO] case %s done\n", tc.name); + return rc; +} + +int main(int argc, char *argv[]) { + // Optional case filter: ./tmin [case_name] + const char *caseFilter = (argc > 1) ? argv[1] : nullptr; + + int rc = 0; + int deviceId = 0; + aclrtStream stream = nullptr; + + aclInit(nullptr); + if (const char *envDevice = std::getenv("ACL_DEVICE_ID")) { + deviceId = std::atoi(envDevice); + } + aclrtSetDevice(deviceId); + aclrtCreateStream(&stream); + + for (size_t i = 0; i < kNumCases; ++i) { + if (caseFilter != nullptr && std::strcmp(kCases[i].name, caseFilter) != 0) { + continue; + } + int ret = RunCase(kCases[i], deviceId, stream); + if (ret != 0) { + std::fprintf(stderr, "[ERROR] case %s failed\n", kCases[i].name); + rc = 1; + break; + } + } + + if (stream != nullptr) + aclrtDestroyStream(stream); + aclrtResetDevice(deviceId); + aclFinalize(); + + return rc; +} diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tmin/tmin.pto b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tmin/tmin.pto new file mode 100644 index 000000000..60049b782 --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tmin/tmin.pto @@ -0,0 +1,126 @@ +// Copyright (c) 2026 Huawei Technologies Co., Ltd. +// This program is free software, you can redistribute it and/or modify it under the terms and conditions of +// CANN Open Software License Agreement Version 2.0 (the "License"). +// Please refer to the License for details. You may not use this file except in compliance with the License. +// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +// See LICENSE in the root of the software repository for the full text of the License. + +// TileLang ST kernels for pto.tmin: tload(a) + tload(b) + tmin(a,b)->c + tstore(c). +// Multiple cases with different shapes in a single module. 
+// Compiled by ptoas --enable-insert-sync --enable-tile-op-expand --pto-backend=vpto +// to produce a fatobj object. + +module attributes {pto.target_arch = "a5", pto.kernel_kind = #pto.kernel_kind} { + // Smoke case: f32 32x32 (1024 elements) + func.func @TMIN_f32_32x32(%a_ptr: !pto.ptr, %b_ptr: !pto.ptr, %c_ptr: !pto.ptr) attributes {pto.kernel} { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c32 = arith.constant 32 : index + %c1024 = arith.constant 1024 : index + + %a_view = pto.make_tensor_view %a_ptr, + shape = [%c1, %c1, %c1, %c32, %c32], + strides = [%c1024, %c1024, %c1024, %c32, %c1] + : !pto.tensor_view<1x1x1x32x32xf32> + %b_view = pto.make_tensor_view %b_ptr, + shape = [%c1, %c1, %c1, %c32, %c32], + strides = [%c1024, %c1024, %c1024, %c32, %c1] + : !pto.tensor_view<1x1x1x32x32xf32> + %c_view = pto.make_tensor_view %c_ptr, + shape = [%c1, %c1, %c1, %c32, %c32], + strides = [%c1024, %c1024, %c1024, %c32, %c1] + : !pto.tensor_view<1x1x1x32x32xf32> + + %a_part = pto.partition_view %a_view, + offsets = [%c0, %c0, %c0, %c0, %c0], + sizes = [%c1, %c1, %c1, %c32, %c32] + : !pto.tensor_view<1x1x1x32x32xf32> -> !pto.partition_tensor_view<1x1x1x32x32xf32> + %b_part = pto.partition_view %b_view, + offsets = [%c0, %c0, %c0, %c0, %c0], + sizes = [%c1, %c1, %c1, %c32, %c32] + : !pto.tensor_view<1x1x1x32x32xf32> -> !pto.partition_tensor_view<1x1x1x32x32xf32> + %c_part = pto.partition_view %c_view, + offsets = [%c0, %c0, %c0, %c0, %c0], + sizes = [%c1, %c1, %c1, %c32, %c32] + : !pto.tensor_view<1x1x1x32x32xf32> -> !pto.partition_tensor_view<1x1x1x32x32xf32> + + %a = pto.alloc_tile + : !pto.tile_buf + %b = pto.alloc_tile + : !pto.tile_buf + %c = pto.alloc_tile + : !pto.tile_buf + + pto.tload ins(%a_part : !pto.partition_tensor_view<1x1x1x32x32xf32>) + outs(%a : !pto.tile_buf) + pto.tload ins(%b_part : !pto.partition_tensor_view<1x1x1x32x32xf32>) + outs(%b : !pto.tile_buf) + + pto.tmin ins(%a, %b : !pto.tile_buf, + !pto.tile_buf) + outs(%c : 
!pto.tile_buf) + + pto.tstore ins(%c : !pto.tile_buf) + outs(%c_part : !pto.partition_tensor_view<1x1x1x32x32xf32>) + return + } + + // Smoke case: i32 64x64 tile with 60x60 valid region + + func.func @TMIN_i32_64x64_v60x60(%a_ptr: !pto.ptr, %b_ptr: !pto.ptr, %c_ptr: !pto.ptr) attributes {pto.kernel} { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c60 = arith.constant 60 : index + %c64 = arith.constant 64 : index + %c3600 = arith.constant 3600 : index + + %a_view = pto.make_tensor_view %a_ptr, + shape = [%c1, %c1, %c1, %c60, %c60], + strides = [%c3600, %c3600, %c3600, %c64, %c1] + : !pto.tensor_view<1x1x1x60x60xi32> + %b_view = pto.make_tensor_view %b_ptr, + shape = [%c1, %c1, %c1, %c60, %c60], + strides = [%c3600, %c3600, %c3600, %c64, %c1] + : !pto.tensor_view<1x1x1x60x60xi32> + %c_view = pto.make_tensor_view %c_ptr, + shape = [%c1, %c1, %c1, %c60, %c60], + strides = [%c3600, %c3600, %c3600, %c64, %c1] + : !pto.tensor_view<1x1x1x60x60xi32> + + %a_part = pto.partition_view %a_view, + offsets = [%c0, %c0, %c0, %c0, %c0], + sizes = [%c1, %c1, %c1, %c60, %c60] + : !pto.tensor_view<1x1x1x60x60xi32> -> !pto.partition_tensor_view<1x1x1x60x60xi32> + %b_part = pto.partition_view %b_view, + offsets = [%c0, %c0, %c0, %c0, %c0], + sizes = [%c1, %c1, %c1, %c60, %c60] + : !pto.tensor_view<1x1x1x60x60xi32> -> !pto.partition_tensor_view<1x1x1x60x60xi32> + %c_part = pto.partition_view %c_view, + offsets = [%c0, %c0, %c0, %c0, %c0], + sizes = [%c1, %c1, %c1, %c60, %c60] + : !pto.tensor_view<1x1x1x60x60xi32> -> !pto.partition_tensor_view<1x1x1x60x60xi32> + + %a = pto.alloc_tile + : !pto.tile_buf + %b = pto.alloc_tile + : !pto.tile_buf + %c = pto.alloc_tile + : !pto.tile_buf + + pto.tload ins(%a_part : !pto.partition_tensor_view<1x1x1x60x60xi32>) + outs(%a : !pto.tile_buf) + pto.tload ins(%b_part : !pto.partition_tensor_view<1x1x1x60x60xi32>) + outs(%b : !pto.tile_buf) + + pto.tmin ins(%a, %b : !pto.tile_buf, + !pto.tile_buf) + outs(%c : !pto.tile_buf) + + pto.tstore ins(%c : 
!pto.tile_buf) + outs(%c_part : !pto.partition_tensor_view<1x1x1x60x60xi32>) + return + } + + // Case 6: f16 2x4096 tile with 1x3600 valid region (padding with MAX for tmin) +} diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tmins/CMakeLists.txt b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tmins/CMakeLists.txt new file mode 100644 index 000000000..038d4e327 --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tmins/CMakeLists.txt @@ -0,0 +1,9 @@ +# Copyright (c) 2026 Huawei Technologies Co., Ltd. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. + +pto_tilelang_vec_st(tmins) diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tmins/cases.py b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tmins/cases.py new file mode 100644 index 000000000..0ff22b84d --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tmins/cases.py @@ -0,0 +1,29 @@ +# Copyright (c) 2026 Huawei Technologies Co., Ltd. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. 
+# See LICENSE in the root of the software repository for the full text of the License. + +# coding=utf-8 + +"""Single source of truth for tmins ST test cases.""" + +import numpy as np + +CASES = [ + {"name": "f32_32x64", "dtype": np.float32, "shape": (32, 64), "valid_shape": (32, 64), "eps": 1e-6}, + {"name": "f16_63x64", "dtype": np.float16, "shape": (63, 64), "valid_shape": (63, 64), "eps": 1e-3}, + {"name": "i32_31x128", "dtype": np.int32, "shape": (31, 128), "valid_shape": (31, 128), "eps": 0}, + {"name": "i16_15x192", "dtype": np.int16, "shape": (15, 192), "valid_shape": (15, 192), "eps": 0}, + {"name": "f32_7x448", "dtype": np.float32, "shape": (7, 448), "valid_shape": (7, 448), "eps": 1e-6}, + {"name": "f32_256x16", "dtype": np.float32, "shape": (256, 16), "valid_shape": (256, 16), "eps": 1e-6}, +] + +_SMOKE_CASE_NAMES = ['f32_32x64', 'i16_15x192'] +_SMOKE_CASE_NAME_SET = set(_SMOKE_CASE_NAMES) +_missing = [name for name in _SMOKE_CASE_NAMES if name not in {case["name"] for case in CASES}] +if _missing: + raise RuntimeError("unknown smoke case(s): " + ", ".join(_missing)) +CASES = [case for case in CASES if case["name"] in _SMOKE_CASE_NAME_SET] diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tmins/compare.py b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tmins/compare.py new file mode 100644 index 000000000..50186777e --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tmins/compare.py @@ -0,0 +1,46 @@ +# Copyright (c) 2026 Huawei Technologies Co., Ltd. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. 
+# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. + +# coding=utf-8 + +import os +import sys +import numpy as np + +from cases import CASES +from st_common import result_cmp, style_fail, style_pass, validate_cases + +def main(): + validate_cases(CASES) + case_filter = sys.argv[1] if len(sys.argv) > 1 else None + + all_passed = True + for case in CASES: + if case_filter is not None and case["name"] != case_filter: + continue + + case_dir = case["name"] + shape = case["shape"] + vr, vc = case["valid_shape"] + + golden = np.fromfile(os.path.join(case_dir, "golden.bin"), dtype=case["dtype"]).reshape(shape) + output = np.fromfile(os.path.join(case_dir, "output.bin"), dtype=case["dtype"]).reshape(shape) + + ok = result_cmp(golden[:vr, :vc], output[:vr, :vc], case["eps"]) + if ok: + print(style_pass(f"[INFO] {case['name']}: compare passed")) + else: + print(style_fail(f"[ERROR] {case['name']}: compare failed")) + all_passed = False + + if not all_passed: + sys.exit(2) + print(style_pass("[INFO] all cases passed")) + +if __name__ == "__main__": + main() diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tmins/gen_data.py b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tmins/gen_data.py new file mode 100644 index 000000000..84da39655 --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tmins/gen_data.py @@ -0,0 +1,35 @@ +# Copyright (c) 2026 Huawei Technologies Co., Ltd. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. 
+# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. + +# coding=utf-8 + +import numpy as np +from cases import CASES +from st_common import validate_cases, setup_case_rng, save_case_data + +# Scalar value used for element-wise minimum (matches the scalar passed in launch.cpp) +SCALAR = 5.0 + +validate_cases(CASES) + +for case in CASES: + setup_case_rng(case) + + dtype = case["dtype"] + shape = case["shape"] + valid_shape = case["valid_shape"] + + input1 = np.random.randint(1, 10, size=shape).astype(dtype) + + golden = np.zeros(shape, dtype=dtype) + vr, vc = valid_shape + scalar_val = dtype(SCALAR) + golden[:vr, :vc] = np.minimum(input1[:vr, :vc], scalar_val).astype(dtype, copy=False) + + save_case_data(case["name"], {"input1": input1, "golden": golden}) + print(f"[INFO] gen_data: {case['name']} shape={shape} valid_shape={valid_shape} dtype={dtype.__name__} scalar={SCALAR}") diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tmins/launch.cpp b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tmins/launch.cpp new file mode 100644 index 000000000..e4ad1ad85 --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tmins/launch.cpp @@ -0,0 +1,31 @@ +// Copyright (c) 2026 Huawei Technologies Co., Ltd. +// This program is free software, you can redistribute it and/or modify it under the terms and conditions of +// CANN Open Software License Agreement Version 2.0 (the "License"). +// Please refer to the License for details. You may not use this file except in compliance with the License. +// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. 
+// See LICENSE in the root of the software repository for the full text of the License. + +#include + +#ifndef AICORE +#define AICORE [aicore] +#endif + +// Scalar value used for element-wise minimum (must match gen_data.py SCALAR) +static constexpr float TMINS_SCALAR_F32 = 5.0f; + +// Case 0: f32 32x64 + +extern "C" __global__ AICORE void TMINS_f32_32x64(__gm__ float *src, __gm__ float *dst, float scalar); +extern "C" __global__ AICORE void TMINS_i16_15x192(__gm__ int16_t *src, __gm__ int16_t *dst, int16_t scalar); + +void LaunchTMINS_i16_15x192(int16_t *src, int16_t *dst, void *stream) { + TMINS_i16_15x192<<<1, nullptr, stream>>>((__gm__ int16_t *)src, (__gm__ int16_t *)dst, (int16_t)5); +} + + + +void LaunchTMINS_f32_32x64(float *src, float *dst, void *stream) { + TMINS_f32_32x64<<<1, nullptr, stream>>>((__gm__ float *)src, (__gm__ float *)dst, TMINS_SCALAR_F32); +} diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tmins/main.cpp b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tmins/main.cpp new file mode 100644 index 000000000..826d56674 --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tmins/main.cpp @@ -0,0 +1,133 @@ +// Copyright (c) 2026 Huawei Technologies Co., Ltd. +// This program is free software, you can redistribute it and/or modify it under the terms and conditions of +// CANN Open Software License Agreement Version 2.0 (the "License"). +// Please refer to the License for details. You may not use this file except in compliance with the License. +// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +// See LICENSE in the root of the software repository for the full text of the License. + +// Host driver for TileLang tmins ST — case-table driven. +// tmins: dst = min(src, scalar) (single input + scalar). +// Numerical comparison is done externally by compare.py. 
+ +#include "acl/acl.h" +#include "test_common.h" +#include +#include +#include +#include +#include +#include + +using namespace PtoTestCommon; + +// Kernel launch wrappers (defined in launch.cpp) +void LaunchTMINS_f32_32x64(float *src, float *dst, void *stream); +void LaunchTMINS_f16_63x64(uint16_t *src, uint16_t *dst, void *stream); +void LaunchTMINS_i16_15x192(int16_t *src, int16_t *dst, void *stream); +void LaunchTMINS_f32_256x16(float *src, float *dst, void *stream); + +struct TestCase { + const char *name; + void (*launch)(void *, void *, void *); // src, dst, stream + size_t rows; // allocated tile rows + size_t cols; // allocated tile cols + size_t validRows; // effective computation rows (<= rows) + size_t validCols; // effective computation cols (<= cols) + size_t elemSize; // bytes per element +}; + +static const TestCase kCases[] = { +{"f32_32x64", (void (*)(void*,void*,void*))LaunchTMINS_f32_32x64, 32, 64, 32, 64, sizeof(float)}, +{"i16_15x192", (void (*)(void*,void*,void*))LaunchTMINS_i16_15x192, 15, 192, 15, 192, sizeof(int16_t)}, +}; +static constexpr size_t kNumCases = sizeof(kCases) / sizeof(kCases[0]); + +static int RunCase(const TestCase &tc, int deviceId, aclrtStream stream) { + int rc = 0; + const size_t elemCount = tc.rows * tc.cols; + const size_t fileSize = elemCount * tc.elemSize; + + std::printf("[INFO] === case: %s (shape=%zux%zu, valid=%zux%zu) ===\n", + tc.name, tc.rows, tc.cols, tc.validRows, tc.validCols); + + // Per-case data directory + std::string caseDir = std::string("./") + tc.name; + size_t srcFileSize = fileSize; + + void *srcHost = nullptr, *dstHost = nullptr; + void *srcDevice = nullptr, *dstDevice = nullptr; + + aclrtMallocHost(&srcHost, fileSize); + aclrtMallocHost(&dstHost, fileSize); + + aclrtMalloc(&srcDevice, fileSize, ACL_MEM_MALLOC_HUGE_FIRST); + aclrtMalloc(&dstDevice, fileSize, ACL_MEM_MALLOC_HUGE_FIRST); + + if (!ReadFile((caseDir + "/input1.bin").c_str(), srcFileSize, srcHost, fileSize)) { + std::fprintf(stderr, 
"[ERROR] failed to read %s/input1.bin\n", caseDir.c_str()); + rc = 1; + } + + if (rc == 0) { + aclrtMemcpy(srcDevice, fileSize, srcHost, fileSize, ACL_MEMCPY_HOST_TO_DEVICE); + + tc.launch(srcDevice, dstDevice, stream); + + aclrtSynchronizeStream(stream); + aclrtMemcpy(dstHost, fileSize, dstDevice, fileSize, ACL_MEMCPY_DEVICE_TO_HOST); + } + + if (rc == 0 && !WriteFile((caseDir + "/output.bin").c_str(), dstHost, fileSize)) { + std::fprintf(stderr, "[ERROR] failed to write %s/output.bin\n", caseDir.c_str()); + rc = 1; + } + + if (srcDevice != nullptr) + aclrtFree(srcDevice); + if (dstDevice != nullptr) + aclrtFree(dstDevice); + if (srcHost != nullptr) + aclrtFreeHost(srcHost); + if (dstHost != nullptr) + aclrtFreeHost(dstHost); + + if (rc == 0) + std::printf("[INFO] case %s done\n", tc.name); + return rc; +} + +int main(int argc, char *argv[]) { + // Optional case filter: ./tmins [case_name] + const char *caseFilter = (argc > 1) ? argv[1] : nullptr; + + int rc = 0; + int deviceId = 0; + aclrtStream stream = nullptr; + + aclInit(nullptr); + if (const char *envDevice = std::getenv("ACL_DEVICE_ID")) { + deviceId = std::atoi(envDevice); + } + aclrtSetDevice(deviceId); + aclrtCreateStream(&stream); + + for (size_t i = 0; i < kNumCases; ++i) { + if (caseFilter != nullptr && std::strcmp(kCases[i].name, caseFilter) != 0) { + continue; + } + int ret = RunCase(kCases[i], deviceId, stream); + if (ret != 0) { + std::fprintf(stderr, "[ERROR] case %s failed\n", kCases[i].name); + rc = 1; + break; + } + } + + if (stream != nullptr) + aclrtDestroyStream(stream); + aclrtResetDevice(deviceId); + aclFinalize(); + + return rc; +} diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tmins/tmins.pto b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tmins/tmins.pto new file mode 100644 index 000000000..65bd35f57 --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tmins/tmins.pto @@ -0,0 +1,98 @@ +// Copyright (c) 2026 Huawei Technologies Co., Ltd. 
+// This program is free software, you can redistribute it and/or modify it under the terms and conditions of +// CANN Open Software License Agreement Version 2.0 (the "License"). +// Please refer to the License for details. You may not use this file except in compliance with the License. +// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +// See LICENSE in the root of the software repository for the full text of the License. + +// TileLang ST kernels for pto.tmins: tload(src) + tmins(src, scalar)->dst + tstore(dst). +// Multiple cases with different shapes/dtypes in a single module. +// Compiled by ptoas --enable-insert-sync --enable-tile-op-expand --pto-backend=vpto +// to produce a fatobj object. + +module attributes {pto.target_arch = "a5", pto.kernel_kind = #pto.kernel_kind} { + + // Case 0: f32 32x64 (2048 elements) + func.func @TMINS_f32_32x64(%src_ptr: !pto.ptr, %dst_ptr: !pto.ptr, %scalar: f32) attributes {pto.kernel} { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c32 = arith.constant 32 : index + %c64 = arith.constant 64 : index + %c2048 = arith.constant 2048 : index + + %src_view = pto.make_tensor_view %src_ptr, + shape = [%c1, %c1, %c1, %c32, %c64], + strides = [%c2048, %c2048, %c2048, %c64, %c1] + : !pto.tensor_view<1x1x1x32x64xf32> + %dst_view = pto.make_tensor_view %dst_ptr, + shape = [%c1, %c1, %c1, %c32, %c64], + strides = [%c2048, %c2048, %c2048, %c64, %c1] + : !pto.tensor_view<1x1x1x32x64xf32> + + %src_part = pto.partition_view %src_view, + offsets = [%c0, %c0, %c0, %c0, %c0], + sizes = [%c1, %c1, %c1, %c32, %c64] + : !pto.tensor_view<1x1x1x32x64xf32> -> !pto.partition_tensor_view<1x1x1x32x64xf32> + %dst_part = pto.partition_view %dst_view, + offsets = [%c0, %c0, %c0, %c0, %c0], + sizes = [%c1, %c1, %c1, %c32, %c64] + : !pto.tensor_view<1x1x1x32x64xf32> -> 
!pto.partition_tensor_view<1x1x1x32x64xf32> + + %src = pto.alloc_tile + : !pto.tile_buf + %dst = pto.alloc_tile + : !pto.tile_buf + + pto.tload ins(%src_part : !pto.partition_tensor_view<1x1x1x32x64xf32>) + outs(%src : !pto.tile_buf) + pto.tmins ins(%src, %scalar : !pto.tile_buf, f32) + outs(%dst : !pto.tile_buf) + pto.tstore ins(%dst : !pto.tile_buf) + outs(%dst_part : !pto.partition_tensor_view<1x1x1x32x64xf32>) + return + } + + // Case 3: i16 15x192 (2880 elements) + + func.func @TMINS_i16_15x192(%src_ptr: !pto.ptr, %dst_ptr: !pto.ptr, %scalar: i16) attributes {pto.kernel} { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c15 = arith.constant 15 : index + %c192 = arith.constant 192 : index + %c2880 = arith.constant 2880 : index + + %src_view = pto.make_tensor_view %src_ptr, + shape = [%c1, %c1, %c1, %c15, %c192], + strides = [%c2880, %c2880, %c2880, %c192, %c1] + : !pto.tensor_view<1x1x1x15x192xi16> + %dst_view = pto.make_tensor_view %dst_ptr, + shape = [%c1, %c1, %c1, %c15, %c192], + strides = [%c2880, %c2880, %c2880, %c192, %c1] + : !pto.tensor_view<1x1x1x15x192xi16> + + %src_part = pto.partition_view %src_view, + offsets = [%c0, %c0, %c0, %c0, %c0], + sizes = [%c1, %c1, %c1, %c15, %c192] + : !pto.tensor_view<1x1x1x15x192xi16> -> !pto.partition_tensor_view<1x1x1x15x192xi16> + %dst_part = pto.partition_view %dst_view, + offsets = [%c0, %c0, %c0, %c0, %c0], + sizes = [%c1, %c1, %c1, %c15, %c192] + : !pto.tensor_view<1x1x1x15x192xi16> -> !pto.partition_tensor_view<1x1x1x15x192xi16> + + %src = pto.alloc_tile + : !pto.tile_buf + %dst = pto.alloc_tile + : !pto.tile_buf + + pto.tload ins(%src_part : !pto.partition_tensor_view<1x1x1x15x192xi16>) + outs(%src : !pto.tile_buf) + pto.tmins ins(%src, %scalar : !pto.tile_buf, i16) + outs(%dst : !pto.tile_buf) + pto.tstore ins(%dst : !pto.tile_buf) + outs(%dst_part : !pto.partition_tensor_view<1x1x1x15x192xi16>) + return + } + + // Case 4: f32 7x448 (3136 elements) +} diff --git 
"""Single source of truth for tmov ST test cases.

Each case defines:
  - name: case identifier, used as subdirectory name and by main.cpp kCases[].
  - dtype: numpy dtype (e.g. np.float32).
  - shape: (rows, cols) — allocated tile dimensions.
  - valid_shape: (valid_rows, valid_cols) — effective computation region.
  - eps: tolerance for numpy.allclose (atol and rtol).

Based on pto-isa tmov_vect test cases:
  - float, half, uint8 types
  - shapes: 64x64, 32x32, 128x128, 128x32, 128x64

Only the cases named in ``_SMOKE_CASE_NAMES`` are exported: after import,
``CASES`` holds the smoke subset, in the order of the full table below.
"""

import numpy as np

# Full case table; filtered down to the smoke subset at the bottom of the module.
CASES = [
    {
        "name": "f32_32x32",
        "dtype": np.float32,
        "shape": (32, 32),
        "valid_shape": (32, 32),
        "eps": 1e-6,
    },
    {
        "name": "f32_64x64",
        "dtype": np.float32,
        "shape": (64, 64),
        "valid_shape": (64, 64),
        "eps": 1e-6,
    },
    {
        "name": "f32_128x128",
        "dtype": np.float32,
        "shape": (128, 128),
        "valid_shape": (128, 128),
        "eps": 1e-6,
    },
    {
        "name": "f32_128x32",
        "dtype": np.float32,
        "shape": (128, 32),
        "valid_shape": (128, 32),
        "eps": 1e-6,
    },
    {
        "name": "f32_128x64",
        "dtype": np.float32,
        "shape": (128, 64),
        "valid_shape": (128, 64),
        "eps": 1e-6,
    },
    {
        "name": "f16_64x64",
        "dtype": np.float16,
        "shape": (64, 64),
        "valid_shape": (64, 64),
        "eps": 1e-3,
    },
    {
        "name": "f16_32x32",
        "dtype": np.float16,
        "shape": (32, 32),
        "valid_shape": (32, 32),
        "eps": 1e-3,
    },
    {
        "name": "f16_128x128",
        "dtype": np.float16,
        "shape": (128, 128),
        "valid_shape": (128, 128),
        "eps": 1e-3,
    },
    {
        "name": "u8_64x64",
        "dtype": np.uint8,
        "shape": (64, 64),
        "valid_shape": (64, 64),
        "eps": 0,
    },
    {
        "name": "u8_128x128",
        "dtype": np.uint8,
        "shape": (128, 128),
        "valid_shape": (128, 128),
        "eps": 0,
    },
]

_SMOKE_CASE_NAMES = ['f32_32x32', 'f16_32x32']
_SMOKE_CASE_NAME_SET = set(_SMOKE_CASE_NAMES)
# Build the set of known names once instead of once per smoke name.
_KNOWN_CASE_NAMES = {case["name"] for case in CASES}
_missing = [name for name in _SMOKE_CASE_NAMES if name not in _KNOWN_CASE_NAMES]
if _missing:
    # Fail at import time so a typo in the smoke list cannot silently drop a case.
    raise RuntimeError("unknown smoke case(s): " + ", ".join(_missing))
CASES = [case for case in CASES if case["name"] in _SMOKE_CASE_NAME_SET]
passed")) + + +if __name__ == "__main__": + main() diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tmov/gen_data.py b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tmov/gen_data.py new file mode 100644 index 000000000..06f324848 --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tmov/gen_data.py @@ -0,0 +1,56 @@ +# Copyright (c) 2026 Huawei Technologies Co., Ltd. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. + +# coding=utf-8 + +"""Generate input and golden data for tmov ST test cases. 
def main():
    """Write input.bin/golden.bin data for every selected tmov case.

    An optional first command-line argument restricts generation to the case
    with that exact name. For each case the input is random data of the full
    tile shape, and the golden tile is zero everywhere except the valid
    region, which mirrors the input.
    """
    validate_cases(CASES)
    only = sys.argv[1] if len(sys.argv) > 1 else None
    selected = CASES if only is None else [c for c in CASES if c["name"] == only]

    for case in selected:
        # Seed per case (via the shared helper) before drawing any random data.
        setup_case_rng(case)

        name = case["name"]
        dtype = case["dtype"]
        shape = case["shape"]
        valid_shape = case["valid_shape"]

        # uint8 tiles draw from the full byte range; float tiles draw from [0, 1).
        if dtype == np.uint8:
            raw = np.random.randint(0, 256, size=shape)
        else:
            raw = np.random.rand(*shape)
        input_data = raw.astype(dtype)

        # tmov is a pure copy: only the valid region is expected in the output.
        vr, vc = valid_shape
        golden = np.zeros(shape, dtype=dtype)
        golden[:vr, :vc] = input_data[:vr, :vc]

        save_case_data(name, {"input": input_data, "golden": golden})
        print(f"[INFO] gen_data: {case['name']} shape={shape} valid_shape={valid_shape} dtype={dtype.__name__}")
// Smoke cases: f32 32x32 and f16 32x32.
// Device kernel entry points; kernels with these names are defined in tmov.pto.
extern "C" __global__ AICORE void TMOV_f32_32x32(__gm__ float *src, __gm__ float *dst);
extern "C" __global__ AICORE void TMOV_f16_32x32(__gm__ uint16_t *src, __gm__ uint16_t *dst);

// Host-side wrapper: launches TMOV_f32_32x32 on `stream` with device pointers `src` and `dst`.
// NOTE(review): the first launch argument (1) is presumably the block count — confirm.
void LaunchTMOV_f32_32x32(float *src, float *dst, void *stream) {
    TMOV_f32_32x32<<<1, nullptr, stream>>>((__gm__ float *)src, (__gm__ float *)dst);
}

// Host-side wrapper for the f16 kernel; half data is carried as raw uint16_t on the host.
void LaunchTMOV_f16_32x32(uint16_t *src, uint16_t *dst, void *stream) {
    TMOV_f16_32x32<<<1, nullptr, stream>>>((__gm__ uint16_t *)src, (__gm__ uint16_t *)dst);
}
+ +#include "acl/acl.h" +#include "test_common.h" +#include +#include +#include +#include +#include +#include + +using namespace PtoTestCommon; + +// Kernel launch wrappers (defined in launch.cpp) +void LaunchTMOV_f32_32x32(float *src, float *dst, void *stream); +void LaunchTMOV_f32_128x32(float *src, float *dst, void *stream); +void LaunchTMOV_f16_64x64(uint16_t *src, uint16_t *dst, void *stream); +void LaunchTMOV_f16_32x32(uint16_t *src, uint16_t *dst, void *stream); +void LaunchTMOV_f16_128x128(uint16_t *src, uint16_t *dst, void *stream); +void LaunchTMOV_u8_128x128(uint8_t *src, uint8_t *dst, void *stream); + +struct TestCase { + const char *name; + void (*launch)(void *src, void *dst, void *stream); + size_t rows; // allocated tile rows + size_t cols; // allocated tile cols + size_t validRows; // effective computation rows (<= rows) + size_t validCols; // effective computation cols (<= cols) + size_t elemSize; // bytes per element +}; + +static const TestCase kCases[] = { +{"f32_32x32", (void(*)(void*,void*,void*))LaunchTMOV_f32_32x32, 32, 32, 32, 32, sizeof(float)}, +{"f16_32x32", (void(*)(void*,void*,void*))LaunchTMOV_f16_32x32, 32, 32, 32, 32, sizeof(uint16_t)}, +}; +static constexpr size_t kNumCases = sizeof(kCases) / sizeof(kCases[0]); + +static int RunCase(const TestCase &tc, int deviceId, aclrtStream stream) { + int rc = 0; + const size_t elemCount = tc.rows * tc.cols; + const size_t fileSize = elemCount * tc.elemSize; + + std::printf("[INFO] === case: %s (shape=%zux%zu, valid=%zux%zu) ===\n", + tc.name, tc.rows, tc.cols, tc.validRows, tc.validCols); + + // Per-case data directory + std::string caseDir = std::string("./") + tc.name; + size_t srcFileSize = fileSize; + + void *srcHost = nullptr, *dstHost = nullptr; + void *srcDevice = nullptr, *dstDevice = nullptr; + + aclrtMallocHost((void **)(&srcHost), fileSize); + aclrtMallocHost((void **)(&dstHost), fileSize); + + aclrtMalloc((void **)&srcDevice, fileSize, ACL_MEM_MALLOC_HUGE_FIRST); + 
aclrtMalloc((void **)&dstDevice, fileSize, ACL_MEM_MALLOC_HUGE_FIRST); + + if (!ReadFile((caseDir + "/input.bin").c_str(), srcFileSize, srcHost, fileSize)) { + std::fprintf(stderr, "[ERROR] failed to read %s/input.bin\n", caseDir.c_str()); + rc = 1; + } + + if (rc == 0) { + aclrtMemcpy(srcDevice, fileSize, srcHost, fileSize, ACL_MEMCPY_HOST_TO_DEVICE); + + tc.launch(srcDevice, dstDevice, stream); + + aclrtSynchronizeStream(stream); + aclrtMemcpy(dstHost, fileSize, dstDevice, fileSize, ACL_MEMCPY_DEVICE_TO_HOST); + } + + if (rc == 0 && !WriteFile((caseDir + "/output.bin").c_str(), dstHost, fileSize)) { + std::fprintf(stderr, "[ERROR] failed to write %s/output.bin\n", caseDir.c_str()); + rc = 1; + } + + if (srcDevice != nullptr) + aclrtFree(srcDevice); + if (dstDevice != nullptr) + aclrtFree(dstDevice); + if (srcHost != nullptr) + aclrtFreeHost(srcHost); + if (dstHost != nullptr) + aclrtFreeHost(dstHost); + + if (rc == 0) + std::printf("[INFO] case %s done\n", tc.name); + return rc; +} + +int main(int argc, char *argv[]) { + // Optional case filter: ./tmov [case_name] + const char *caseFilter = (argc > 1) ? 
argv[1] : nullptr; + + int rc = 0; + int deviceId = 0; + aclrtStream stream = nullptr; + + aclInit(nullptr); + if (const char *envDevice = std::getenv("ACL_DEVICE_ID")) { + deviceId = std::atoi(envDevice); + } + aclrtSetDevice(deviceId); + aclrtCreateStream(&stream); + + for (size_t i = 0; i < kNumCases; ++i) { + if (caseFilter != nullptr && std::strcmp(kCases[i].name, caseFilter) != 0) { + continue; + } + int ret = RunCase(kCases[i], deviceId, stream); + if (ret != 0) { + std::fprintf(stderr, "[ERROR] case %s failed\n", kCases[i].name); + rc = 1; + break; + } + } + + if (stream != nullptr) + aclrtDestroyStream(stream); + aclrtResetDevice(deviceId); + aclFinalize(); + + return rc; +} diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tmov/tmov.pto b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tmov/tmov.pto new file mode 100644 index 000000000..c3835f1c0 --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tmov/tmov.pto @@ -0,0 +1,96 @@ +// Copyright (c) 2026 Huawei Technologies Co., Ltd. +// This program is free software, you can redistribute it and/or modify it under the terms and conditions of +// CANN Open Software License Agreement Version 2.0 (the "License"). +// Please refer to the License for details. You may not use this file except in compliance with the License. +// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +// See LICENSE in the root of the software repository for the full text of the License. + +// TileLang ST kernels for pto.tmov: tload(src) + tmov(src,dst) + tstore(dst). +// Multiple cases with different shapes and types in a single module. +// Based on pto-isa tmov_vect test cases. 
// Compiled by ptoas --enable-insert-sync --enable-tile-op-expand --pto-backend=vpto

module attributes {pto.target_arch = "a5", pto.kernel_kind = #pto.kernel_kind} {

  // Smoke case 0: f32 32x32 (1024 elements).
  // Pipeline: tload(src) -> tmov(src, dst) -> tstore(dst).
  func.func @TMOV_f32_32x32(%src_ptr: !pto.ptr, %dst_ptr: !pto.ptr) attributes {pto.kernel} {
    // Index constants: 32 rows/cols, 1024 = 32 * 32 elements per outer slice.
    %c0 = arith.constant 0 : index
    %c1 = arith.constant 1 : index
    %c32 = arith.constant 32 : index
    %c1024 = arith.constant 1024 : index

    // 5-D views over the flat buffers; only the two innermost dimensions are non-unit.
    %src_view = pto.make_tensor_view %src_ptr,
      shape = [%c1, %c1, %c1, %c32, %c32],
      strides = [%c1024, %c1024, %c1024, %c32, %c1]
      : !pto.tensor_view<1x1x1x32x32xf32>
    %dst_view = pto.make_tensor_view %dst_ptr,
      shape = [%c1, %c1, %c1, %c32, %c32],
      strides = [%c1024, %c1024, %c1024, %c32, %c1]
      : !pto.tensor_view<1x1x1x32x32xf32>

    // Partitions start at offset zero and span the whole view.
    %src_part = pto.partition_view %src_view,
      offsets = [%c0, %c0, %c0, %c0, %c0],
      sizes = [%c1, %c1, %c1, %c32, %c32]
      : !pto.tensor_view<1x1x1x32x32xf32> -> !pto.partition_tensor_view<1x1x1x32x32xf32>
    %dst_part = pto.partition_view %dst_view,
      offsets = [%c0, %c0, %c0, %c0, %c0],
      sizes = [%c1, %c1, %c1, %c32, %c32]
      : !pto.tensor_view<1x1x1x32x32xf32> -> !pto.partition_tensor_view<1x1x1x32x32xf32>

    // Tile buffers for the loaded input and the moved result.
    %src = pto.alloc_tile : !pto.tile_buf
    %dst = pto.alloc_tile : !pto.tile_buf

    pto.tload ins(%src_part : !pto.partition_tensor_view<1x1x1x32x32xf32>)
              outs(%src : !pto.tile_buf)

    pto.tmov ins(%src : !pto.tile_buf)
             outs(%dst : !pto.tile_buf)

    pto.tstore ins(%dst : !pto.tile_buf)
               outs(%dst_part : !pto.partition_tensor_view<1x1x1x32x32xf32>)
    return
  }

  // Smoke case 1: f16 32x32 (1024 elements).
  // Same pipeline as the f32 case, with f16 views.
  func.func @TMOV_f16_32x32(%src_ptr: !pto.ptr, %dst_ptr: !pto.ptr) attributes {pto.kernel} {
    %c0 = arith.constant 0 : index
    %c1 = arith.constant 1 : index
    %c32 = arith.constant 32 : index
    %c1024 = arith.constant 1024 : index

    %src_view = pto.make_tensor_view %src_ptr,
      shape = [%c1, %c1, %c1, %c32, %c32],
      strides = [%c1024, %c1024, %c1024, %c32, %c1]
      : !pto.tensor_view<1x1x1x32x32xf16>
    %dst_view = pto.make_tensor_view %dst_ptr,
      shape = [%c1, %c1, %c1, %c32, %c32],
      strides = [%c1024, %c1024, %c1024, %c32, %c1]
      : !pto.tensor_view<1x1x1x32x32xf16>

    %src_part = pto.partition_view %src_view,
      offsets = [%c0, %c0, %c0, %c0, %c0],
      sizes = [%c1, %c1, %c1, %c32, %c32]
      : !pto.tensor_view<1x1x1x32x32xf16> -> !pto.partition_tensor_view<1x1x1x32x32xf16>
    %dst_part = pto.partition_view %dst_view,
      offsets = [%c0, %c0, %c0, %c0, %c0],
      sizes = [%c1, %c1, %c1, %c32, %c32]
      : !pto.tensor_view<1x1x1x32x32xf16> -> !pto.partition_tensor_view<1x1x1x32x32xf16>

    %src = pto.alloc_tile : !pto.tile_buf
    %dst = pto.alloc_tile : !pto.tile_buf

    pto.tload ins(%src_part : !pto.partition_tensor_view<1x1x1x32x32xf16>)
              outs(%src : !pto.tile_buf)

    pto.tmov ins(%src : !pto.tile_buf)
             outs(%dst : !pto.tile_buf)

    pto.tstore ins(%dst : !pto.tile_buf)
               outs(%dst_part : !pto.partition_tensor_view<1x1x1x32x32xf16>)
    return
  }
}
+ +pto_tilelang_vec_st(tmrgsort) diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tmrgsort/cases.py b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tmrgsort/cases.py new file mode 100644 index 000000000..cf9f088d9 --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tmrgsort/cases.py @@ -0,0 +1,378 @@ +# Copyright (c) 2026 Huawei Technologies Co., Ltd. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. + +# coding=utf-8 + +"""Single source of truth for tmrgsort ST test cases. + +Each case defines: + - name: case identifier, used as subdirectory name and by main.cpp kCases[]. + - dtype: numpy dtype (e.g. np.float32, np.float16). + - format: "single" for Format1 (1-list internal block sorting), + "multi" for Format2-4 (multi-list merge sort). + - src_shape: (rows, cols) - allocated source tile dimensions. + For Format1: single input list. + For multi-list: list of shapes for each input. + - dst_shape: (rows, cols) - allocated destination tile dimensions. + - valid_shape: (valid_rows, valid_cols) - effective computation region. + - block_len: For Format1: block length in elements (must divide src_cols by 4). + - list_num: For multi-list: number of input lists (2, 3, or 4). + - src_cols: For multi-list: list of valid cols for each input list. + - topk: For multi-list: top-k output count. + - exhausted: For multi-list: whether to enable exhausted suspension. + - eps: tolerance for numpy.allclose (atol and rtol). 
+ +tmrgsort semantics: + - Format1 (single list): Sorts 4 internal blocks of src using vmrgsort4. + Each block is sorted independently, then merged. + Output: interleaved (sorted_value, original_index) pairs. + - Format2-4 (multi-list): Merges 2-4 sorted input lists into one sorted output. + Each input list must already be sorted (in descending order). + Output: top-k sorted elements from merged lists. + +gen_data.py and compare.py both import this list to avoid redundant definitions. +""" + +import numpy as np + +CASES = [ + # Format1: single list (internal block sorting) + # Transplanted from pto-isa case_single1: TMrgsortSingle + # Shape uses FLOAT ELEMENT count (matching pto-isa kGCols convention) + # src_cols=256 float elements = 128 (value,index) structures + # block_len=64 float elements = 32 structures/block, 4 blocks total + { + "name": "f32_single_1x256_b64", + "dtype": np.float32, + "format": "single", + "src_shape": (1, 256), # kGCols=256 float elements + "dst_shape": (1, 256), # kGCols=256 float elements + "valid_shape": (1, 256), + "block_len": 64, # float elements (=32 structures) + "eps": 1e-6, + }, + # Transplanted from pto-isa case_single2: TMrgsortSingle + # GCols=320 > TCols=256, global memory has padding, kernel uses TCols + # src_cols=320 float elements (global), valid_cols=256 float elements (tile) + # block_len=64 float elements = 32 structures/block + { + "name": "f32_single_1x320_b64", + "dtype": np.float32, + "format": "single", + "src_shape": (1, 320), # kGCols=320 float elements (global) + "dst_shape": (1, 320), # kGCols=320 float elements (global) + "valid_shape": (1, 256), # kTCols=256 (effective tile region) + "block_len": 64, # float elements (=32 structures) + "eps": 1e-6, + }, + # Transplanted from pto-isa case_single3: TMrgsortSingle + # cols=512 float elements = 256 structures + # block_len=64 float elements = 32 structures/block, 4 blocks total + { + "name": "f32_single_1x512_b64", + "dtype": np.float32, + "format": "single", + 
"src_shape": (1, 512), # kGCols=512 float elements + "dst_shape": (1, 512), # kGCols=512 float elements + "valid_shape": (1, 512), + "block_len": 64, # float elements (=32 structures) + "eps": 1e-6, + }, + # Transplanted from pto-isa case_single4: TMrgsortSingle + # kGCols=640 > kTCols=512, global memory has padding, kernel uses kTCols + # src_cols=640 float elements (global), valid_cols=512 float elements (tile) + # block_len=64 float elements = 32 structures/block + { + "name": "f32_single_1x640_b64", + "dtype": np.float32, + "format": "single", + "src_shape": (1, 640), # kGCols=640 float elements (global) + "dst_shape": (1, 640), # kGCols=640 float elements (global) + "valid_shape": (1, 512), # kTCols=512 (effective tile region) + "block_len": 64, # float elements (=32 structures) + "eps": 1e-6, + }, + # Transplanted from pto-isa case_single5: TMrgsortSingle + # uint16_t maps to float16 (half) in Ascend C + # TYPE_COEF=2: kGCols*2=512, kTCols*2=512, blockLen*2=128 (kernel internal) + # src_shape uses TYPE_COEF-adjusted counts: 512 f16 elements = 128 structures + # block_len=64 template units → 128 f16 elements in kernel = 32 structures/block + { + "name": "f16_single_1x256_b64", + "dtype": np.float16, + "format": "single", + "src_shape": (1, 512), # kGCols*TYPE_COEF=512 f16 elements = 128 structures + "dst_shape": (1, 512), # kGCols*TYPE_COEF=512 f16 elements + "valid_shape": (1, 512), + "block_len": 128, # block_len*TYPE_COEF=128 f16 elements = 32 structures + "eps": 1e-3, # f16 has lower precision + }, + # Transplanted from pto-isa case_single6: TMrgsortSingle + # TYPE_COEF=2: kGCols*2=640, kTCols*2=512, blockLen*2=128 (kernel internal) + # kGCols=320 > kTCols=256, global memory has padding + # src_shape uses TYPE_COEF-adjusted: 640 f16 elements (global), 512 f16 (valid) + { + "name": "f16_single_1x320_b64", + "dtype": np.float16, + "format": "single", + "src_shape": (1, 640), # kGCols*TYPE_COEF=640 f16 elements (global) + "dst_shape": (1, 640), # 
kGCols*TYPE_COEF=640 f16 elements (global) + "valid_shape": (1, 512), # kTCols*TYPE_COEF=512 (effective tile region) + "block_len": 128, # block_len*TYPE_COEF=128 f16 elements = 32 structures + "eps": 1e-3, + }, + # Transplanted from pto-isa case_single7: TMrgsortSingle + # TYPE_COEF=2: kGCols*2=1024, kTCols*2=1024, blockLen*2=128 (kernel internal) + # src_shape uses TYPE_COEF-adjusted: 1024 f16 elements = 256 structures + { + "name": "f16_single_1x512_b64", + "dtype": np.float16, + "format": "single", + "src_shape": (1, 1024), # kGCols*TYPE_COEF=1024 f16 elements = 256 structures + "dst_shape": (1, 1024), # kGCols*TYPE_COEF=1024 f16 elements + "valid_shape": (1, 1024), + "block_len": 128, # block_len*TYPE_COEF=128 f16 elements = 32 structures + "eps": 1e-3, + }, + # Transplanted from pto-isa case_single8: TMrgsortSingle + # TYPE_COEF=2: kGCols*2=2048, kTCols*2=2048, blockLen*2=512 (kernel internal) + # src_shape uses TYPE_COEF-adjusted: 2048 f16 elements = 512 structures + { + "name": "f16_single_1x1024_b256", + "dtype": np.float16, + "format": "single", + "src_shape": (1, 2048), # kGCols*TYPE_COEF=2048 f16 elements = 512 structures + "dst_shape": (1, 2048), # kGCols*TYPE_COEF=2048 f16 elements + "valid_shape": (1, 2048), + "block_len": 512, # block_len*TYPE_COEF=512 f16 elements = 128 structures + "eps": 1e-3, + }, + # Format2: multi-list merge (2-list merge) + { + "name": "f32_2list_b64_basic", + "dtype": np.float32, + "format": "multi", + "list_num": 2, + "src_cols": [128, 128], + "src_shape": [(1, 256), (1, 256)], + "dst_shape": (1, 256), + "valid_shape": (1, 256), + "topk": 128, + "exhausted": False, + "eps": 1e-6, + }, + { + "name": "f16_2list_b64_basic", + "dtype": np.float16, + "format": "multi", + "list_num": 2, + "src_cols": [64, 64], # 64 structures per list (match src_shape) + "src_shape": [(1, 256), (1, 256)], # 256 f16 elements = 64 structures + "dst_shape": (1, 256), + "valid_shape": (1, 256), + "topk": 64, # topk should match dst capacity + 
"exhausted": False, + "eps": 1e-3, + }, + # Format2: exhausted=true cases (aligned with pto-isa case_exhausted1) + # pto-isa template: kGCols_=64 (elements) → 32 structures per list + # TOPK=128 (elements) → 64 structures output + { + "name": "f32_2list_exhausted", + "dtype": np.float32, + "format": "multi", + "list_num": 2, + "src_cols": [32, 32], # 32 structures per list (64 elements / 2) + "src_shape": [(1, 64), (1, 64)], # 64 f32 elements = 32 structures + "dst_shape": (1, 128), # 128 f32 elements = 64 structures (=TOPK) + "valid_shape": (1, 128), # match dst_shape + "topk": 64, # topk in structures (=64 structures) + "exhausted": True, + "eps": 1e-6, + }, + # Format3: 3-list merge sort + { + "name": "f32_3list_b64_basic", + "dtype": np.float32, + "format": "multi", + "list_num": 3, + "src_cols": [64, 64, 64], # 64 structures per list + "src_shape": [(1, 128), (1, 128), (1, 128)], # 128 f32 elements = 64 structures each + "dst_shape": (1, 256), # 256 f32 elements = 128 structures + "valid_shape": (1, 256), + "topk": 128, # topk structures (192 available, output 128) + "exhausted": False, + "eps": 1e-6, + }, + # Format4: 4-list merge sort + { + "name": "f32_4list_b32_basic", + "dtype": np.float32, + "format": "multi", + "list_num": 4, + "src_cols": [64, 64, 64, 64], + "src_shape": [(1, 128), (1, 128), (1, 128), (1, 128)], + "dst_shape": (1, 512), + "valid_shape": (1, 512), + "topk": 256, + "exhausted": False, + "eps": 1e-6, + }, + { + "name": "f16_4list_b64_basic", + "dtype": np.float16, + "format": "multi", + "list_num": 4, + "src_cols": [64, 64, 64, 64], # 64 structures per list + "src_shape": [(1, 256), (1, 256), (1, 256), (1, 256)], # 256 f16 elements = 64 structures each + "dst_shape": (1, 1024), # 1024 f16 elements = 256 structures + "valid_shape": (1, 1024), + "topk": 256, # topk structures (256 available, output 256) + "exhausted": False, + "eps": 1e-3, + }, + # Format3 variants: non-uniform cols + { + "name": "f32_3list_non_uniform", + "dtype": 
np.float32, + "format": "multi", + "list_num": 3, + "src_cols": [64, 64, 32], # non-uniform: 64,64,32 structures + "src_shape": [(1, 128), (1, 128), (1, 64)], # f32 elements + "dst_shape": (1, 128), # 128 f32 elements = 64 structures + "valid_shape": (1, 128), + "topk": 64, # structures (total=160 available, output topk=64) + "exhausted": False, + "eps": 1e-6, + }, + # Format3 variants: f16 4-list basic + # tmp tile cols=512 can hold max 256 structures for f16 (512/2=256) + # src_cols in STRUCTURES, srcShape in ELEMENTS (f16: 4 elems/struct) + { + "name": "f16_4list_basic", + "dtype": np.float16, + "format": "multi", + "list_num": 4, + "src_cols": [64, 64, 64, 64], + "src_shape": [(1, 256), (1, 256), (1, 256), (1, 256)], + "dst_shape": (1, 1024), + "valid_shape": (1, 1024), + "topk": 256, + "exhausted": False, + "eps": 1e-3, + }, + # Format3 variants: f16 exhausted (aligned with pto-isa case_exhausted2) + # pto-isa template: kGCols_=256 (DataType=float sized), TOPK=768 (float sized) + # In f16 units: 256 float-sized * 4 / 2 = 512 f16 elements per input = 128 structures + # TOPK: 768 float-sized * 4 / 2 = 1536 f16 elements output = 384 structures + { + "name": "f16_3list_exhausted", + "dtype": np.float16, + "format": "multi", + "list_num": 3, + "src_cols": [128, 128, 128], # 128 structures per list (512 f16 elements) + "src_shape": [(1, 512), (1, 512), (1, 512)], # 512 f16 elements = 128 structures + "dst_shape": (1, 1536), # 1536 f16 elements = 384 structures (=TOPK) + "valid_shape": (1, 1536), + "topk": 384, # structures (=384) + "exhausted": True, + "eps": 1e-3, + }, + # Format4 variants: non-uniform cols + { + "name": "f32_4list_non_uniform", + "dtype": np.float32, + "format": "multi", + "list_num": 4, + "src_cols": [64, 64, 64, 32], # non-uniform: 64,64,64,32 structures + "src_shape": [(1, 128), (1, 128), (1, 128), (1, 64)], # f32 elements + "dst_shape": (1, 448), # 448 f32 elements = 224 structures + "valid_shape": (1, 448), + "topk": 224, # structures 
(total=224, output all) + "exhausted": False, + "eps": 1e-6, + }, + + # Format5: TopK (full sorting with top-k output) + # Following pto-isa case_topk1-6 + # Input: unsorted raw data (value-index interleaved) + # Output: top-k sorted elements + { + "name": "f32_topk_2048_1024", + "dtype": np.float32, + "format": "topk", + "src_shape": (1, 2048), # 2048 f32 elements = 1024 structs (input unsorted) + "dst_shape": (1, 1024), # 1024 f32 elements = 512 structs (output topk) + "valid_shape": (1, 2048), # full input cols + "topk": 512, # output structures count + "block_len": 64, # initial block length in elements + "eps": 1e-6, + }, + { + "name": "f32_topk_2048_2048", + "dtype": np.float32, + "format": "topk", + "src_shape": (1, 2048), # 2048 f32 elements = 1024 structs + "dst_shape": (1, 2048), # 2048 f32 elements = 1024 structs (output all) + "valid_shape": (1, 2048), + "topk": 1024, # output all structures + "block_len": 64, + "eps": 1e-6, + }, + { + "name": "f32_topk_1280_512", + "dtype": np.float32, + "format": "topk", + "src_shape": (1, 1280), # 1280 f32 elements = 640 structs + "dst_shape": (1, 512), # 512 f32 elements = 256 structs + "valid_shape": (1, 1280), + "topk": 256, # output 256 structures + "block_len": 64, + "eps": 1e-6, + }, + { + "name": "f16_topk_2048_1024", + "dtype": np.float16, + "format": "topk", + "src_shape": (1, 2048), # 2048 f16 elements = 512 structs + "dst_shape": (1, 1024), # 1024 f16 elements = 256 structs + "valid_shape": (1, 2048), + "topk": 256, # output 256 structures + "block_len": 64, + "eps": 1e-3, + }, + { + "name": "f16_topk_2048_2048", + "dtype": np.float16, + "format": "topk", + "src_shape": (1, 2048), # 2048 f16 elements = 512 structs + "dst_shape": (1, 2048), # output all + "valid_shape": (1, 2048), + "topk": 512, # output all structures + "block_len": 64, + "eps": 1e-3, + }, + { + "name": "f16_topk_1280_512", + "dtype": np.float16, + "format": "topk", + "src_shape": (1, 1280), # 1280 f16 elements = 320 structs + "dst_shape": 
def read_value_index_pairs(filepath, dtype, count):
    """Read up to ``count`` interleaved (value, index) records from a binary file.

    Every record is 8 bytes in native byte order:
      - f32: value (4 bytes) + index (uint32, 4 bytes)
      - f16: value (2 bytes) + padding (2 bytes) + index (uint32, 4 bytes)
    Any ``dtype`` other than ``np.float32`` is read with the f16 layout.

    Args:
        filepath: path of the binary file to read.
        dtype: numpy dtype of the returned values (selects the record layout).
        count: maximum number of records to read; values <= 0 read nothing.

    Returns:
        ``(values, indices)``: ``values`` has dtype ``dtype`` and ``indices``
        has dtype ``np.uint32``. Both are freshly allocated and writable. If
        the file holds fewer than ``count`` complete records, only the complete
        ones are returned; a truncated trailing record is ignored.
    """
    value_dtype = np.float32 if dtype == np.float32 else np.float16
    # Explicit offsets give both layouts the same 8-byte stride: the index always
    # sits at byte 4, which leaves 2 padding bytes after an f16 value.
    record = np.dtype({
        "names": ["value", "index"],
        "formats": [value_dtype, np.uint32],
        "offsets": [0, 4],
        "itemsize": 8,
    })

    # Clamp so a non-positive count never turns into read(-n), i.e. "read everything".
    wanted_bytes = max(int(count), 0) * record.itemsize
    with open(filepath, 'rb') as f:
        raw = f.read(wanted_bytes)

    n_records = len(raw) // record.itemsize
    if n_records == 0:
        return np.array([], dtype=dtype), np.array([], dtype=np.uint32)

    records = np.frombuffer(raw, dtype=record, count=n_records)
    # astype() copies out of the read-only buffer, so callers may modify the results.
    return records["value"].astype(dtype), records["index"].astype(np.uint32)


def handle_output_data(golden_vals, golden_idx, output_vals, output_idx):
    """Clear the trailing region of the output for the exhausted case.

    Follows the pto-isa HandleOutputData logic: walk backwards from the last
    element while the golden value is 0. At each such position the output
    value is set to 0, and the output index is also set to 0 when the golden
    index is 0 (matching gen_data.py). The walk stops at the first non-zero
    golden value. ``output_vals`` and ``output_idx`` are modified in place;
    nothing is returned.

    NOTE(review): position 0 is never examined (the walk stops at index 1).
    This mirrors the original loop bound — confirm against pto-isa.
    """
    for i in range(len(golden_vals) - 1, 0, -1):
        if golden_vals[i] != 0.0:
            return
        output_vals[i] = 0.0
        if golden_idx[i] == 0:
            output_idx[i] = 0


def compare_multilist(case):
    """Compare multi-list merge sort output against the golden data.

    Reads ``<case name>/golden.bin`` and ``<case name>/output.bin`` as
    (value, index) records, applies the exhausted-case cleanup when the case
    requests it, then compares the first ``topk`` records. Values are compared
    with ``result_cmp`` and the case's ``eps``; indices must match exactly.

    Returns:
        A truthy result when both values and indices match; a falsy one
        otherwise, including when the two files do not provide the same
        number of top-k records.
    """
    dtype = case["dtype"]
    topk = case["topk"]
    exhausted = case.get("exhausted", False)

    # Upper bound on the records to read: every structure of every input list.
    total_structures = sum(case["src_cols"])

    golden_vals, golden_indices = read_value_index_pairs(
        os.path.join(case["name"], "golden.bin"), dtype, total_structures
    )
    output_vals, output_indices = read_value_index_pairs(
        os.path.join(case["name"], "output.bin"), dtype, total_structures
    )

    # A short or missing tail in either file is a failed comparison, not a
    # broadcasting error further down.
    if golden_vals[:topk].shape != output_vals[:topk].shape:
        return False

    if exhausted:
        handle_output_data(golden_vals, golden_indices, output_vals, output_indices)

    # Compare top-k elements (only compare the valid output)
    vals_ok = result_cmp(golden_vals[:topk], output_vals[:topk], case["eps"])
    indices_ok = np.array_equal(golden_indices[:topk], output_indices[:topk])

    return vals_ok and indices_ok
+ + For TopK format: + - Read input0.bin (unsorted raw data) + - Read output.bin (top-k sorted data) + - Compare with golden.bin + """ + dtype = case["dtype"] + valid_shape = case["valid_shape"] + valid_rows, valid_cols = valid_shape + topk = case["topk"] + + # Get element divisor based on dtype + if dtype == np.float16: + elem_divisor = 4 + else: + elem_divisor = 2 + + # Total structures in input + total_structures = valid_cols // elem_divisor + + # Read golden output + golden_vals, golden_indices = read_value_index_pairs( + os.path.join(case["name"], "golden.bin"), dtype, total_structures + ) + + # Read actual output + output_vals, output_indices = read_value_index_pairs( + os.path.join(case["name"], "output.bin"), dtype, topk + ) + + # Compare top-k elements + vals_ok = result_cmp(golden_vals[:topk], output_vals[:topk], case["eps"]) + indices_ok = np.allclose(golden_indices[:topk], output_indices[:topk], atol=0, rtol=0) + + return vals_ok and indices_ok + + +def main(): + case_filter = sys.argv[1] if len(sys.argv) > 1 else None + + all_passed = True + for case in CASES: + if case_filter is not None and case["name"] != case_filter: + continue + + format_type = case.get("format", "single") + + if format_type == "single": + dtype = case["dtype"] + valid_shape = case["valid_shape"] + valid_rows, valid_cols = valid_shape + block_len = case["block_len"] + + # Get element divisor based on dtype + if dtype == np.float16: + elem_divisor = 4 + else: + elem_divisor = 2 + + cols = valid_cols // elem_divisor + + golden_vals, golden_indices = read_value_index_pairs( + os.path.join(case["name"], "golden.bin"), dtype, cols + ) + output_vals, output_indices = read_value_index_pairs( + os.path.join(case["name"], "output.bin"), dtype, cols + ) + + vals_ok = result_cmp(golden_vals, output_vals, case["eps"]) + indices_ok = np.allclose(golden_indices, output_indices, atol=0, rtol=0) + + if vals_ok and indices_ok: + print(style_pass(f"[INFO] {case['name']}: compare passed")) + else: + 
if not vals_ok: + print(style_fail(f"[ERROR] {case['name']}: values mismatch")) + if not indices_ok: + print(style_fail(f"[ERROR] {case['name']}: indices mismatch")) + all_passed = False + + elif format_type == "multi": + ok = compare_multilist(case) + if ok: + print(style_pass(f"[INFO] {case['name']}: compare passed")) + else: + print(style_fail(f"[ERROR] {case['name']}: values or indices mismatch")) + all_passed = False + + elif format_type == "topk": + ok = compare_topk(case) + if ok: + print(style_pass(f"[INFO] {case['name']}: compare passed")) + else: + print(style_fail(f"[ERROR] {case['name']}: values or indices mismatch")) + all_passed = False + + else: + print(style_fail(f"[ERROR] {case['name']}: unsupported format {format_type}")) + all_passed = False + + if not all_passed: + sys.exit(2) + print(style_pass("[INFO] all cases passed")) + + +if __name__ == "__main__": + main() diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tmrgsort/gen_data.py b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tmrgsort/gen_data.py new file mode 100644 index 000000000..510a096bc --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tmrgsort/gen_data.py @@ -0,0 +1,418 @@ +#!/usr/bin/python3 +# Copyright (c) 2026 Huawei Technologies Co., Ltd. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. 

# coding=utf-8

import numpy as np
import os
import sys
import struct
import ctypes

# Add parent directory to path for st_common import
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from st_common import setup_case_rng, save_case_data

from cases import CASES

BLOCK_NUM = 4
STRUCT_SIZE = 8  # bytes per structure (value + index)


def find_and_zero(arr, tar):
    """Zero every element of ``arr`` that follows the last occurrence of ``tar``.

    Args:
        arr: Mutable sequence of ``np.floating`` scalars; modified in place.
        tar: ``np.floating`` value to search for.

    Returns:
        Index of the last element equal to ``tar``, or -1 when ``tar`` is not
        found, when ``tar`` is not an ``np.floating`` scalar, or when ``arr``
        holds any element that is not an ``np.floating`` scalar.
    """
    # Non-floating input is reported as "not found" and nothing is modified.
    if not isinstance(tar, np.floating):
        return -1
    if not all(isinstance(item, np.floating) for item in arr):
        return -1

    n = len(arr)
    for i in range(n - 1, -1, -1):
        if arr[i] == tar:
            for j in range(i + 1, n):
                arr[j] = 0
            return i
    return -1


def zero_after_index(arr, i):
    """Zero ``arr[i + 1:]`` in place; do nothing when ``i`` is out of range."""
    if i < 0 or i >= len(arr):
        return
    for j in range(i + 1, len(arr)):
        arr[j] = 0


def handle_exhausted_list(input_num, topk_sorted_output_global, topk_sorted_idx_global, last_data):
    """Apply the exhausted-list truncation to the golden values and indices.

    For each of the ``input_num`` lists, the golden values after the last
    occurrence of that list's final element (``last_data[i]``) are zeroed, and
    the golden indices are zeroed from the same position onwards.
    """
    for i in range(input_num):
        zero_index = find_and_zero(topk_sorted_output_global, last_data[i])
        zero_after_index(topk_sorted_idx_global, zero_index)


def _to_tuple(shape):
    """Convert shape to tuple if needed."""
    return shape if isinstance(shape, tuple) else tuple(shape)


def get_elem_divisor(dtype):
    """Get element divisor based on dtype.

    A structure is 8 bytes:
    - f32 (4 bytes): 8 / 4 = 2 elems per struct
    - f16 (2 bytes): 8 / 2 = 4 elems per struct
    """
    if dtype == np.float16:
        return 4
    return 2


def write_value_index_pair(f, value, index, dtype):
    """Write a (value, index) pair to file.

    Format: value followed by index (uint32).
    For f16: value (2 bytes) + padding (2 bytes) + index (4 bytes).
    For f32: value (4 bytes) + index (4 bytes).

    Raises:
        TypeError: If ``dtype`` is neither ``np.float32`` nor ``np.float16``.
            Silently writing nothing would leave a truncated data file.
    """
    if dtype == np.float32:
        packed_data = struct.pack('fI', float(value), ctypes.c_uint32(index).value)
    elif dtype == np.float16:
        # f16: directly pack value (np.float16), not float(value)
        # Following pto-isa: struct.pack('e2xI', value, ...)
        packed_data = struct.pack('e2xI', value, ctypes.c_uint32(index).value)
    else:
        raise TypeError(f"unsupported dtype for value/index pair: {dtype}")
    f.write(packed_data)


def _write_pairs(path, values, indices, dtype):
    """Write every (value, index) pair as one 8-byte structure to ``path``."""
    with open(path, 'wb') as f:
        for val, idx in zip(values, indices):
            write_value_index_pair(f, val, idx, dtype)


def gen_golden_single(case):
    """Generate golden data for Format1 (single list internal block sorting).

    Following pto-isa gen_data.py logic:
    - cols = src_cols // elem_divisor (STRUCTURE count, using full src_cols not valid_cols)
    - list_col = block_len // elem_divisor (STRUCTURES per block)
    - block_lens = list_col * 4 (STRUCTURES per vmrgsort4 call)

    Process:
    1. Generate random data for the valid structures
    2. Reshape into blocks (each list_col structures)
    3. Sort each block internally -> input0.bin
    4. Reshape into groups (each block_lens structures)
    5. Globally sort each group -> golden.bin

    For cases where src_cols > valid_cols (padding), both files are padded with
    zeros up to the full src_cols size.
    """
    dtype = case["dtype"]
    _, src_cols = _to_tuple(case["src_shape"])
    _, valid_cols = _to_tuple(case["valid_shape"])
    block_len = case["block_len"]

    # Get element divisor based on dtype (2 for f32, 4 for f16)
    elem_divisor = get_elem_divisor(dtype)

    # Use FULL src_cols for file size (matching pto-isa kGCols)
    cols = src_cols // elem_divisor               # total structures in file
    valid_structs = valid_cols // elem_divisor    # valid structures for computation
    list_col = block_len // elem_divisor          # structures per block
    block_lens = list_col * 4                     # structures per vmrgsort4 call
    repeat_times = valid_structs // block_lens    # vmrgsort4 call times (use valid_structs)
    grouped_structs = repeat_times * block_lens   # structures covered by complete groups
    tail_structs = valid_structs % block_lens     # valid structures outside any complete group
    pad_structs = cols - valid_structs            # padding structures beyond the valid portion

    # Generate random data only for valid portion (matching pto-isa which uses kTCols for computation)
    input_arr = np.random.uniform(low=0.0, high=1.0, size=(1, valid_structs)).astype(dtype)
    idx_arr = np.arange(valid_structs, dtype=np.uint32)

    # Step 1: sort each block internally (descending, stable)
    input_reshaped = input_arr.reshape(-1, list_col)
    idx_reshaped = idx_arr.reshape(-1, list_col)
    sorted_indices = np.argsort(-input_reshaped, kind='stable', axis=1)
    flat_input = np.take_along_axis(input_reshaped, sorted_indices, axis=1).flatten()
    flat_idx = np.take_along_axis(idx_reshaped, sorted_indices, axis=1).flatten()

    # Pad input with zeros if needed (for src_cols > valid_cols cases)
    if pad_structs > 0:
        flat_input = np.concatenate((flat_input, np.zeros(pad_structs, dtype=dtype)))
        flat_idx = np.concatenate((flat_idx, np.zeros(pad_structs, dtype=np.uint32)))

    # Step 2: generate golden (globally sort each complete group of the valid portion)
    single_output_reshape = flat_input[:grouped_structs].reshape(-1, block_lens)
    single_idx_reshape = flat_idx[:grouped_structs].reshape(-1, block_lens)
    single_sorted_indices = np.argsort(-single_output_reshape, kind='stable', axis=1)
    golden_values = np.take_along_axis(single_output_reshape, single_sorted_indices, axis=1).flatten()
    golden_indices = np.take_along_axis(single_idx_reshape, single_sorted_indices, axis=1).flatten()

    # Valid structures that do not fill a complete group are zero in golden
    if tail_structs != 0:
        golden_values = np.concatenate((golden_values, np.zeros(tail_structs, dtype=golden_values.dtype)))
        golden_indices = np.concatenate((golden_indices, np.zeros(tail_structs, dtype=np.uint32)))

    # Pad golden with zeros for full file size (cols > valid_structs)
    if pad_structs > 0:
        golden_values = np.concatenate((golden_values, np.zeros(pad_structs, dtype=golden_values.dtype)))
        golden_indices = np.concatenate((golden_indices, np.zeros(pad_structs, dtype=np.uint32)))

    os.makedirs(case["name"], exist_ok=True)
    _write_pairs(os.path.join(case["name"], "input0.bin"), flat_input, flat_idx, dtype)
    _write_pairs(os.path.join(case["name"], "golden.bin"), golden_values, golden_indices, dtype)

    print(f"[INFO] gen_data: {case['name']} src_cols={src_cols} valid_cols={valid_cols} "
          f"cols={cols} list_col={list_col} block_lens={block_lens} repeat_times={repeat_times}")


def gen_golden_multilist(case):
    """Generate golden data for Format2 (multi-list merge sort).

    Following pto-isa gen_data.py logic for multi-list:
    1. Generate sorted data for each input list (descending order)
    2. Concatenate all lists and globally sort (descending)
    3. Take top-k elements and zero-pad to the total structure count
    4. If exhausted=true, handle special termination logic

    Each input list is written to input<i>.bin; the padded top-k result is
    written to golden.bin.
    """
    dtype = case["dtype"]
    list_num = case["list_num"]
    src_cols = case["src_cols"]  # structures per list
    topk = case["topk"]
    exhausted = case.get("exhausted", False)

    # Generate sorted data for each input list
    output_arr_list = []
    output_idx_list = []
    last_data = []

    total_structures = sum(src_cols)

    for i in range(list_num):
        cols_i = src_cols[i]
        # Generate random data for this list
        input_arr = np.random.uniform(low=0.0, high=1.0, size=(1, cols_i)).astype(dtype)
        idx_arr = np.arange(cols_i, dtype=np.uint32).reshape(1, cols_i)  # Reshape to match input_arr

        # Sort in descending order
        sorted_indices = np.argsort(-input_arr, kind='stable', axis=1)
        flat_input_i = np.take_along_axis(input_arr, sorted_indices, axis=1).flatten()
        flat_idx_i = np.take_along_axis(idx_arr, sorted_indices, axis=1).flatten()

        output_arr_list.append(flat_input_i)
        output_idx_list.append(flat_idx_i)

        # Track last (smallest) element of each list for the exhausted case
        last_data.append(flat_input_i[-1] if cols_i > 0 else 0)

    # Concatenate and globally sort (descending)
    flat_input_group = np.concatenate(output_arr_list).flatten()
    flat_idx_group = np.concatenate(output_idx_list).flatten()

    sorted_indices_global = np.argsort(-flat_input_group, kind='stable')
    sorted_output_global = flat_input_group[sorted_indices_global]
    sorted_idx_global = flat_idx_group[sorted_indices_global]

    # Take top-k
    topk_sorted_output = sorted_output_global[:topk]
    topk_sorted_idx = sorted_idx_global[:topk]

    # Pad zeros up to the total structure count
    zeros_output = np.zeros(total_structures - topk, dtype=topk_sorted_output.dtype)
    zeros_index = np.zeros(total_structures - topk, dtype=np.uint32)
    topk_sorted_output_global = np.concatenate((topk_sorted_output, zeros_output))
    topk_sorted_idx_global = np.concatenate((topk_sorted_idx, zeros_index))

    if exhausted:
        handle_exhausted_list(list_num, topk_sorted_output_global, topk_sorted_idx_global, last_data)

    # Write input files (input0.bin, input1.bin, etc.)
    os.makedirs(case["name"], exist_ok=True)
    for i in range(list_num):
        _write_pairs(os.path.join(case["name"], f"input{i}.bin"),
                     output_arr_list[i], output_idx_list[i], dtype)

    # Write golden output file
    _write_pairs(os.path.join(case["name"], "golden.bin"),
                 topk_sorted_output_global, topk_sorted_idx_global, dtype)

    print(f"[INFO] gen_data: {case['name']} list_num={list_num} "
          f"src_cols={src_cols} total_structures={total_structures} topk={topk} exhausted={exhausted}")


def gen_golden_topk(case):
    """Generate golden data for TopK (full iterative merge).

    Following pto-isa RunTMrgsortTopk logic:
    1. Generate random data and sort each block internally -> input0.bin
       (note: input0.bin holds the block-wise sorted data, not the raw data)
    2. Iterative merge loop: blockLen *= 4
       - Each iteration: Format1 merge (4 blocks -> 1)
    3. If the merged block length still does not cover all structures,
       globally sort the data (simplified tail handling)
    4. Take top-k, zero-pad to the dst capacity -> golden.bin
    """
    dtype = case["dtype"]
    dst_shape = _to_tuple(case["dst_shape"])
    _, src_cols = _to_tuple(case["src_shape"])
    _, valid_cols = _to_tuple(case["valid_shape"])
    topk = case["topk"]  # output structures count
    block_len = case["block_len"]

    # Get element divisor based on dtype
    elem_divisor = get_elem_divisor(dtype)

    # Structure units (following pto-isa)
    cols = valid_cols // elem_divisor     # total structures
    list_col = block_len // elem_divisor  # structures per block

    # Generate random data
    input_arr = np.random.uniform(low=0.0, high=1.0, size=(1, cols)).astype(dtype)
    idx_arr = np.arange(cols, dtype=np.uint32)

    # Step 1: Sort each block internally (Format1 preparation)
    input_reshaped = input_arr.reshape(-1, list_col)
    idx_reshaped = idx_arr.reshape(-1, list_col)

    sorted_indices = np.argsort(-input_reshaped, kind='stable', axis=1)
    # Flatten -> input0.bin (block-wise sorted)
    flat_input = np.take_along_axis(input_reshaped, sorted_indices, axis=1).flatten()
    flat_idx = np.take_along_axis(idx_reshaped, sorted_indices, axis=1).flatten()

    # Step 2: Iterative merge (blockLen *= 4 loop)
    current_data = flat_input.copy()
    current_idx = flat_idx.copy()
    current_block_len = list_col  # structures per block

    iteration = 1
    while current_block_len * 4 <= cols:
        # Merge groups of 4 blocks into 1; structures beyond the last complete
        # group are left untouched in this pass.
        block_lens = current_block_len * 4  # structures per merge group
        num_groups = cols // block_lens

        for g in range(num_groups):
            start = g * block_lens
            end = start + block_lens
            group_vals = current_data[start:end]
            group_idx = current_idx[start:end]

            # Sort this group descending
            sort_indices = np.argsort(-group_vals, kind='stable')
            current_data[start:end] = group_vals[sort_indices]
            current_idx[start:end] = group_idx[sort_indices]

        # Update block length for next iteration
        current_block_len = block_lens
        iteration += 1

    # Step 3: Handle tail blocks (if current_block_len < cols)
    # Simplified: just globally sort the remaining data
    if current_block_len < cols:
        sort_indices = np.argsort(-current_data, kind='stable')
        current_data = current_data[sort_indices]
        current_idx = current_idx[sort_indices]

    # Step 4: Take top-k
    golden_values = current_data[:topk]
    golden_indices = current_idx[:topk]

    # Write files
    os.makedirs(case["name"], exist_ok=True)
    _write_pairs(os.path.join(case["name"], "input0.bin"), flat_input, flat_idx, dtype)

    # Pad zeros if needed (to match dst capacity)
    dst_structures = dst_shape[1] // elem_divisor
    zeros_values = np.zeros(dst_structures - topk, dtype=golden_values.dtype)
    zeros_indices = np.zeros(dst_structures - topk, dtype=np.uint32)
    golden_values_padded = np.concatenate((golden_values, zeros_values))
    golden_indices_padded = np.concatenate((golden_indices, zeros_indices))

    _write_pairs(os.path.join(case["name"], "golden.bin"),
                 golden_values_padded, golden_indices_padded, dtype)

    print(f"[INFO] gen_data: {case['name']} src_cols={src_cols} valid_cols={valid_cols} "
          f"cols={cols} structures topk={topk} structures block_len={block_len} "
          f"iterations={iteration}")


def gen_golden_data():
    """Generate golden data for all cases.

    Calls setup_case_rng(case) before each case and dispatches on the case's
    "format" key (default "single"); unsupported formats only print a warning.
    """
    generators = {
        "single": gen_golden_single,
        "multi": gen_golden_multilist,
        "topk": gen_golden_topk,
    }
    for case in CASES:
        setup_case_rng(case)

        format_type = case.get("format", "single")
        generator = generators.get(format_type)
        if generator is None:
            print(f"[WARN] Unsupported format: {format_type} for case {case['name']}")
            continue
        generator(case)


if __name__ == "__main__":
    gen_golden_data()

# ---- patch residue: header of the next file in the original diff (kept for reference) ----
# diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tmrgsort/launch.cpp b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tmrgsort/launch.cpp
# new file mode 100644
# index 000000000..1ffbd03d5
# --- /dev/null
# +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tmrgsort/launch.cpp
# @@ -0,0 +1,28 @@
# // Copyright (c) 2026 Huawei Technologies Co., Ltd.
# // This program is free software, you can redistribute it and/or modify it under the terms and conditions of
# // CANN Open Software License Agreement Version 2.0 (the "License").
# // Please refer to the License for details. You may not use this file except in compliance with the License.
# // THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
# // INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
# // See LICENSE in the root of the software repository for the full text of the License.
+ +#include + +#ifndef AICORE +#define AICORE [aicore] +#endif + +// Case: f32_single_1x256_b64 (transplanted from pto-isa case_single1) + +extern "C" __global__ AICORE void TMRGSORT_f32_single_1x256_b64(__gm__ float *src, __gm__ float *dst); +extern "C" __global__ AICORE void TMRGSORT_f16_topk_1280_512(__gm__ half *src, __gm__ half *dst); + +void LaunchTMRGSORT_f32_single_1x256_b64(float *src, float *dst, void *stream) { + TMRGSORT_f32_single_1x256_b64<<<1, nullptr, stream>>>((__gm__ float *)src, (__gm__ float *)dst); +} + + + +void LaunchTMRGSORT_f16_topk_1280_512(uint16_t *src, uint16_t *dst, void *stream) { + TMRGSORT_f16_topk_1280_512<<<1, nullptr, stream>>>((__gm__ half *)src, (__gm__ half *)dst); +} diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tmrgsort/main.cpp b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tmrgsort/main.cpp new file mode 100644 index 000000000..071b10f02 --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tmrgsort/main.cpp @@ -0,0 +1,391 @@ +// Copyright (c) 2026 Huawei Technologies Co., Ltd. +// This program is free software, you can redistribute it and/or modify it under the terms and conditions of +// CANN Open Software License Agreement Version 2.0 (the "License"). +// Please refer to the License for details. You may not use this file except in compliance with the License. +// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +// See LICENSE in the root of the software repository for the full text of the License. + +// Host driver for TileLang tmrgsort ST — case-table driven. +// Each case launches a different kernel variant, reads/writes from per-case subdirectory. +// Numerical comparison is done externally by compare.py. 
+ +#include "acl/acl.h" +#include "test_common.h" +#include +#include +#include +#include +#include +#include + +using namespace PtoTestCommon; + +// Kernel launch wrappers (defined in launch.cpp) +void LaunchTMRGSORT_f32_single_1x256_b64(float *src, float *dst, void *stream); +void LaunchTMRGSORT_f32_single_1x320_b64(float *src, float *dst, void *stream); +void LaunchTMRGSORT_f32_single_1x640_b64(float *src, float *dst, void *stream); +void LaunchTMRGSORT_f16_single_1x320_b64(uint16_t *src, uint16_t *dst, void *stream); +void LaunchTMRGSORT_f16_single_1x1024_b256(uint16_t *src, uint16_t *dst, void *stream); + +// Multi-list launch wrappers +void LaunchTMRGSORT_f16_2list_b64_basic(uint16_t *src0, uint16_t *src1, uint16_t *dst, void *stream); +void LaunchTMRGSORT_f32_3list_b64_basic(float *src0, float *src1, float *src2, float *dst, void *stream); +void LaunchTMRGSORT_f16_3list_exhausted(uint16_t *src0, uint16_t *src1, uint16_t *src2, uint16_t *dst, void *stream); +void LaunchTMRGSORT_f32_4list_non_uniform(float *src0, float *src1, float *src2, float *src3, float *dst, void *stream); +void LaunchTMRGSORT_f16_4list_basic(uint16_t *src0, uint16_t *src1, uint16_t *src2, uint16_t *src3, uint16_t *dst, void *stream); + +// TopK launch wrappers +void LaunchTMRGSORT_f32_topk_2048_2048(float *src, float *dst, void *stream); +void LaunchTMRGSORT_f16_topk_2048_1024(uint16_t *src, uint16_t *dst, void *stream); +void LaunchTMRGSORT_f16_topk_1280_512(uint16_t *src, uint16_t *dst, void *stream); + +using LaunchFn = void (*)(void *, void *, void *); +using LaunchFn2 = void (*)(void *, void *, void *, void *); +using LaunchFn3 = void (*)(void *, void *, void *, void *, void *); +using LaunchFn4 = void (*)(void *, void *, void *, void *, void *, void *); + +struct TestCase { + const char *name; + int listNum; // 1 for single-list, 2/3/4 for multi-list + LaunchFn launch; // for single-list + LaunchFn2 launch2; // for 2-list + LaunchFn3 launch3; // for 3-list + LaunchFn4 launch4; // 
for 4-list + size_t srcRows; + size_t srcCols; // for single-list: element count + size_t srcCols0; // for multi-list: src0 element count + size_t srcCols1; // for multi-list: src1 element count + size_t srcCols2; // for multi-list: src2 element count (for 3/4-list) + size_t srcCols3; // for multi-list: src3 element count (for 4-list) + size_t dstRows; + size_t dstCols; // element count + size_t elemSize; // bytes per element (4 for f32, 2 for f16) + size_t structSize; // 8 bytes per (value, index) pair + size_t elemsPerStruct; // structSize / elemSize (2 for f32, 4 for f16) +}; + +static const TestCase kCases[] = { + // Single-list cases (Format1) +{"f32_single_1x256_b64", 1, reinterpret_cast(LaunchTMRGSORT_f32_single_1x256_b64), nullptr, nullptr, nullptr, 1, 256, 0, 0, 0, 0, 1, 256, sizeof(float), 8, 2}, +{"f16_topk_1280_512", 1, reinterpret_cast(LaunchTMRGSORT_f16_topk_1280_512), nullptr, nullptr, nullptr, 1, 1280, 0, 0, 0, 0, 1, 512, sizeof(uint16_t), 8, 4}, +}; +static constexpr size_t kNumCases = sizeof(kCases) / sizeof(kCases[0]); + +static int RunCase(const TestCase &tc, aclrtStream stream) { + int rc = 0; + std::string caseDir = std::string("./") + tc.name; + + // Single-list case (Format1) + if (tc.listNum == 1) { + // srcCols/dstCols are in ELEMENTS, need to convert to STRUCTURE count + // elemsPerStruct = structSize / elemSize (2 for f32, 4 for f16) + size_t srcStructs = tc.srcCols / tc.elemsPerStruct; + size_t dstStructs = tc.dstCols / tc.elemsPerStruct; + + // File sizes in bytes + size_t srcFileSize = tc.srcRows * srcStructs * tc.structSize; + size_t dstFileSize = tc.dstRows * dstStructs * tc.structSize; + + std::printf("[INFO] === case: %s (src=%zux%zu, dst=%zux%zu) ===\n", + tc.name, tc.srcRows, tc.srcCols, tc.dstRows, tc.dstCols); + + void *srcHost = nullptr, *dstHost = nullptr; + void *srcDevice = nullptr, *dstDevice = nullptr; + + aclrtMallocHost((void **)(&srcHost), srcFileSize); + aclrtMallocHost((void **)(&dstHost), dstFileSize); + + 
aclrtMalloc((void **)&srcDevice, srcFileSize, ACL_MEM_MALLOC_HUGE_FIRST); + aclrtMalloc((void **)&dstDevice, dstFileSize, ACL_MEM_MALLOC_HUGE_FIRST); + + if (!ReadFile((caseDir + "/input0.bin").c_str(), srcFileSize, srcHost, srcFileSize)) { + std::fprintf(stderr, "[ERROR] failed to read %s/input0.bin\n", caseDir.c_str()); + rc = 1; + } + + if (rc == 0) { + aclrtMemcpy(srcDevice, srcFileSize, srcHost, srcFileSize, ACL_MEMCPY_HOST_TO_DEVICE); + + tc.launch(srcDevice, dstDevice, stream); + + aclrtSynchronizeStream(stream); + aclrtMemcpy(dstHost, dstFileSize, dstDevice, dstFileSize, ACL_MEMCPY_DEVICE_TO_HOST); + } + + if (rc == 0 && !WriteFile((caseDir + "/output.bin").c_str(), dstHost, dstFileSize)) { + std::fprintf(stderr, "[ERROR] failed to write %s/output.bin\n", caseDir.c_str()); + rc = 1; + } + + if (srcDevice != nullptr) + aclrtFree(srcDevice); + if (dstDevice != nullptr) + aclrtFree(dstDevice); + if (srcHost != nullptr) + aclrtFreeHost(srcHost); + if (dstHost != nullptr) + aclrtFreeHost(dstHost); + + if (rc == 0) + std::printf("[INFO] case %s done\n", tc.name); + } + + // Multi-list case (Format2) + else if (tc.listNum == 2) { + // For 2-list: src0, src1, dst + // srcCols0, srcCols1 are in ELEMENTS, dstCols in ELEMENTS + // elemsPerStruct = structSize / elemSize (2 for f32, 4 for f16) + size_t src0Structs = tc.srcCols0 / tc.elemsPerStruct; + size_t src1Structs = tc.srcCols1 / tc.elemsPerStruct; + size_t dstStructs = tc.dstCols / tc.elemsPerStruct; + + size_t src0FileSize = tc.srcRows * src0Structs * tc.structSize; + size_t src1FileSize = tc.srcRows * src1Structs * tc.structSize; + size_t dstFileSize = tc.dstRows * dstStructs * tc.structSize; + + std::printf("[INFO] === case: %s (src0=%zux%zu, src1=%zux%zu, dst=%zux%zu) ===\n", + tc.name, tc.srcRows, tc.srcCols0, tc.srcRows, tc.srcCols1, tc.dstRows, tc.dstCols); + + void *src0Host = nullptr, *src1Host = nullptr, *dstHost = nullptr; + void *src0Device = nullptr, *src1Device = nullptr, *dstDevice = nullptr; + + 
aclrtMallocHost((void **)(&src0Host), src0FileSize); + aclrtMallocHost((void **)(&src1Host), src1FileSize); + aclrtMallocHost((void **)(&dstHost), dstFileSize); + + aclrtMalloc((void **)&src0Device, src0FileSize, ACL_MEM_MALLOC_HUGE_FIRST); + aclrtMalloc((void **)&src1Device, src1FileSize, ACL_MEM_MALLOC_HUGE_FIRST); + aclrtMalloc((void **)&dstDevice, dstFileSize, ACL_MEM_MALLOC_HUGE_FIRST); + + // Read input0.bin and input1.bin + if (!ReadFile((caseDir + "/input0.bin").c_str(), src0FileSize, src0Host, src0FileSize)) { + std::fprintf(stderr, "[ERROR] failed to read %s/input0.bin\n", caseDir.c_str()); + rc = 1; + } + if (rc == 0 && !ReadFile((caseDir + "/input1.bin").c_str(), src1FileSize, src1Host, src1FileSize)) { + std::fprintf(stderr, "[ERROR] failed to read %s/input1.bin\n", caseDir.c_str()); + rc = 1; + } + + if (rc == 0) { + aclrtMemcpy(src0Device, src0FileSize, src0Host, src0FileSize, ACL_MEMCPY_HOST_TO_DEVICE); + aclrtMemcpy(src1Device, src1FileSize, src1Host, src1FileSize, ACL_MEMCPY_HOST_TO_DEVICE); + + tc.launch2(src0Device, src1Device, dstDevice, stream); + + aclrtSynchronizeStream(stream); + aclrtMemcpy(dstHost, dstFileSize, dstDevice, dstFileSize, ACL_MEMCPY_DEVICE_TO_HOST); + } + + if (rc == 0 && !WriteFile((caseDir + "/output.bin").c_str(), dstHost, dstFileSize)) { + std::fprintf(stderr, "[ERROR] failed to write %s/output.bin\n", caseDir.c_str()); + rc = 1; + } + + if (src0Device != nullptr) aclrtFree(src0Device); + if (src1Device != nullptr) aclrtFree(src1Device); + if (dstDevice != nullptr) aclrtFree(dstDevice); + if (src0Host != nullptr) aclrtFreeHost(src0Host); + if (src1Host != nullptr) aclrtFreeHost(src1Host); + if (dstHost != nullptr) aclrtFreeHost(dstHost); + + if (rc == 0) + std::printf("[INFO] case %s done\n", tc.name); + } + + // 3-list case (Format3) + else if (tc.listNum == 3) { + size_t src0Structs = tc.srcCols0 / tc.elemsPerStruct; + size_t src1Structs = tc.srcCols1 / tc.elemsPerStruct; + size_t src2Structs = tc.srcCols2 / 
tc.elemsPerStruct; + size_t dstStructs = tc.dstCols / tc.elemsPerStruct; + + size_t src0FileSize = tc.srcRows * src0Structs * tc.structSize; + size_t src1FileSize = tc.srcRows * src1Structs * tc.structSize; + size_t src2FileSize = tc.srcRows * src2Structs * tc.structSize; + size_t dstFileSize = tc.dstRows * dstStructs * tc.structSize; + + std::printf("[INFO] === case: %s (src0=%zux%zu, src1=%zux%zu, src2=%zux%zu, dst=%zux%zu) ===\n", + tc.name, tc.srcRows, tc.srcCols0, tc.srcRows, tc.srcCols1, + tc.srcRows, tc.srcCols2, tc.dstRows, tc.dstCols); + + void *src0Host = nullptr, *src1Host = nullptr, *src2Host = nullptr, *dstHost = nullptr; + void *src0Device = nullptr, *src1Device = nullptr, *src2Device = nullptr, *dstDevice = nullptr; + + aclrtMallocHost((void **)(&src0Host), src0FileSize); + aclrtMallocHost((void **)(&src1Host), src1FileSize); + aclrtMallocHost((void **)(&src2Host), src2FileSize); + aclrtMallocHost((void **)(&dstHost), dstFileSize); + + aclrtMalloc((void **)&src0Device, src0FileSize, ACL_MEM_MALLOC_HUGE_FIRST); + aclrtMalloc((void **)&src1Device, src1FileSize, ACL_MEM_MALLOC_HUGE_FIRST); + aclrtMalloc((void **)&src2Device, src2FileSize, ACL_MEM_MALLOC_HUGE_FIRST); + aclrtMalloc((void **)&dstDevice, dstFileSize, ACL_MEM_MALLOC_HUGE_FIRST); + + if (!ReadFile((caseDir + "/input0.bin").c_str(), src0FileSize, src0Host, src0FileSize)) { + std::fprintf(stderr, "[ERROR] failed to read %s/input0.bin\n", caseDir.c_str()); + rc = 1; + } + if (rc == 0 && !ReadFile((caseDir + "/input1.bin").c_str(), src1FileSize, src1Host, src1FileSize)) { + std::fprintf(stderr, "[ERROR] failed to read %s/input1.bin\n", caseDir.c_str()); + rc = 1; + } + if (rc == 0 && !ReadFile((caseDir + "/input2.bin").c_str(), src2FileSize, src2Host, src2FileSize)) { + std::fprintf(stderr, "[ERROR] failed to read %s/input2.bin\n", caseDir.c_str()); + rc = 1; + } + + if (rc == 0) { + aclrtMemcpy(src0Device, src0FileSize, src0Host, src0FileSize, ACL_MEMCPY_HOST_TO_DEVICE); + 
aclrtMemcpy(src1Device, src1FileSize, src1Host, src1FileSize, ACL_MEMCPY_HOST_TO_DEVICE); + aclrtMemcpy(src2Device, src2FileSize, src2Host, src2FileSize, ACL_MEMCPY_HOST_TO_DEVICE); + + tc.launch3(src0Device, src1Device, src2Device, dstDevice, stream); + + aclrtSynchronizeStream(stream); + aclrtMemcpy(dstHost, dstFileSize, dstDevice, dstFileSize, ACL_MEMCPY_DEVICE_TO_HOST); + } + + if (rc == 0 && !WriteFile((caseDir + "/output.bin").c_str(), dstHost, dstFileSize)) { + std::fprintf(stderr, "[ERROR] failed to write %s/output.bin\n", caseDir.c_str()); + rc = 1; + } + + if (src0Device != nullptr) aclrtFree(src0Device); + if (src1Device != nullptr) aclrtFree(src1Device); + if (src2Device != nullptr) aclrtFree(src2Device); + if (dstDevice != nullptr) aclrtFree(dstDevice); + if (src0Host != nullptr) aclrtFreeHost(src0Host); + if (src1Host != nullptr) aclrtFreeHost(src1Host); + if (src2Host != nullptr) aclrtFreeHost(src2Host); + if (dstHost != nullptr) aclrtFreeHost(dstHost); + + if (rc == 0) + std::printf("[INFO] case %s done\n", tc.name); + } + + // 4-list case (Format4) + else if (tc.listNum == 4) { + size_t src0Structs = tc.srcCols0 / tc.elemsPerStruct; + size_t src1Structs = tc.srcCols1 / tc.elemsPerStruct; + size_t src2Structs = tc.srcCols2 / tc.elemsPerStruct; + size_t src3Structs = tc.srcCols3 / tc.elemsPerStruct; + size_t dstStructs = tc.dstCols / tc.elemsPerStruct; + + size_t src0FileSize = tc.srcRows * src0Structs * tc.structSize; + size_t src1FileSize = tc.srcRows * src1Structs * tc.structSize; + size_t src2FileSize = tc.srcRows * src2Structs * tc.structSize; + size_t src3FileSize = tc.srcRows * src3Structs * tc.structSize; + size_t dstFileSize = tc.dstRows * dstStructs * tc.structSize; + + std::printf("[INFO] === case: %s (src0=%zux%zu, src1=%zux%zu, src2=%zux%zu, src3=%zux%zu, dst=%zux%zu) ===\n", + tc.name, tc.srcRows, tc.srcCols0, tc.srcRows, tc.srcCols1, + tc.srcRows, tc.srcCols2, tc.srcRows, tc.srcCols3, + tc.dstRows, tc.dstCols); + + void *src0Host = 
nullptr, *src1Host = nullptr, *src2Host = nullptr, *src3Host = nullptr, *dstHost = nullptr; + void *src0Device = nullptr, *src1Device = nullptr, *src2Device = nullptr, *src3Device = nullptr, *dstDevice = nullptr; + + aclrtMallocHost((void **)(&src0Host), src0FileSize); + aclrtMallocHost((void **)(&src1Host), src1FileSize); + aclrtMallocHost((void **)(&src2Host), src2FileSize); + aclrtMallocHost((void **)(&src3Host), src3FileSize); + aclrtMallocHost((void **)(&dstHost), dstFileSize); + + aclrtMalloc((void **)&src0Device, src0FileSize, ACL_MEM_MALLOC_HUGE_FIRST); + aclrtMalloc((void **)&src1Device, src1FileSize, ACL_MEM_MALLOC_HUGE_FIRST); + aclrtMalloc((void **)&src2Device, src2FileSize, ACL_MEM_MALLOC_HUGE_FIRST); + aclrtMalloc((void **)&src3Device, src3FileSize, ACL_MEM_MALLOC_HUGE_FIRST); + aclrtMalloc((void **)&dstDevice, dstFileSize, ACL_MEM_MALLOC_HUGE_FIRST); + + if (!ReadFile((caseDir + "/input0.bin").c_str(), src0FileSize, src0Host, src0FileSize)) { + std::fprintf(stderr, "[ERROR] failed to read %s/input0.bin\n", caseDir.c_str()); + rc = 1; + } + if (rc == 0 && !ReadFile((caseDir + "/input1.bin").c_str(), src1FileSize, src1Host, src1FileSize)) { + std::fprintf(stderr, "[ERROR] failed to read %s/input1.bin\n", caseDir.c_str()); + rc = 1; + } + if (rc == 0 && !ReadFile((caseDir + "/input2.bin").c_str(), src2FileSize, src2Host, src2FileSize)) { + std::fprintf(stderr, "[ERROR] failed to read %s/input2.bin\n", caseDir.c_str()); + rc = 1; + } + if (rc == 0 && !ReadFile((caseDir + "/input3.bin").c_str(), src3FileSize, src3Host, src3FileSize)) { + std::fprintf(stderr, "[ERROR] failed to read %s/input3.bin\n", caseDir.c_str()); + rc = 1; + } + + if (rc == 0) { + aclrtMemcpy(src0Device, src0FileSize, src0Host, src0FileSize, ACL_MEMCPY_HOST_TO_DEVICE); + aclrtMemcpy(src1Device, src1FileSize, src1Host, src1FileSize, ACL_MEMCPY_HOST_TO_DEVICE); + aclrtMemcpy(src2Device, src2FileSize, src2Host, src2FileSize, ACL_MEMCPY_HOST_TO_DEVICE); + aclrtMemcpy(src3Device, 
src3FileSize, src3Host, src3FileSize, ACL_MEMCPY_HOST_TO_DEVICE); + + tc.launch4(src0Device, src1Device, src2Device, src3Device, dstDevice, stream); + + aclrtSynchronizeStream(stream); + aclrtMemcpy(dstHost, dstFileSize, dstDevice, dstFileSize, ACL_MEMCPY_DEVICE_TO_HOST); + } + + if (rc == 0 && !WriteFile((caseDir + "/output.bin").c_str(), dstHost, dstFileSize)) { + std::fprintf(stderr, "[ERROR] failed to write %s/output.bin\n", caseDir.c_str()); + rc = 1; + } + + if (src0Device != nullptr) aclrtFree(src0Device); + if (src1Device != nullptr) aclrtFree(src1Device); + if (src2Device != nullptr) aclrtFree(src2Device); + if (src3Device != nullptr) aclrtFree(src3Device); + if (dstDevice != nullptr) aclrtFree(dstDevice); + if (src0Host != nullptr) aclrtFreeHost(src0Host); + if (src1Host != nullptr) aclrtFreeHost(src1Host); + if (src2Host != nullptr) aclrtFreeHost(src2Host); + if (src3Host != nullptr) aclrtFreeHost(src3Host); + if (dstHost != nullptr) aclrtFreeHost(dstHost); + + if (rc == 0) + std::printf("[INFO] case %s done\n", tc.name); + } + + else { + std::fprintf(stderr, "[ERROR] Unsupported listNum=%d for case %s\n", tc.listNum, tc.name); + rc = 1; + } + + return rc; +} + +int main(int argc, char *argv[]) { + const char *caseFilter = (argc > 1) ? 
argv[1] : nullptr; + + int rc = 0; + int deviceId = 0; + aclrtStream stream = nullptr; + + aclInit(nullptr); + if (const char *envDevice = std::getenv("ACL_DEVICE_ID")) { + deviceId = std::atoi(envDevice); + } + aclrtSetDevice(deviceId); + aclrtCreateStream(&stream); + + for (size_t i = 0; i < kNumCases; ++i) { + if (caseFilter != nullptr && std::strcmp(kCases[i].name, caseFilter) != 0) { + continue; + } + int ret = RunCase(kCases[i], stream); + if (ret != 0) { + std::fprintf(stderr, "[ERROR] case %s failed\n", kCases[i].name); + rc = 1; + break; + } + } + + if (stream != nullptr) + aclrtDestroyStream(stream); + aclrtResetDevice(deviceId); + aclFinalize(); + + return rc; +} diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tmrgsort/tmrgsort.pto b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tmrgsort/tmrgsort.pto new file mode 100644 index 000000000..cf27d0503 --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tmrgsort/tmrgsort.pto @@ -0,0 +1,176 @@ +// Copyright (c) 2026 Huawei Technologies Co., Ltd. +// This program is free software, you can redistribute it and/or modify it under the terms and conditions of +// CANN Open Software License Agreement Version 2.0 (the "License"). +// Please refer to the License for details. You may not use this file except in compliance with the License. +// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +// See LICENSE in the root of the software repository for the full text of the License. + +// TileLang ST kernels for pto.tmrgsort Format1: single list internal block sorting. +// Input is divided into 4 blocks, each block sorted, then merged. +// Output: interleaved (sorted_value, original_index) pairs. +// Compiled by ptoas --enable-insert-sync --enable-tile-op-expand --vpto-emit-hivm-llvm +// to produce LLVM IR. 
+ +module attributes {pto.target_arch = "a5", pto.kernel_kind = #pto.kernel_kind} { + // Case: f32_single_1x256_b64 - transplanted from pto-isa case_single1 + // TMrgsortSingle + // cols=256 float elements = 128 structures + // block_len=64 float elements = 32 structures/block + func.func @TMRGSORT_f32_single_1x256_b64(%src_ptr: !pto.ptr, %dst_ptr: !pto.ptr) attributes {pto.kernel} { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c64 = arith.constant 64 : i32 // block_len (float elements) + %c256 = arith.constant 256 : index // kGCols = total float elements + + %src_view = pto.make_tensor_view %src_ptr, + shape = [%c1, %c1, %c1, %c1, %c256], + strides = [%c256, %c256, %c256, %c256, %c1] + : !pto.tensor_view<1x1x1x1x256xf32> + %dst_view = pto.make_tensor_view %dst_ptr, + shape = [%c1, %c1, %c1, %c1, %c256], + strides = [%c256, %c256, %c256, %c256, %c1] + : !pto.tensor_view<1x1x1x1x256xf32> + + %src_part = pto.partition_view %src_view, + offsets = [%c0, %c0, %c0, %c0, %c0], + sizes = [%c1, %c1, %c1, %c1, %c256] + : !pto.tensor_view<1x1x1x1x256xf32> -> !pto.partition_tensor_view<1x1x1x1x256xf32> + %dst_part = pto.partition_view %dst_view, + offsets = [%c0, %c0, %c0, %c0, %c0], + sizes = [%c1, %c1, %c1, %c1, %c256] + : !pto.tensor_view<1x1x1x1x256xf32> -> !pto.partition_tensor_view<1x1x1x1x256xf32> + + %src_tile = pto.alloc_tile + : !pto.tile_buf + %dst_tile = pto.alloc_tile + : !pto.tile_buf + + pto.tload ins(%src_part : !pto.partition_tensor_view<1x1x1x1x256xf32>) + outs(%src_tile : !pto.tile_buf) + + pto.tmrgsort ins(%src_tile, %c64 : !pto.tile_buf, i32) + outs(%dst_tile : !pto.tile_buf) + + pto.tstore ins(%dst_tile : !pto.tile_buf) + outs(%dst_part : !pto.partition_tensor_view<1x1x1x1x256xf32>) + return + } + + // Case: f32_single_1x320_b64 - transplanted from pto-isa case_single2 + // TMrgsortSingle + // kGCols=320 (global memory), kTCols=256 (effective tile region) + // cols=256 float elements = 128 structures + // block_len=64 float 
elements = 32 structures/block + +func.func @TMRGSORT_f16_topk_1280_512(%src_ptr: !pto.ptr, %dst_ptr: !pto.ptr) attributes {pto.kernel} { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c64 = arith.constant 64 : i32 // blockLen iteration 1 (f16 elements) + %c256 = arith.constant 256 : i32 // blockLen iteration 2 (f16 elements) + %c256_idx = arith.constant 256 : index // block1 size in elements (64 structures * 4) + %c512 = arith.constant 512 : index // block0 size (256 structures * 4), also dst cols (topk) + %c768 = arith.constant 768 : index // block1 offset (512 + 256) + %c1024 = arith.constant 1024 : index // block0 end offset (for partition) + %c1280 = arith.constant 1280 : index // src cols + + %src_view = pto.make_tensor_view %src_ptr, + shape = [%c1, %c1, %c1, %c1, %c1280], + strides = [%c1280, %c1280, %c1280, %c1280, %c1] + : !pto.tensor_view<1x1x1x1x1280xf16> + %dst_view = pto.make_tensor_view %dst_ptr, + shape = [%c1, %c1, %c1, %c1, %c512], + strides = [%c512, %c512, %c512, %c512, %c1] + : !pto.tensor_view<1x1x1x1x512xf16> + + %src_part = pto.partition_view %src_view, + offsets = [%c0, %c0, %c0, %c0, %c0], + sizes = [%c1, %c1, %c1, %c1, %c1280] + : !pto.tensor_view<1x1x1x1x1280xf16> -> !pto.partition_tensor_view<1x1x1x1x1280xf16> + %dst_part = pto.partition_view %dst_view, + offsets = [%c0, %c0, %c0, %c0, %c0], + sizes = [%c1, %c1, %c1, %c1, %c512] + : !pto.tensor_view<1x1x1x1x512xf16> -> !pto.partition_tensor_view<1x1x1x1x512xf16> + + // Allocate tiles for iterative merge + %src_tile = pto.alloc_tile + : !pto.tile_buf + %tmp_tile = pto.alloc_tile + : !pto.tile_buf + %dst_tile = pto.alloc_tile + : !pto.tile_buf + // Tiles for Format2 merge + // block0: 256 structures = 1024 f16 elements + // block1: 64 structures = 256 f16 elements + %block0_tile = pto.alloc_tile + : !pto.tile_buf + %block1_tile = pto.alloc_tile + : !pto.tile_buf + %merge_tmp_tile = pto.alloc_tile + : !pto.tile_buf + %merge_dst_tile = pto.alloc_tile + : !pto.tile_buf 
+ %ex_vec = arith.constant dense<0> : vector<4xi16> + + // Load unsorted data to src_tile + pto.tload ins(%src_part : !pto.partition_tensor_view<1x1x1x1x1280xf16>) + outs(%src_tile : !pto.tile_buf) + + // Iteration 1: blockLen=64 f16 elements (=16 structures) + // After: 5 blocks of 64 structures each + pto.tmrgsort ins(%src_tile, %c64 : + !pto.tile_buf, + i32) + outs(%tmp_tile : !pto.tile_buf) + + // Copy result back to src_tile for next iteration + pto.tmov ins(%tmp_tile : !pto.tile_buf) + outs(%src_tile : !pto.tile_buf) + + // Iteration 2: blockLen=256 f16 elements (=64 structures) + // After: 1 block of 256 structures + 1 tail of 64 structures + pto.tmrgsort ins(%src_tile, %c256 : + !pto.tile_buf, + i32) + outs(%tmp_tile : !pto.tile_buf) + + // Store tmp_tile back to src memory (reuse as intermediate buffer) + pto.tstore ins(%tmp_tile : !pto.tile_buf) + outs(%src_part : !pto.partition_tensor_view<1x1x1x1x1280xf16>) + + // Load block0 (0-1023) into block0_tile (256 structures = 1024 f16 elements) + %block0_part = pto.partition_view %src_view, + offsets = [%c0, %c0, %c0, %c0, %c0], + sizes = [%c1, %c1, %c1, %c1, %c1024] + : !pto.tensor_view<1x1x1x1x1280xf16> -> !pto.partition_tensor_view<1x1x1x1x1024xf16> + pto.tload ins(%block0_part : !pto.partition_tensor_view<1x1x1x1x1024xf16>) + outs(%block0_tile : !pto.tile_buf) + + // Load block1 (1024-1279) into block1_tile (64 structures = 256 f16 elements) + %block1_part = pto.partition_view %src_view, + offsets = [%c0, %c0, %c0, %c0, %c1024], + sizes = [%c1, %c1, %c1, %c1, %c256_idx] + : !pto.tensor_view<1x1x1x1x1280xf16> -> !pto.partition_tensor_view<1x1x1x1x256xf16> + pto.tload ins(%block1_part : !pto.partition_tensor_view<1x1x1x1x256xf16>) + outs(%block1_tile : !pto.tile_buf) + + // Format2 merge: merge block0 (256 structures) and block1 (64 structures) + // Output: 320 sorted structures, dst takes topk=128 structures (512 f16 elems) + pto.tmrgsort ins(%block0_tile, %block1_tile, %merge_tmp_tile {exhausted = false} : 
+ !pto.tile_buf, + !pto.tile_buf, + !pto.tile_buf) + outs(%merge_dst_tile, %ex_vec : + !pto.tile_buf, + vector<4xi16>) + + // Take top-k from merged result (merge_dst_tile already has topk=128 structures) + pto.tmov ins(%merge_dst_tile : !pto.tile_buf) + outs(%dst_tile : !pto.tile_buf) + + // Store top-k result + pto.tstore ins(%dst_tile : !pto.tile_buf) + outs(%dst_part : !pto.partition_tensor_view<1x1x1x1x512xf16>) + return + } +} diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tmul/CMakeLists.txt b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tmul/CMakeLists.txt new file mode 100644 index 000000000..d5f713730 --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tmul/CMakeLists.txt @@ -0,0 +1,9 @@ +# Copyright (c) 2026 Huawei Technologies Co., Ltd. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. + +pto_tilelang_vec_st(tmul) diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tmul/cases.py b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tmul/cases.py new file mode 100644 index 000000000..145e8184d --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tmul/cases.py @@ -0,0 +1,47 @@ +# Copyright (c) 2026 Huawei Technologies Co., Ltd. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. 
You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. + +# coding=utf-8 + +"""Single source of truth for tmul ST test cases. + +Each case defines: + - name: case identifier, used as subdirectory name and by main.cpp kCases[]. + - dtype: numpy dtype (e.g. np.float32). + - shape: (rows, cols) — allocated tile dimensions. + - valid_shape: (valid_rows, valid_cols) — effective computation region. + - eps: tolerance for numpy.allclose (atol and rtol). + +gen_data.py and compare.py both import this list to avoid redundant definitions. +""" + +import numpy as np + +CASES = [ + { + "name": "f32_16x64", + "dtype": np.float32, + "shape": (16, 64), + "valid_shape": (16, 64), + "eps": 1e-6, + }, + { + "name": "f32_32x32", + "dtype": np.float32, + "shape": (32, 32), + "valid_shape": (32, 32), + "eps": 1e-6, + }, +] + +_SMOKE_CASE_NAMES = ['f32_16x64', 'f32_32x32'] +_SMOKE_CASE_NAME_SET = set(_SMOKE_CASE_NAMES) +_missing = [name for name in _SMOKE_CASE_NAMES if name not in {case["name"] for case in CASES}] +if _missing: + raise RuntimeError("unknown smoke case(s): " + ", ".join(_missing)) +CASES = [case for case in CASES if case["name"] in _SMOKE_CASE_NAME_SET] diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tmul/compare.py b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tmul/compare.py new file mode 100644 index 000000000..6a4d5d1aa --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tmul/compare.py @@ -0,0 +1,48 @@ +# Copyright (c) 2026 Huawei Technologies Co., Ltd. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). 
+# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. + +# coding=utf-8 + +import os +import sys +import numpy as np + +from cases import CASES +from st_common import result_cmp, style_fail, style_pass, validate_cases + + +def main(): + validate_cases(CASES) + case_filter = sys.argv[1] if len(sys.argv) > 1 else None + + all_passed = True + for case in CASES: + if case_filter is not None and case["name"] != case_filter: + continue + + case_dir = case["name"] + shape = case["shape"] + vr, vc = case["valid_shape"] + + golden = np.fromfile(os.path.join(case_dir, "golden.bin"), dtype=case["dtype"]).reshape(shape) + output = np.fromfile(os.path.join(case_dir, "output.bin"), dtype=case["dtype"]).reshape(shape) + + ok = result_cmp(golden[:vr, :vc], output[:vr, :vc], case["eps"]) + if ok: + print(style_pass(f"[INFO] {case['name']}: compare passed")) + else: + print(style_fail(f"[ERROR] {case['name']}: compare failed")) + all_passed = False + + if not all_passed: + sys.exit(2) + print(style_pass("[INFO] all cases passed")) + + +if __name__ == "__main__": + main() diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tmul/gen_data.py b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tmul/gen_data.py new file mode 100644 index 000000000..72190f37e --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tmul/gen_data.py @@ -0,0 +1,32 @@ +# Copyright (c) 2026 Huawei Technologies Co., Ltd. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. 
You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. + +# coding=utf-8 + +import numpy as np +from cases import CASES +from st_common import validate_cases, setup_case_rng, save_case_data + +validate_cases(CASES) + +for case in CASES: + setup_case_rng(case) + + dtype = case["dtype"] + shape = case["shape"] + valid_shape = case["valid_shape"] + + input1 = np.random.randint(1, 10, size=shape).astype(dtype) + input2 = np.random.randint(1, 10, size=shape).astype(dtype) + + golden = np.zeros(shape, dtype=dtype) + vr, vc = valid_shape + golden[:vr, :vc] = (input1[:vr, :vc] * input2[:vr, :vc]).astype(dtype, copy=False) + + save_case_data(case["name"], {"input1": input1, "input2": input2, "golden": golden}) + print(f"[INFO] gen_data: {case['name']} shape={shape} valid_shape={valid_shape} dtype={dtype.__name__}") diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tmul/launch.cpp b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tmul/launch.cpp new file mode 100644 index 000000000..8fd8184ff --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tmul/launch.cpp @@ -0,0 +1,28 @@ +// Copyright (c) 2026 Huawei Technologies Co., Ltd. +// This program is free software, you can redistribute it and/or modify it under the terms and conditions of +// CANN Open Software License Agreement Version 2.0 (the "License"). +// Please refer to the License for details. You may not use this file except in compliance with the License. +// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. 
+// See LICENSE in the root of the software repository for the full text of the License. + +#include + +#ifndef AICORE +#define AICORE [aicore] +#endif + +// Case 0: f32 16x64 + +extern "C" __global__ AICORE void TMUL_f32_16x64(__gm__ float *a, __gm__ float *b, __gm__ float *c); +extern "C" __global__ AICORE void TMUL_f32_32x32(__gm__ float *a, __gm__ float *b, __gm__ float *c); + +void LaunchTMUL_f32_16x64(float *a, float *b, float *c, void *stream) { + TMUL_f32_16x64<<<1, nullptr, stream>>>((__gm__ float *)a, (__gm__ float *)b, (__gm__ float *)c); +} + + + +void LaunchTMUL_f32_32x32(float *a, float *b, float *c, void *stream) { + TMUL_f32_32x32<<<1, nullptr, stream>>>((__gm__ float *)a, (__gm__ float *)b, (__gm__ float *)c); +} diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tmul/main.cpp b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tmul/main.cpp new file mode 100644 index 000000000..5bf033def --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tmul/main.cpp @@ -0,0 +1,145 @@ +// Copyright (c) 2026 Huawei Technologies Co., Ltd. +// This program is free software, you can redistribute it and/or modify it under the terms and conditions of +// CANN Open Software License Agreement Version 2.0 (the "License"). +// Please refer to the License for details. You may not use this file except in compliance with the License. +// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +// See LICENSE in the root of the software repository for the full text of the License. + +// Host driver for TileLang tmul ST — case-table driven. +// Each case launches a different kernel variant, reads/writes from per-case subdirectory. +// Numerical comparison is done externally by compare.py. 
+ +#include "acl/acl.h" +#include "test_common.h" +#include +#include +#include +#include +#include +#include + +using namespace PtoTestCommon; + +// Kernel launch wrappers (defined in launch.cpp) +void LaunchTMUL_f32_16x64(float *a, float *b, float *c, void *stream); +void LaunchTMUL_f32_32x32(float *a, float *b, float *c, void *stream); + +using LaunchFn = void (*)(float *, float *, float *, void *); + +struct TestCase { + const char *name; + LaunchFn launch; + size_t rows; // allocated tile rows + size_t cols; // allocated tile cols + size_t validRows; // effective computation rows (<= rows) + size_t validCols; // effective computation cols (<= cols) + size_t elemSize; // bytes per element +}; + +static const TestCase kCases[] = { +{"f32_16x64", LaunchTMUL_f32_16x64, 16, 64, 16, 64, sizeof(float)}, +{"f32_32x32", LaunchTMUL_f32_32x32, 32, 32, 32, 32, sizeof(float)}, +}; +static constexpr size_t kNumCases = sizeof(kCases) / sizeof(kCases[0]); + +static int RunCase(const TestCase &tc, int deviceId, aclrtStream stream) { + int rc = 0; + const size_t elemCount = tc.rows * tc.cols; + const size_t fileSize = elemCount * tc.elemSize; + + std::printf("[INFO] === case: %s (shape=%zux%zu, valid=%zux%zu) ===\n", + tc.name, tc.rows, tc.cols, tc.validRows, tc.validCols); + + // Per-case data directory + std::string caseDir = std::string("./") + tc.name; + size_t src0FileSize = fileSize; + size_t src1FileSize = fileSize; + + float *src0Host = nullptr, *src1Host = nullptr, *dstHost = nullptr; + float *src0Device = nullptr, *src1Device = nullptr, *dstDevice = nullptr; + + aclrtMallocHost((void **)(&src0Host), fileSize); + aclrtMallocHost((void **)(&src1Host), fileSize); + aclrtMallocHost((void **)(&dstHost), fileSize); + + aclrtMalloc((void **)&src0Device, fileSize, ACL_MEM_MALLOC_HUGE_FIRST); + aclrtMalloc((void **)&src1Device, fileSize, ACL_MEM_MALLOC_HUGE_FIRST); + aclrtMalloc((void **)&dstDevice, fileSize, ACL_MEM_MALLOC_HUGE_FIRST); + + if (!ReadFile((caseDir + 
"/input1.bin").c_str(), src0FileSize, src0Host, fileSize)) { + std::fprintf(stderr, "[ERROR] failed to read %s/input1.bin\n", caseDir.c_str()); + rc = 1; + } + if (rc == 0 && !ReadFile((caseDir + "/input2.bin").c_str(), src1FileSize, src1Host, fileSize)) { + std::fprintf(stderr, "[ERROR] failed to read %s/input2.bin\n", caseDir.c_str()); + rc = 1; + } + + if (rc == 0) { + aclrtMemcpy(src0Device, fileSize, src0Host, fileSize, ACL_MEMCPY_HOST_TO_DEVICE); + aclrtMemcpy(src1Device, fileSize, src1Host, fileSize, ACL_MEMCPY_HOST_TO_DEVICE); + + tc.launch(src0Device, src1Device, dstDevice, stream); + + aclrtSynchronizeStream(stream); + aclrtMemcpy(dstHost, fileSize, dstDevice, fileSize, ACL_MEMCPY_DEVICE_TO_HOST); + } + + if (rc == 0 && !WriteFile((caseDir + "/output.bin").c_str(), dstHost, fileSize)) { + std::fprintf(stderr, "[ERROR] failed to write %s/output.bin\n", caseDir.c_str()); + rc = 1; + } + + if (src0Device != nullptr) + aclrtFree(src0Device); + if (src1Device != nullptr) + aclrtFree(src1Device); + if (dstDevice != nullptr) + aclrtFree(dstDevice); + if (src0Host != nullptr) + aclrtFreeHost(src0Host); + if (src1Host != nullptr) + aclrtFreeHost(src1Host); + if (dstHost != nullptr) + aclrtFreeHost(dstHost); + + if (rc == 0) + std::printf("[INFO] case %s done\n", tc.name); + return rc; +} + +int main(int argc, char *argv[]) { + // Optional case filter: ./tmul [case_name] + const char *caseFilter = (argc > 1) ? 
argv[1] : nullptr; + + int rc = 0; + int deviceId = 0; + aclrtStream stream = nullptr; + + aclInit(nullptr); + if (const char *envDevice = std::getenv("ACL_DEVICE_ID")) { + deviceId = std::atoi(envDevice); + } + aclrtSetDevice(deviceId); + aclrtCreateStream(&stream); + + for (size_t i = 0; i < kNumCases; ++i) { + if (caseFilter != nullptr && std::strcmp(kCases[i].name, caseFilter) != 0) { + continue; + } + int ret = RunCase(kCases[i], deviceId, stream); + if (ret != 0) { + std::fprintf(stderr, "[ERROR] case %s failed\n", kCases[i].name); + rc = 1; + break; + } + } + + if (stream != nullptr) + aclrtDestroyStream(stream); + aclrtResetDevice(deviceId); + aclFinalize(); + + return rc; +} diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tmul/tmul.pto b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tmul/tmul.pto new file mode 100644 index 000000000..da7c326e0 --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tmul/tmul.pto @@ -0,0 +1,124 @@ +// Copyright (c) 2026 Huawei Technologies Co., Ltd. +// This program is free software, you can redistribute it and/or modify it under the terms and conditions of +// CANN Open Software License Agreement Version 2.0 (the "License"). +// Please refer to the License for details. You may not use this file except in compliance with the License. +// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +// See LICENSE in the root of the software repository for the full text of the License. + +// TileLang ST kernels for pto.tmul: tload(a) + tload(b) + tmul(a,b)->c + tstore(c). +// Multiple cases with different shapes in a single module. +// Compiled by ptoas --enable-insert-sync --enable-tile-op-expand --pto-backend=vpto +// to produce a fatobj object. 
+ +module attributes {pto.target_arch = "a5", pto.kernel_kind = #pto.kernel_kind} { + // Case 0: f32 16x64 (1024 elements) + func.func @TMUL_f32_16x64(%a_ptr: !pto.ptr, %b_ptr: !pto.ptr, %c_ptr: !pto.ptr) attributes {pto.kernel} { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c16 = arith.constant 16 : index + %c64 = arith.constant 64 : index + %c1024 = arith.constant 1024 : index + + %a_view = pto.make_tensor_view %a_ptr, + shape = [%c1, %c1, %c1, %c16, %c64], + strides = [%c1024, %c1024, %c1024, %c64, %c1] + : !pto.tensor_view<1x1x1x16x64xf32> + %b_view = pto.make_tensor_view %b_ptr, + shape = [%c1, %c1, %c1, %c16, %c64], + strides = [%c1024, %c1024, %c1024, %c64, %c1] + : !pto.tensor_view<1x1x1x16x64xf32> + %c_view = pto.make_tensor_view %c_ptr, + shape = [%c1, %c1, %c1, %c16, %c64], + strides = [%c1024, %c1024, %c1024, %c64, %c1] + : !pto.tensor_view<1x1x1x16x64xf32> + + %a_part = pto.partition_view %a_view, + offsets = [%c0, %c0, %c0, %c0, %c0], + sizes = [%c1, %c1, %c1, %c16, %c64] + : !pto.tensor_view<1x1x1x16x64xf32> -> !pto.partition_tensor_view<1x1x1x16x64xf32> + %b_part = pto.partition_view %b_view, + offsets = [%c0, %c0, %c0, %c0, %c0], + sizes = [%c1, %c1, %c1, %c16, %c64] + : !pto.tensor_view<1x1x1x16x64xf32> -> !pto.partition_tensor_view<1x1x1x16x64xf32> + %c_part = pto.partition_view %c_view, + offsets = [%c0, %c0, %c0, %c0, %c0], + sizes = [%c1, %c1, %c1, %c16, %c64] + : !pto.tensor_view<1x1x1x16x64xf32> -> !pto.partition_tensor_view<1x1x1x16x64xf32> + + %a = pto.alloc_tile + : !pto.tile_buf + %b = pto.alloc_tile + : !pto.tile_buf + %c = pto.alloc_tile + : !pto.tile_buf + + pto.tload ins(%a_part : !pto.partition_tensor_view<1x1x1x16x64xf32>) + outs(%a : !pto.tile_buf) + pto.tload ins(%b_part : !pto.partition_tensor_view<1x1x1x16x64xf32>) + outs(%b : !pto.tile_buf) + + pto.tmul ins(%a, %b : !pto.tile_buf, + !pto.tile_buf) + outs(%c : !pto.tile_buf) + + pto.tstore ins(%c : !pto.tile_buf) + outs(%c_part : 
!pto.partition_tensor_view<1x1x1x16x64xf32>) + return + } + + // Case 1: f32 32x32 (1024 elements) + + func.func @TMUL_f32_32x32(%a_ptr: !pto.ptr, %b_ptr: !pto.ptr, %c_ptr: !pto.ptr) attributes {pto.kernel} { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c32 = arith.constant 32 : index + %c1024 = arith.constant 1024 : index + + %a_view = pto.make_tensor_view %a_ptr, + shape = [%c1, %c1, %c1, %c32, %c32], + strides = [%c1024, %c1024, %c1024, %c32, %c1] + : !pto.tensor_view<1x1x1x32x32xf32> + %b_view = pto.make_tensor_view %b_ptr, + shape = [%c1, %c1, %c1, %c32, %c32], + strides = [%c1024, %c1024, %c1024, %c32, %c1] + : !pto.tensor_view<1x1x1x32x32xf32> + %c_view = pto.make_tensor_view %c_ptr, + shape = [%c1, %c1, %c1, %c32, %c32], + strides = [%c1024, %c1024, %c1024, %c32, %c1] + : !pto.tensor_view<1x1x1x32x32xf32> + + %a_part = pto.partition_view %a_view, + offsets = [%c0, %c0, %c0, %c0, %c0], + sizes = [%c1, %c1, %c1, %c32, %c32] + : !pto.tensor_view<1x1x1x32x32xf32> -> !pto.partition_tensor_view<1x1x1x32x32xf32> + %b_part = pto.partition_view %b_view, + offsets = [%c0, %c0, %c0, %c0, %c0], + sizes = [%c1, %c1, %c1, %c32, %c32] + : !pto.tensor_view<1x1x1x32x32xf32> -> !pto.partition_tensor_view<1x1x1x32x32xf32> + %c_part = pto.partition_view %c_view, + offsets = [%c0, %c0, %c0, %c0, %c0], + sizes = [%c1, %c1, %c1, %c32, %c32] + : !pto.tensor_view<1x1x1x32x32xf32> -> !pto.partition_tensor_view<1x1x1x32x32xf32> + + %a = pto.alloc_tile + : !pto.tile_buf + %b = pto.alloc_tile + : !pto.tile_buf + %c = pto.alloc_tile + : !pto.tile_buf + + pto.tload ins(%a_part : !pto.partition_tensor_view<1x1x1x32x32xf32>) + outs(%a : !pto.tile_buf) + pto.tload ins(%b_part : !pto.partition_tensor_view<1x1x1x32x32xf32>) + outs(%b : !pto.tile_buf) + + pto.tmul ins(%a, %b : !pto.tile_buf, + !pto.tile_buf) + outs(%c : !pto.tile_buf) + + pto.tstore ins(%c : !pto.tile_buf) + outs(%c_part : !pto.partition_tensor_view<1x1x1x32x32xf32>) + return + } +} diff --git 
a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tmuls/CMakeLists.txt b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tmuls/CMakeLists.txt new file mode 100644 index 000000000..49ba8cd84 --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tmuls/CMakeLists.txt @@ -0,0 +1,9 @@ +# Copyright (c) 2026 Huawei Technologies Co., Ltd. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. + +pto_tilelang_vec_st(tmuls) diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tmuls/cases.py b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tmuls/cases.py new file mode 100644 index 000000000..6c59ec275 --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tmuls/cases.py @@ -0,0 +1,76 @@ +# Copyright (c) 2026 Huawei Technologies Co., Ltd. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. + +# coding=utf-8 + +"""Single source of truth for tmuls ST test cases. 
+ +Shapes and dtype match testcase/tadds (C++ GTest suite): + case1: float, 32x64, valid 32x64 + case2: float16, 63x64, valid 63x64 + case3: int32, 31x128, valid 31x128 + case4: int16, 15x192, valid 15x192 + case5: float, 7x448, valid 7x448 + case6: float, 256x16, valid 256x16 + +gen_data.py and compare.py both import this list to avoid redundant definitions. +""" + +import numpy as np + +CASES = [ + { + "name": "f32_32x64", + "dtype": np.float32, + "shape": (32, 64), + "valid_shape": (32, 64), + "eps": 1e-6, + }, + { + "name": "f16_63x64", + "dtype": np.float16, + "shape": (63, 64), + "valid_shape": (63, 64), + "eps": 1e-3, + }, + { + "name": "i32_31x128", + "dtype": np.int32, + "shape": (31, 128), + "valid_shape": (31, 128), + "eps": 0, + }, + { + "name": "i16_15x192", + "dtype": np.int16, + "shape": (15, 192), + "valid_shape": (15, 192), + "eps": 0, + }, + { + "name": "f32_7x448", + "dtype": np.float32, + "shape": (7, 448), + "valid_shape": (7, 448), + "eps": 1e-6, + }, + { + "name": "f32_256x16", + "dtype": np.float32, + "shape": (256, 16), + "valid_shape": (256, 16), + "eps": 1e-6, + }, +] + +_SMOKE_CASE_NAMES = ['f32_32x64', 'i16_15x192'] +_SMOKE_CASE_NAME_SET = set(_SMOKE_CASE_NAMES) +_missing = [name for name in _SMOKE_CASE_NAMES if name not in {case["name"] for case in CASES}] +if _missing: + raise RuntimeError("unknown smoke case(s): " + ", ".join(_missing)) +CASES = [case for case in CASES if case["name"] in _SMOKE_CASE_NAME_SET] diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tmuls/compare.py b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tmuls/compare.py new file mode 100644 index 000000000..50186777e --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tmuls/compare.py @@ -0,0 +1,46 @@ +# Copyright (c) 2026 Huawei Technologies Co., Ltd. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). 
+# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. + +# coding=utf-8 + +import os +import sys +import numpy as np + +from cases import CASES +from st_common import result_cmp, style_fail, style_pass, validate_cases + +def main(): + validate_cases(CASES) + case_filter = sys.argv[1] if len(sys.argv) > 1 else None + + all_passed = True + for case in CASES: + if case_filter is not None and case["name"] != case_filter: + continue + + case_dir = case["name"] + shape = case["shape"] + vr, vc = case["valid_shape"] + + golden = np.fromfile(os.path.join(case_dir, "golden.bin"), dtype=case["dtype"]).reshape(shape) + output = np.fromfile(os.path.join(case_dir, "output.bin"), dtype=case["dtype"]).reshape(shape) + + ok = result_cmp(golden[:vr, :vc], output[:vr, :vc], case["eps"]) + if ok: + print(style_pass(f"[INFO] {case['name']}: compare passed")) + else: + print(style_fail(f"[ERROR] {case['name']}: compare failed")) + all_passed = False + + if not all_passed: + sys.exit(2) + print(style_pass("[INFO] all cases passed")) + +if __name__ == "__main__": + main() diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tmuls/gen_data.py b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tmuls/gen_data.py new file mode 100644 index 000000000..a98114643 --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tmuls/gen_data.py @@ -0,0 +1,35 @@ +# Copyright (c) 2026 Huawei Technologies Co., Ltd. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. 
You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. + +# coding=utf-8 + +import numpy as np +from cases import CASES +from st_common import validate_cases, setup_case_rng, save_case_data + +# Scalar value multiplied into every element (matches the scalar passed in launch.cpp) +SCALAR = 3.0 + +validate_cases(CASES) + +for case in CASES: + setup_case_rng(case) + + dtype = case["dtype"] + shape = case["shape"] + valid_shape = case["valid_shape"] + + input1 = np.random.randint(1, 10, size=shape).astype(dtype) + + golden = np.zeros(shape, dtype=dtype) + vr, vc = valid_shape + scalar_val = dtype(SCALAR) + golden[:vr, :vc] = (input1[:vr, :vc] * scalar_val).astype(dtype, copy=False) + + save_case_data(case["name"], {"input1": input1, "golden": golden}) + print(f"[INFO] gen_data: {case['name']} shape={shape} valid_shape={valid_shape} dtype={dtype.__name__} scalar={SCALAR}") diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tmuls/launch.cpp b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tmuls/launch.cpp new file mode 100644 index 000000000..d6c249009 --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tmuls/launch.cpp @@ -0,0 +1,31 @@ +// Copyright (c) 2026 Huawei Technologies Co., Ltd. +// This program is free software, you can redistribute it and/or modify it under the terms and conditions of +// CANN Open Software License Agreement Version 2.0 (the "License"). +// Please refer to the License for details. You may not use this file except in compliance with the License. 
+// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +// See LICENSE in the root of the software repository for the full text of the License. + +#include + +#ifndef AICORE +#define AICORE [aicore] +#endif + +// Scalar value multiplied into every element (must match gen_data.py SCALAR) +static constexpr float TMULS_SCALAR_F32 = 3.0f; + +// Case 0: f32 32x64 + +extern "C" __global__ AICORE void TMULS_f32_32x64(__gm__ float *src, __gm__ float *dst, float scalar); +extern "C" __global__ AICORE void TMULS_i16_15x192(__gm__ int16_t *src, __gm__ int16_t *dst, int16_t scalar); + +void LaunchTMULS_f32_32x64(float *src, float *dst, void *stream) { + TMULS_f32_32x64<<<1, nullptr, stream>>>((__gm__ float *)src, (__gm__ float *)dst, TMULS_SCALAR_F32); +} + + + +void LaunchTMULS_i16_15x192(int16_t *src, int16_t *dst, void *stream) { + TMULS_i16_15x192<<<1, nullptr, stream>>>((__gm__ int16_t *)src, (__gm__ int16_t *)dst, (int16_t)3); +} diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tmuls/main.cpp b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tmuls/main.cpp new file mode 100644 index 000000000..e5ba6457f --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tmuls/main.cpp @@ -0,0 +1,133 @@ +// Copyright (c) 2026 Huawei Technologies Co., Ltd. +// This program is free software, you can redistribute it and/or modify it under the terms and conditions of +// CANN Open Software License Agreement Version 2.0 (the "License"). +// Please refer to the License for details. You may not use this file except in compliance with the License. +// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. 
+// See LICENSE in the root of the software repository for the full text of the License. + +// Host driver for TileLang tmuls ST — case-table driven. +// tmuls: dst = src * scalar (single input + scalar). +// Numerical comparison is done externally by compare.py. + +#include "acl/acl.h" +#include "test_common.h" +#include +#include +#include +#include +#include +#include + +using namespace PtoTestCommon; + +// Kernel launch wrappers (defined in launch.cpp) +void LaunchTMULS_f32_32x64(float *src, float *dst, void *stream); +void LaunchTMULS_f16_63x64(uint16_t *src, uint16_t *dst, void *stream); +void LaunchTMULS_i16_15x192(int16_t *src, int16_t *dst, void *stream); +void LaunchTMULS_f32_256x16(float *src, float *dst, void *stream); + +struct TestCase { + const char *name; + void (*launch)(void *, void *, void *); // src, dst, stream + size_t rows; // allocated tile rows + size_t cols; // allocated tile cols + size_t validRows; // effective computation rows (<= rows) + size_t validCols; // effective computation cols (<= cols) + size_t elemSize; // bytes per element +}; + +static const TestCase kCases[] = { +{"f32_32x64", (void (*)(void*,void*,void*))LaunchTMULS_f32_32x64, 32, 64, 32, 64, sizeof(float)}, +{"i16_15x192", (void (*)(void*,void*,void*))LaunchTMULS_i16_15x192, 15, 192, 15, 192, sizeof(int16_t)}, +}; +static constexpr size_t kNumCases = sizeof(kCases) / sizeof(kCases[0]); + +static int RunCase(const TestCase &tc, int deviceId, aclrtStream stream) { + int rc = 0; + const size_t elemCount = tc.rows * tc.cols; + const size_t fileSize = elemCount * tc.elemSize; + + std::printf("[INFO] === case: %s (shape=%zux%zu, valid=%zux%zu) ===\n", + tc.name, tc.rows, tc.cols, tc.validRows, tc.validCols); + + // Per-case data directory + std::string caseDir = std::string("./") + tc.name; + size_t srcFileSize = fileSize; + + void *srcHost = nullptr, *dstHost = nullptr; + void *srcDevice = nullptr, *dstDevice = nullptr; + + aclrtMallocHost(&srcHost, fileSize); + 
aclrtMallocHost(&dstHost, fileSize); + + aclrtMalloc(&srcDevice, fileSize, ACL_MEM_MALLOC_HUGE_FIRST); + aclrtMalloc(&dstDevice, fileSize, ACL_MEM_MALLOC_HUGE_FIRST); + + if (!ReadFile((caseDir + "/input1.bin").c_str(), srcFileSize, srcHost, fileSize)) { + std::fprintf(stderr, "[ERROR] failed to read %s/input1.bin\n", caseDir.c_str()); + rc = 1; + } + + if (rc == 0) { + aclrtMemcpy(srcDevice, fileSize, srcHost, fileSize, ACL_MEMCPY_HOST_TO_DEVICE); + + tc.launch(srcDevice, dstDevice, stream); + + aclrtSynchronizeStream(stream); + aclrtMemcpy(dstHost, fileSize, dstDevice, fileSize, ACL_MEMCPY_DEVICE_TO_HOST); + } + + if (rc == 0 && !WriteFile((caseDir + "/output.bin").c_str(), dstHost, fileSize)) { + std::fprintf(stderr, "[ERROR] failed to write %s/output.bin\n", caseDir.c_str()); + rc = 1; + } + + if (srcDevice != nullptr) + aclrtFree(srcDevice); + if (dstDevice != nullptr) + aclrtFree(dstDevice); + if (srcHost != nullptr) + aclrtFreeHost(srcHost); + if (dstHost != nullptr) + aclrtFreeHost(dstHost); + + if (rc == 0) + std::printf("[INFO] case %s done\n", tc.name); + return rc; +} + +int main(int argc, char *argv[]) { + // Optional case filter: ./tmuls [case_name] + const char *caseFilter = (argc > 1) ? 
argv[1] : nullptr; + + int rc = 0; + int deviceId = 0; + aclrtStream stream = nullptr; + + aclInit(nullptr); + if (const char *envDevice = std::getenv("ACL_DEVICE_ID")) { + deviceId = std::atoi(envDevice); + } + aclrtSetDevice(deviceId); + aclrtCreateStream(&stream); + + for (size_t i = 0; i < kNumCases; ++i) { + if (caseFilter != nullptr && std::strcmp(kCases[i].name, caseFilter) != 0) { + continue; + } + int ret = RunCase(kCases[i], deviceId, stream); + if (ret != 0) { + std::fprintf(stderr, "[ERROR] case %s failed\n", kCases[i].name); + rc = 1; + break; + } + } + + if (stream != nullptr) + aclrtDestroyStream(stream); + aclrtResetDevice(deviceId); + aclFinalize(); + + return rc; +} diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tmuls/tmuls.pto b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tmuls/tmuls.pto new file mode 100644 index 000000000..95fb6a461 --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tmuls/tmuls.pto @@ -0,0 +1,98 @@ +// Copyright (c) 2026 Huawei Technologies Co., Ltd. +// This program is free software, you can redistribute it and/or modify it under the terms and conditions of +// CANN Open Software License Agreement Version 2.0 (the "License"). +// Please refer to the License for details. You may not use this file except in compliance with the License. +// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +// See LICENSE in the root of the software repository for the full text of the License. + +// TileLang ST kernels for pto.tmuls: tload(src) + tmuls(src, scalar)->dst + tstore(dst). +// Multiple cases with different shapes/dtypes in a single module. +// Compiled by ptoas --enable-insert-sync --enable-tile-op-expand --pto-backend=vpto +// to produce a fatobj object.
+ +module attributes {pto.target_arch = "a5", pto.kernel_kind = #pto.kernel_kind} { + + // Case 0: f32 32x64 (2048 elements) + func.func @TMULS_f32_32x64(%src_ptr: !pto.ptr, %dst_ptr: !pto.ptr, %scalar: f32) attributes {pto.kernel} { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c32 = arith.constant 32 : index + %c64 = arith.constant 64 : index + %c2048 = arith.constant 2048 : index + + %src_view = pto.make_tensor_view %src_ptr, + shape = [%c1, %c1, %c1, %c32, %c64], + strides = [%c2048, %c2048, %c2048, %c64, %c1] + : !pto.tensor_view<1x1x1x32x64xf32> + %dst_view = pto.make_tensor_view %dst_ptr, + shape = [%c1, %c1, %c1, %c32, %c64], + strides = [%c2048, %c2048, %c2048, %c64, %c1] + : !pto.tensor_view<1x1x1x32x64xf32> + + %src_part = pto.partition_view %src_view, + offsets = [%c0, %c0, %c0, %c0, %c0], + sizes = [%c1, %c1, %c1, %c32, %c64] + : !pto.tensor_view<1x1x1x32x64xf32> -> !pto.partition_tensor_view<1x1x1x32x64xf32> + %dst_part = pto.partition_view %dst_view, + offsets = [%c0, %c0, %c0, %c0, %c0], + sizes = [%c1, %c1, %c1, %c32, %c64] + : !pto.tensor_view<1x1x1x32x64xf32> -> !pto.partition_tensor_view<1x1x1x32x64xf32> + + %src = pto.alloc_tile + : !pto.tile_buf + %dst = pto.alloc_tile + : !pto.tile_buf + + pto.tload ins(%src_part : !pto.partition_tensor_view<1x1x1x32x64xf32>) + outs(%src : !pto.tile_buf) + pto.tmuls ins(%src, %scalar : !pto.tile_buf, f32) + outs(%dst : !pto.tile_buf) + pto.tstore ins(%dst : !pto.tile_buf) + outs(%dst_part : !pto.partition_tensor_view<1x1x1x32x64xf32>) + return + } + + // Case 1: i16 15x192 (2880 elements) + + func.func @TMULS_i16_15x192(%src_ptr: !pto.ptr, %dst_ptr: !pto.ptr, %scalar: i16) attributes {pto.kernel} { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c15 = arith.constant 15 : index + %c192 = arith.constant 192 : index + %c2880 = arith.constant 2880 : index + + %src_view = pto.make_tensor_view %src_ptr, + shape = [%c1, %c1, %c1, %c15, %c192], + strides = [%c2880, %c2880,
%c2880, %c192, %c1] + : !pto.tensor_view<1x1x1x15x192xi16> + %dst_view = pto.make_tensor_view %dst_ptr, + shape = [%c1, %c1, %c1, %c15, %c192], + strides = [%c2880, %c2880, %c2880, %c192, %c1] + : !pto.tensor_view<1x1x1x15x192xi16> + + %src_part = pto.partition_view %src_view, + offsets = [%c0, %c0, %c0, %c0, %c0], + sizes = [%c1, %c1, %c1, %c15, %c192] + : !pto.tensor_view<1x1x1x15x192xi16> -> !pto.partition_tensor_view<1x1x1x15x192xi16> + %dst_part = pto.partition_view %dst_view, + offsets = [%c0, %c0, %c0, %c0, %c0], + sizes = [%c1, %c1, %c1, %c15, %c192] + : !pto.tensor_view<1x1x1x15x192xi16> -> !pto.partition_tensor_view<1x1x1x15x192xi16> + + %src = pto.alloc_tile + : !pto.tile_buf + %dst = pto.alloc_tile + : !pto.tile_buf + + pto.tload ins(%src_part : !pto.partition_tensor_view<1x1x1x15x192xi16>) + outs(%src : !pto.tile_buf) + pto.tmuls ins(%src, %scalar : !pto.tile_buf, i16) + outs(%dst : !pto.tile_buf) + pto.tstore ins(%dst : !pto.tile_buf) + outs(%dst_part : !pto.partition_tensor_view<1x1x1x15x192xi16>) + return + } + + // No further cases: this smoke module only defines the f32_32x64 and i16_15x192 kernels. +} diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tneg/CMakeLists.txt b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tneg/CMakeLists.txt new file mode 100644 index 000000000..02a068e9e --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tneg/CMakeLists.txt @@ -0,0 +1,9 @@ +# Copyright (c) 2026 Huawei Technologies Co., Ltd. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+# See LICENSE in the root of the software repository for the full text of the License. + +pto_tilelang_vec_st(tneg) diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tneg/cases.py b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tneg/cases.py new file mode 100644 index 000000000..6dfbf2099 --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tneg/cases.py @@ -0,0 +1,76 @@ +#!/usr/bin/python3 +# Copyright (c) 2026 Huawei Technologies Co., Ltd. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. + +# coding=utf-8 + +"""Single source of truth for tneg ST test cases. + +Each case defines: + - name: case identifier, used as subdirectory name and by main.cpp kCases[]. + - dtype: numpy dtype (e.g. np.float32). + - shape: (rows, cols) — allocated tile dimensions. + - valid_shape: (valid_rows, valid_cols) — effective computation region. + - eps: tolerance for numpy.allclose (atol and rtol). + +gen_data.py and compare.py both import this list to avoid redundant definitions. 
+""" + +import numpy as np + +CASES = [ + { + "name": "f32_16x64", + "dtype": np.float32, + "shape": (16, 64), + "valid_shape": (16, 64), + "eps": 1e-6, + }, + { + "name": "f32_32x32", + "dtype": np.float32, + "shape": (32, 32), + "valid_shape": (32, 32), + "eps": 1e-6, + }, + { + "name": "f16_16x64", + "dtype": np.float16, + "shape": (16, 64), + "valid_shape": (16, 64), + "eps": 1e-3, + }, + { + "name": "f16_32x32", + "dtype": np.float16, + "shape": (32, 32), + "valid_shape": (32, 32), + "eps": 1e-3, + }, + { + "name": "i32_32x32", + "dtype": np.int32, + "shape": (32, 32), + "valid_shape": (32, 32), + "eps": 0, + }, + { + "name": "i16_64x16", + "dtype": np.int16, + "shape": (64, 16), + "valid_shape": (64, 16), + "eps": 0, + }, +] + +_SMOKE_CASE_NAMES = ['f32_16x64', 'f16_16x64'] +_SMOKE_CASE_NAME_SET = set(_SMOKE_CASE_NAMES) +_missing = [name for name in _SMOKE_CASE_NAMES if name not in {case["name"] for case in CASES}] +if _missing: + raise RuntimeError("unknown smoke case(s): " + ", ".join(_missing)) +CASES = [case for case in CASES if case["name"] in _SMOKE_CASE_NAME_SET] diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tneg/compare.py b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tneg/compare.py new file mode 100644 index 000000000..428604929 --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tneg/compare.py @@ -0,0 +1,49 @@ +#!/usr/bin/python3 +# Copyright (c) 2026 Huawei Technologies Co., Ltd. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. 
+# See LICENSE in the root of the software repository for the full text of the License. + +# coding=utf-8 + +import os +import sys +import numpy as np + +from cases import CASES +from st_common import result_cmp, style_fail, style_pass, validate_cases + + +def main(): + validate_cases(CASES) + case_filter = sys.argv[1] if len(sys.argv) > 1 else None + + all_passed = True + for case in CASES: + if case_filter is not None and case["name"] != case_filter: + continue + + case_dir = case["name"] + shape = case["shape"] + vr, vc = case["valid_shape"] + + golden = np.fromfile(os.path.join(case_dir, "golden.bin"), dtype=case["dtype"]).reshape(shape) + output = np.fromfile(os.path.join(case_dir, "output.bin"), dtype=case["dtype"]).reshape(shape) + + ok = result_cmp(golden[:vr, :vc], output[:vr, :vc], case["eps"]) + if ok: + print(style_pass(f"[INFO] {case['name']}: compare passed")) + else: + print(style_fail(f"[ERROR] {case['name']}: compare failed")) + all_passed = False + + if not all_passed: + sys.exit(2) + print(style_pass("[INFO] all cases passed")) + + +if __name__ == "__main__": + main() diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tneg/gen_data.py b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tneg/gen_data.py new file mode 100644 index 000000000..0c88055b7 --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tneg/gen_data.py @@ -0,0 +1,33 @@ +#!/usr/bin/python3 +# Copyright (c) 2026 Huawei Technologies Co., Ltd. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. 
+# See LICENSE in the root of the software repository for the full text of the License. + +# coding=utf-8 + +import numpy as np +from cases import CASES +from st_common import validate_cases, setup_case_rng, save_case_data + +validate_cases(CASES) + +for case in CASES: + setup_case_rng(case) + + dtype = case["dtype"] + shape = case["shape"] + valid_shape = case["valid_shape"] + + # Random values (no constraints for neg) + input = np.random.randn(*shape).astype(dtype) + + golden = np.zeros(shape, dtype=dtype) + vr, vc = valid_shape + golden[:vr, :vc] = np.negative(input[:vr, :vc]).astype(dtype, copy=False) + + save_case_data(case["name"], {"input": input, "golden": golden}) + print(f"[INFO] gen_data: {case['name']} shape={shape} valid_shape={valid_shape} dtype={dtype.__name__}") diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tneg/launch.cpp b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tneg/launch.cpp new file mode 100644 index 000000000..dfc7ea52d --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tneg/launch.cpp @@ -0,0 +1,28 @@ +// Copyright (c) 2026 Huawei Technologies Co., Ltd. +// This program is free software, you can redistribute it and/or modify it under the terms and conditions of +// CANN Open Software License Agreement Version 2.0 (the "License"). +// Please refer to the License for details. You may not use this file except in compliance with the License. +// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +// See LICENSE in the root of the software repository for the full text of the License. 
+ +#include + +#ifndef AICORE +#define AICORE [aicore] +#endif + +// Case 0: f32 16x64 + +extern "C" __global__ AICORE void TNEG_f32_16x64(__gm__ float *a, __gm__ float *b); +extern "C" __global__ AICORE void TNEG_f16_16x64(__gm__ uint16_t *a, __gm__ uint16_t *b); + +void LaunchTNEG_f32_16x64(void *a, void *b, void *stream) { + TNEG_f32_16x64<<<1, nullptr, stream>>>((__gm__ float *)a, (__gm__ float *)b); +} + + + +void LaunchTNEG_f16_16x64(void *a, void *b, void *stream) { + TNEG_f16_16x64<<<1, nullptr, stream>>>((__gm__ uint16_t *)a, (__gm__ uint16_t *)b); +} diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tneg/main.cpp b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tneg/main.cpp new file mode 100644 index 000000000..d5334e590 --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tneg/main.cpp @@ -0,0 +1,136 @@ +// Copyright (c) 2026 Huawei Technologies Co., Ltd. +// This program is free software, you can redistribute it and/or modify it under the terms and conditions of +// CANN Open Software License Agreement Version 2.0 (the "License"). +// Please refer to the License for details. You may not use this file except in compliance with the License. +// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +// See LICENSE in the root of the software repository for the full text of the License. + +// Host driver for TileLang tneg ST — case-table driven. +// Each case launches a different kernel variant, reads/writes from per-case subdirectory. +// Numerical comparison is done externally by compare.py. 
+ +#include "acl/acl.h" +#include "test_common.h" +#include +#include +#include +#include +#include +#include + +using namespace PtoTestCommon; + +// Kernel launch wrappers (defined in launch.cpp) +void LaunchTNEG_f32_16x64(void *a, void *b, void *stream); +void LaunchTNEG_f32_32x32(void *a, void *b, void *stream); +void LaunchTNEG_f16_16x64(void *a, void *b, void *stream); +void LaunchTNEG_f16_32x32(void *a, void *b, void *stream); +void LaunchTNEG_i16_64x16(void *a, void *b, void *stream); + +using LaunchFn = void (*)(void *, void *, void *); + +struct TestCase { + const char *name; + LaunchFn launch; + size_t rows; // allocated tile rows + size_t cols; // allocated tile cols + size_t validRows; // effective computation rows (<= rows) + size_t validCols; // effective computation cols (<= cols) + size_t elemSize; // bytes per element +}; + +static const TestCase kCases[] = { +{"f32_16x64", LaunchTNEG_f32_16x64, 16, 64, 16, 64, sizeof(float)}, +{"f16_16x64", LaunchTNEG_f16_16x64, 16, 64, 16, 64, sizeof(uint16_t)}, +}; +static constexpr size_t kNumCases = sizeof(kCases) / sizeof(kCases[0]); + +static int RunCase(const TestCase &tc, int deviceId, aclrtStream stream) { + int rc = 0; + const size_t elemCount = tc.rows * tc.cols; + const size_t fileSize = elemCount * tc.elemSize; + + std::printf("[INFO] === case: %s (shape=%zux%zu, valid=%zux%zu) ===\n", + tc.name, tc.rows, tc.cols, tc.validRows, tc.validCols); + + // Per-case data directory + std::string caseDir = std::string("./") + tc.name; + size_t srcFileSize = fileSize; + + void *srcHost = nullptr, *dstHost = nullptr; + void *srcDevice = nullptr, *dstDevice = nullptr; + + aclrtMallocHost((void **)(&srcHost), fileSize); + aclrtMallocHost((void **)(&dstHost), fileSize); + + aclrtMalloc((void **)&srcDevice, fileSize, ACL_MEM_MALLOC_HUGE_FIRST); + aclrtMalloc((void **)&dstDevice, fileSize, ACL_MEM_MALLOC_HUGE_FIRST); + + if (!ReadFile((caseDir + "/input.bin").c_str(), srcFileSize, srcHost, fileSize)) { + 
std::fprintf(stderr, "[ERROR] failed to read %s/input.bin\n", caseDir.c_str()); + rc = 1; + } + + if (rc == 0) { + aclrtMemcpy(srcDevice, fileSize, srcHost, fileSize, ACL_MEMCPY_HOST_TO_DEVICE); + + tc.launch(srcDevice, dstDevice, stream); + + aclrtSynchronizeStream(stream); + aclrtMemcpy(dstHost, fileSize, dstDevice, fileSize, ACL_MEMCPY_DEVICE_TO_HOST); + } + + if (rc == 0 && !WriteFile((caseDir + "/output.bin").c_str(), dstHost, fileSize)) { + std::fprintf(stderr, "[ERROR] failed to write %s/output.bin\n", caseDir.c_str()); + rc = 1; + } + + if (srcDevice != nullptr) + aclrtFree(srcDevice); + if (dstDevice != nullptr) + aclrtFree(dstDevice); + if (srcHost != nullptr) + aclrtFreeHost(srcHost); + if (dstHost != nullptr) + aclrtFreeHost(dstHost); + + if (rc == 0) + std::printf("[INFO] case %s done\n", tc.name); + return rc; +} + +int main(int argc, char *argv[]) { + // Optional case filter: ./tneg [case_name] + const char *caseFilter = (argc > 1) ? argv[1] : nullptr; + + int rc = 0; + int deviceId = 0; + aclrtStream stream = nullptr; + + aclInit(nullptr); + if (const char *envDevice = std::getenv("ACL_DEVICE_ID")) { + deviceId = std::atoi(envDevice); + } + aclrtSetDevice(deviceId); + aclrtCreateStream(&stream); + + for (size_t i = 0; i < kNumCases; ++i) { + if (caseFilter != nullptr && std::strcmp(kCases[i].name, caseFilter) != 0) { + continue; + } + int ret = RunCase(kCases[i], deviceId, stream); + if (ret != 0) { + std::fprintf(stderr, "[ERROR] case %s failed\n", kCases[i].name); + rc = 1; + break; + } + } + + if (stream != nullptr) + aclrtDestroyStream(stream); + aclrtResetDevice(deviceId); + aclFinalize(); + + return rc; +} diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tneg/tneg.pto b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tneg/tneg.pto new file mode 100644 index 000000000..bf4d08264 --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tneg/tneg.pto @@ -0,0 +1,101 @@ +// Copyright (c) 2026 Huawei Technologies Co., Ltd. 
+// This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+// CANN Open Software License Agreement Version 2.0 (the "License").
+// Please refer to the License for details. You may not use this file except in compliance with the License.
+// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+// See LICENSE in the root of the software repository for the full text of the License.
+
+// TileLang ST kernels for pto.tneg: tload(a) + tneg(a)->b + tstore(b).
+// Multiple cases in a single module (in this file: f32 and f16, both on a 16x64 tile).
+// Compiled by ptoas --enable-insert-sync --enable-tile-op-expand --pto-backend=vpto
+// to produce a fatobj object.
+
+module attributes {pto.target_arch = "a5", pto.kernel_kind = #pto.kernel_kind} {
+  // Case 0: f32 16x64 (1024 elements): tload the whole 16x64 view of %a_ptr, pto.tneg it, tstore to %b_ptr.
+  func.func @TNEG_f32_16x64(%a_ptr: !pto.ptr, %b_ptr: !pto.ptr) attributes {pto.kernel} {
+    %c0 = arith.constant 0 : index
+    %c1 = arith.constant 1 : index
+    %c16 = arith.constant 16 : index
+    %c64 = arith.constant 64 : index
+    %c1024 = arith.constant 1024 : index
+
+    %a_view = pto.make_tensor_view %a_ptr,
+      shape = [%c1, %c1, %c1, %c16, %c64],
+      strides = [%c1024, %c1024, %c1024, %c64, %c1]
+      : !pto.tensor_view<1x1x1x16x64xf32>
+    %b_view = pto.make_tensor_view %b_ptr,
+      shape = [%c1, %c1, %c1, %c16, %c64],
+      strides = [%c1024, %c1024, %c1024, %c64, %c1]
+      : !pto.tensor_view<1x1x1x16x64xf32>
+
+    %a_part = pto.partition_view %a_view,
+      offsets = [%c0, %c0, %c0, %c0, %c0],
+      sizes = [%c1, %c1, %c1, %c16, %c64]
+      : !pto.tensor_view<1x1x1x16x64xf32> -> !pto.partition_tensor_view<1x1x1x16x64xf32>
+    %b_part = pto.partition_view %b_view,
+      offsets = [%c0, %c0, %c0, %c0, %c0],
+      sizes = [%c1, %c1, %c1, %c16, %c64]
+      : !pto.tensor_view<1x1x1x16x64xf32> -> !pto.partition_tensor_view<1x1x1x16x64xf32>
+
+    %a = pto.alloc_tile
+      : !pto.tile_buf
+    %b = pto.alloc_tile
+      : !pto.tile_buf
+
+    pto.tload ins(%a_part : !pto.partition_tensor_view<1x1x1x16x64xf32>)
+      outs(%a : !pto.tile_buf)
+
+    pto.tneg ins(%a : !pto.tile_buf)
+      outs(%b : !pto.tile_buf)
+
+    pto.tstore ins(%b : !pto.tile_buf)
+      outs(%b_part : !pto.partition_tensor_view<1x1x1x16x64xf32>)
+    return
+  }
+
+  // Case 1: f16 16x64 (1024 elements): same load / negate / store sequence as case 0, on f16 data.
+
+  func.func @TNEG_f16_16x64(%a_ptr: !pto.ptr, %b_ptr: !pto.ptr) attributes {pto.kernel} {
+    %c0 = arith.constant 0 : index
+    %c1 = arith.constant 1 : index
+    %c16 = arith.constant 16 : index
+    %c64 = arith.constant 64 : index
+    %c1024 = arith.constant 1024 : index
+
+    %a_view = pto.make_tensor_view %a_ptr,
+      shape = [%c1, %c1, %c1, %c16, %c64],
+      strides = [%c1024, %c1024, %c1024, %c64, %c1]
+      : !pto.tensor_view<1x1x1x16x64xf16>
+    %b_view = pto.make_tensor_view %b_ptr,
+      shape = [%c1, %c1, %c1, %c16, %c64],
+      strides = [%c1024, %c1024, %c1024, %c64, %c1]
+      : !pto.tensor_view<1x1x1x16x64xf16>
+
+    %a_part = pto.partition_view %a_view,
+      offsets = [%c0, %c0, %c0, %c0, %c0],
+      sizes = [%c1, %c1, %c1, %c16, %c64]
+      : !pto.tensor_view<1x1x1x16x64xf16> -> !pto.partition_tensor_view<1x1x1x16x64xf16>
+    %b_part = pto.partition_view %b_view,
+      offsets = [%c0, %c0, %c0, %c0, %c0],
+      sizes = [%c1, %c1, %c1, %c16, %c64]
+      : !pto.tensor_view<1x1x1x16x64xf16> -> !pto.partition_tensor_view<1x1x1x16x64xf16>
+
+    %a = pto.alloc_tile
+      : !pto.tile_buf
+    %b = pto.alloc_tile
+      : !pto.tile_buf
+
+    pto.tload ins(%a_part : !pto.partition_tensor_view<1x1x1x16x64xf16>)
+      outs(%a : !pto.tile_buf)
+
+    pto.tneg ins(%a : !pto.tile_buf)
+      outs(%b : !pto.tile_buf)
+
+    pto.tstore ins(%b : !pto.tile_buf)
+      outs(%b_part : !pto.partition_tensor_view<1x1x1x16x64xf16>)
+    return
+  }
+
+  // Other variants (e.g. f32 32x32, f16 32x32) are not defined in this smoke module.
+}
diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tnot/CMakeLists.txt b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tnot/CMakeLists.txt
new file mode 100644
index 000000000..ee5525ac2
--- /dev/null
+++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tnot/CMakeLists.txt
@@ -0,0 +1,9 @@
+# Copyright (c) 2026 Huawei Technologies Co., Ltd.
+# This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+# CANN Open Software License Agreement Version 2.0 (the "License").
+# Please refer to the License for details. You may not use this file except in compliance with the License.
+# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+# See LICENSE in the root of the software repository for the full text of the License.
+
+pto_tilelang_vec_st(tnot)
diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tnot/cases.py b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tnot/cases.py
new file mode 100644
index 000000000..b20c24e44
--- /dev/null
+++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tnot/cases.py
@@ -0,0 +1,78 @@
+#!/usr/bin/python3
+# Copyright (c) 2026 Huawei Technologies Co., Ltd.
+# This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+# CANN Open Software License Agreement Version 2.0 (the "License").
+# Please refer to the License for details. You may not use this file except in compliance with the License.
+# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+# See LICENSE in the root of the software repository for the full text of the License.
+
+# coding=utf-8
+
+"""Single source of truth for tnot ST test cases.
+
+Each case defines:
+  - name: case identifier, used as subdirectory name and by main.cpp kCases[].
+  - dtype: numpy dtype (e.g. np.int8, np.int16, np.int32).
+  - shape: (rows, cols) — allocated tile dimensions.
+  - valid_shape: (valid_rows, valid_cols) — effective computation region.
+  - eps: tolerance for numpy.allclose (atol and rtol), 0 for exact match.
+
+gen_data.py and compare.py both import this list to avoid redundant definitions.
+"""
+
+import numpy as np
+
+CASES = [
+    {
+        "name": "int8_64x64",
+        "dtype": np.int8,
+        "shape": (64, 64),
+        "valid_shape": (64, 64),
+        "eps": 0
+    },
+    {
+        "name": "uint8_60x60",
+        "dtype": np.uint8,
+        "shape": (64, 64),
+        "valid_shape": (60, 60),
+        "eps": 0
+    },
+    {
+        "name": "int16_64x64",
+        "dtype": np.int16,
+        "shape": (64, 64),
+        "valid_shape": (64, 64),
+        "eps": 0
+    },
+    {
+        "name": "uint16_60x60",
+        "dtype": np.uint16,
+        "shape": (64, 64),
+        "valid_shape": (60, 60),
+        "eps": 0
+    },
+    {
+        "name": "int32_64x64",
+        "dtype": np.int32,
+        "shape": (64, 64),
+        "valid_shape": (64, 64),
+        "eps": 0
+    },
+    {
+        "name": "uint32_60x60",
+        "dtype": np.uint32,
+        "shape": (64, 64),
+        "valid_shape": (60, 60),
+        "eps": 0
+    },
+]
+
+# Smoke subset: CASES is narrowed to the names below; an unknown name fails at import time.
+_SMOKE_CASE_NAMES = ['int8_64x64', 'uint8_60x60']
+_SMOKE_CASE_NAME_SET = set(_SMOKE_CASE_NAMES)
+_KNOWN_CASE_NAMES = {case["name"] for case in CASES}  # built once, not once per smoke name
+_missing = [name for name in _SMOKE_CASE_NAMES if name not in _KNOWN_CASE_NAMES]
+if _missing:
+    raise RuntimeError("unknown smoke case(s): " + ", ".join(_missing))
+CASES = [case for case in CASES if case["name"] in _SMOKE_CASE_NAME_SET]
diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tnot/compare.py b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tnot/compare.py
new file mode 100644
index 000000000..428604929
--- /dev/null
+++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tnot/compare.py
@@ -0,0 +1,49 @@
+#!/usr/bin/python3
+# Copyright (c) 2026 Huawei Technologies Co., Ltd.
+# This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+# CANN Open Software License Agreement Version 2.0 (the "License").
+# Please refer to the License for details. You may not use this file except in compliance with the License.
+# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+# See LICENSE in the root of the software repository for the full text of the License.
+
+# coding=utf-8
+
+import os
+import sys
+import numpy as np
+
+from cases import CASES
+from st_common import result_cmp, style_fail, style_pass, validate_cases
+
+
+def main():
+    """Compare output.bin with golden.bin for each case (or only argv[1]); exit 2 on mismatch."""
+    validate_cases(CASES)
+    only = sys.argv[1] if len(sys.argv) > 1 else None
+
+    failures = 0
+    for spec in CASES:
+        label = spec["name"]
+        if only is not None and label != only:
+            continue
+
+        dims = spec["shape"]
+        rows, cols = spec["valid_shape"]
+        # Files are reshaped to the allocated tile shape; only the valid region is compared.
+        expected = np.fromfile(os.path.join(label, "golden.bin"), dtype=spec["dtype"]).reshape(dims)
+        actual = np.fromfile(os.path.join(label, "output.bin"), dtype=spec["dtype"]).reshape(dims)
+
+        if result_cmp(expected[:rows, :cols], actual[:rows, :cols], spec["eps"]):
+            print(style_pass(f"[INFO] {label}: compare passed"))
+            continue
+        print(style_fail(f"[ERROR] {label}: compare failed"))
+        failures += 1
+
+    if failures:
+        sys.exit(2)
+    print(style_pass("[INFO] all cases passed"))
+
+
+if __name__ == "__main__":
+    main()
diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tnot/gen_data.py b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tnot/gen_data.py
new file mode 100644
index 000000000..62de58386
--- /dev/null
+++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tnot/gen_data.py
@@ -0,0 +1,30 @@
+#!/usr/bin/python3
+# Copyright (c) 2026 Huawei Technologies Co., Ltd.
+# This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+# CANN Open Software License Agreement Version 2.0 (the "License").
+# Please refer to the License for details. You may not use this file except in compliance with the License.
+# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+# See LICENSE in the root of the software repository for the full text of the License.
+
+# coding=utf-8
+
+import numpy as np
+from cases import CASES
+from st_common import validate_cases, setup_case_rng, save_case_data
+
+validate_cases(CASES)
+
+for case in CASES:
+    setup_case_rng(case)
+
+    dtype = case["dtype"]
+    shape = case["shape"]
+    valid_shape = case["valid_shape"]
+
+    # randint's upper bound is exclusive: pass max + 1 so the dtype's largest value is drawn too.
+    dtype_info = np.iinfo(dtype)
+    input_data = np.random.randint(dtype_info.min, dtype_info.max + 1, size=shape, dtype=dtype)
+    golden = np.bitwise_not(input_data).astype(dtype, copy=False)
+    save_case_data(case["name"], {"input": input_data, "golden": golden})
+    print(f"[INFO] gen_data: {case['name']} shape={shape} valid_shape={valid_shape} dtype={dtype.__name__}")
diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tnot/launch.cpp b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tnot/launch.cpp
new file mode 100644
index 000000000..5673982e9
--- /dev/null
+++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tnot/launch.cpp
@@ -0,0 +1,28 @@
+// Copyright (c) 2026 Huawei Technologies Co., Ltd.
+// This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+// CANN Open Software License Agreement Version 2.0 (the "License").
+// Please refer to the License for details. You may not use this file except in compliance with the License.
+// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+// See LICENSE in the root of the software repository for the full text of the License.
+
+#include <cstdint>
+
+#ifndef AICORE
+#define AICORE [aicore]
+#endif
+
+// Device kernels for the int8 64x64 and uint8 60x60 cases, launched by the wrappers below.
+
+extern "C" __global__ AICORE void TNOT_int8_64x64(__gm__ int8_t *a, __gm__ int8_t *b);
+extern "C" __global__ AICORE void TNOT_uint8_60x60(__gm__ uint8_t *a, __gm__ uint8_t *b);
+
+void LaunchTNOT_uint8_60x60(void *a, void *b, void *stream) {
+    TNOT_uint8_60x60<<<1, nullptr, stream>>>((__gm__ uint8_t *)a, (__gm__ uint8_t *)b);
+}
+
+
+
+void LaunchTNOT_int8_64x64(void *a, void *b, void *stream) {
+    TNOT_int8_64x64<<<1, nullptr, stream>>>((__gm__ int8_t *)a, (__gm__ int8_t *)b);
+}
diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tnot/main.cpp b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tnot/main.cpp
new file mode 100644
index 000000000..0fd8c5c6c
--- /dev/null
+++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tnot/main.cpp
@@ -0,0 +1,170 @@
+// Copyright (c) 2026 Huawei Technologies Co., Ltd.
+// This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+// CANN Open Software License Agreement Version 2.0 (the "License").
+// Please refer to the License for details. You may not use this file except in compliance with the License.
+// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+// See LICENSE in the root of the software repository for the full text of the License.
+
+// Host driver for TileLang tnot ST — case-table driven.
+// Each case launches a different kernel variant, reads/writes from per-case subdirectory.
+// Numerical comparison is done externally by compare.py.
+
+#include "acl/acl.h"
+#include "test_common.h"
+#include <cstddef>
+#include <cstdint>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include <string>
+
+using namespace PtoTestCommon;
+
+// Kernel launch wrappers (defined in launch.cpp)
+// Only the wrappers referenced from kCases[] are called by this driver.
+void LaunchTNOT_int8_64x64(void *a, void *b, void *stream);
+void LaunchTNOT_uint8_60x60(void *a, void *b, void *stream);
+void LaunchTNOT_uint16_60x60(void *a, void *b, void *stream);
+void LaunchTNOT_uint32_60x60(void *a, void *b, void *stream);
+
+using LaunchFn = void (*)(void *, void *, void *);
+
+// One row of the case table: the kernel to launch and the size of its tile.
+struct TestCase {
+    const char *name;  // case identifier; also used as the per-case data subdirectory
+    LaunchFn launch;   // host-side launch wrapper for this case
+    size_t rows;       // allocated tile rows
+    size_t cols;       // allocated tile cols
+    size_t validRows;  // effective computation rows (<= rows)
+    size_t validCols;  // effective computation cols (<= cols)
+    size_t elemSize;   // bytes per element
+};
+
+static const TestCase kCases[] = {
+    {"int8_64x64", LaunchTNOT_int8_64x64, 64, 64, 64, 64, sizeof(int8_t)},
+    {"uint8_60x60", LaunchTNOT_uint8_60x60, 64, 64, 60, 60, sizeof(uint8_t)},
+};
+static constexpr size_t kNumCases = sizeof(kCases) / sizeof(kCases[0]);
+
+// Logs a failed ACL call to stderr; returns true only when `err` is ACL_SUCCESS.
+static bool AclCallOk(aclError err, const char *what) {
+    if (err == ACL_SUCCESS) {
+        return true;
+    }
+    std::fprintf(stderr, "[ERROR] %s failed (aclError=%d)\n", what, static_cast<int>(err));
+    return false;
+}
+
+// Runs one case: loads <case>/input.bin, launches the kernel on `stream`, and writes the
+// device result to <case>/output.bin.
+// Returns 0 on success and 1 on any allocation, file, copy or synchronization failure.
+// `deviceId` is not used by the body; it is kept so the signature stays unchanged.
+static int RunCase(const TestCase &tc, int deviceId, aclrtStream stream) {
+    (void)deviceId;
+    int rc = 0;
+    const size_t elemCount = tc.rows * tc.cols;
+    const size_t fileSize = elemCount * tc.elemSize;
+
+    std::printf("[INFO] === case: %s (shape=%zux%zu, valid=%zux%zu) ===\n",
+                tc.name, tc.rows, tc.cols, tc.validRows, tc.validCols);
+
+    // Per-case data directory
+    std::string caseDir = std::string("./") + tc.name;
+    size_t srcFileSize = fileSize;
+
+    void *srcHost = nullptr, *dstHost = nullptr;
+    void *srcDevice = nullptr, *dstDevice = nullptr;
+
+    // || short-circuits, so nothing further is allocated after the first failure;
+    // the cleanup at the end of the function releases whatever was obtained.
+    if (!AclCallOk(aclrtMallocHost((void **)(&srcHost), fileSize), "aclrtMallocHost(src)") ||
+        !AclCallOk(aclrtMallocHost((void **)(&dstHost), fileSize), "aclrtMallocHost(dst)") ||
+        !AclCallOk(aclrtMalloc((void **)&srcDevice, fileSize, ACL_MEM_MALLOC_HUGE_FIRST), "aclrtMalloc(src)") ||
+        !AclCallOk(aclrtMalloc((void **)&dstDevice, fileSize, ACL_MEM_MALLOC_HUGE_FIRST), "aclrtMalloc(dst)")) {
+        rc = 1;
+    }
+
+    if (rc == 0 && !ReadFile((caseDir + "/input.bin").c_str(), srcFileSize, srcHost, fileSize)) {
+        std::fprintf(stderr, "[ERROR] failed to read %s/input.bin\n", caseDir.c_str());
+        rc = 1;
+    }
+
+    if (rc == 0 &&
+        !AclCallOk(aclrtMemcpy(srcDevice, fileSize, srcHost, fileSize, ACL_MEMCPY_HOST_TO_DEVICE),
+                   "aclrtMemcpy(host->device)")) {
+        rc = 1;
+    }
+
+    if (rc == 0) {
+        tc.launch(srcDevice, dstDevice, stream);
+
+        // Never write out a result whose synchronization or read-back failed.
+        if (!AclCallOk(aclrtSynchronizeStream(stream), "aclrtSynchronizeStream") ||
+            !AclCallOk(aclrtMemcpy(dstHost, fileSize, dstDevice, fileSize, ACL_MEMCPY_DEVICE_TO_HOST),
+                       "aclrtMemcpy(device->host)")) {
+            rc = 1;
+        }
+    }
+
+    if (rc == 0 && !WriteFile((caseDir + "/output.bin").c_str(), dstHost, fileSize)) {
+        std::fprintf(stderr, "[ERROR] failed to write %s/output.bin\n", caseDir.c_str());
+        rc = 1;
+    }
+
+    if (srcDevice != nullptr)
+        aclrtFree(srcDevice);
+    if (dstDevice != nullptr)
+        aclrtFree(dstDevice);
+    if (srcHost != nullptr)
+        aclrtFreeHost(srcHost);
+    if (dstHost != nullptr)
+        aclrtFreeHost(dstHost);
+
+    if (rc == 0)
+        std::printf("[INFO] case %s done\n", tc.name);
+    return rc;
+}
+
+// Entry point: initializes ACL, runs every case (or only the one named by argv[1]),
+// then tears ACL down. Returns 0 when every selected case succeeds, 1 otherwise.
+int main(int argc, char *argv[]) {
+    // Optional case filter: ./tnot [case_name]
+    const char *caseFilter = (argc > 1) ? argv[1] : nullptr;
+
+    int rc = 0;
+    int deviceId = 0;
+    aclrtStream stream = nullptr;
+
+    if (!AclCallOk(aclInit(nullptr), "aclInit")) {
+        return 1;
+    }
+    if (const char *envDevice = std::getenv("ACL_DEVICE_ID")) {
+        deviceId = std::atoi(envDevice);
+    }
+    // Without a device and a stream no case can run; fall through to teardown.
+    const bool deviceSet = AclCallOk(aclrtSetDevice(deviceId), "aclrtSetDevice");
+    if (!deviceSet || !AclCallOk(aclrtCreateStream(&stream), "aclrtCreateStream")) {
+        rc = 1;
+    }
+
+    for (size_t i = 0; rc == 0 && i < kNumCases; ++i) {
+        if (caseFilter != nullptr && std::strcmp(kCases[i].name, caseFilter) != 0) {
+            continue;
+        }
+        int ret = RunCase(kCases[i], deviceId, stream);
+        if (ret != 0) {
+            std::fprintf(stderr, "[ERROR] case %s failed\n", kCases[i].name);
+            rc = 1;
+            break;
+        }
+    }
+
+    if (stream != nullptr)
+        aclrtDestroyStream(stream);
+    if (deviceSet)
+        aclrtResetDevice(deviceId);
+    aclFinalize();
+
+    return rc;
+}
diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tnot/tnot.pto b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tnot/tnot.pto
new file mode 100644
index 000000000..dfd1c129e
--- /dev/null
+++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tnot/tnot.pto
@@ -0,0 +1,100 @@
+// Copyright (c) 2026 Huawei Technologies Co., Ltd.
+// This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+// CANN Open Software License Agreement Version 2.0 (the "License").
+// Please refer to the License for details. You may not use this file except in compliance with the License.
+// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+// See LICENSE in the root of the software repository for the full text of the License.
+
+// TileLang ST kernels for pto.tnot: tload(a) + tnot(a)->b + tstore(b).
+// Multiple cases with different shapes in a single module.
+// Compiled by ptoas --enable-insert-sync --enable-tile-op-expand --pto-backend=vpto
+// to produce a fatobj object.
+
+module attributes {pto.target_arch = "a5", pto.kernel_kind = #pto.kernel_kind} {
+  // Case 0: int8 64x64 (valid 64x64): the partition covers the whole 64x64 view.
+  func.func @TNOT_int8_64x64(%a_ptr: !pto.ptr, %b_ptr: !pto.ptr) attributes {pto.kernel} {
+    %c0 = arith.constant 0 : index
+    %c1 = arith.constant 1 : index
+    %c64 = arith.constant 64 : index
+    %c4096 = arith.constant 4096 : index
+
+    %a_view = pto.make_tensor_view %a_ptr,
+      shape = [%c1, %c1, %c1, %c64, %c64],
+      strides = [%c4096, %c4096, %c4096, %c64, %c1]
+      : !pto.tensor_view<1x1x1x64x64xi8>
+    %b_view = pto.make_tensor_view %b_ptr,
+      shape = [%c1, %c1, %c1, %c64, %c64],
+      strides = [%c4096, %c4096, %c4096, %c64, %c1]
+      : !pto.tensor_view<1x1x1x64x64xi8>
+
+    %a_part = pto.partition_view %a_view,
+      offsets = [%c0, %c0, %c0, %c0, %c0],
+      sizes = [%c1, %c1, %c1, %c64, %c64]
+      : !pto.tensor_view<1x1x1x64x64xi8> -> !pto.partition_tensor_view<1x1x1x64x64xi8>
+    %b_part = pto.partition_view %b_view,
+      offsets = [%c0, %c0, %c0, %c0, %c0],
+      sizes = [%c1, %c1, %c1, %c64, %c64]
+      : !pto.tensor_view<1x1x1x64x64xi8> -> !pto.partition_tensor_view<1x1x1x64x64xi8>
+
+    %a = pto.alloc_tile
+      : !pto.tile_buf
+    %b = pto.alloc_tile
+      : !pto.tile_buf
+
+    pto.tload ins(%a_part : !pto.partition_tensor_view<1x1x1x64x64xi8>)
+      outs(%a : !pto.tile_buf)
+
+    pto.tnot ins(%a : !pto.tile_buf)
+      outs(%b : !pto.tile_buf)
+
+    pto.tstore ins(%b : !pto.tile_buf)
+      outs(%b_part : !pto.partition_tensor_view<1x1x1x64x64xi8>)
+    return
+  }
+
+  // Case 1: uint8 64x64 (valid 60x60) - partition_view sizes = valid_shape (60x60 of the 64x64 view)
+
+  func.func @TNOT_uint8_60x60(%a_ptr: !pto.ptr, %b_ptr: !pto.ptr) attributes {pto.kernel} {
+    %c0 = arith.constant 0 : index
+    %c1 = arith.constant 1 : index
+    %c60 = arith.constant 60 : index
+    %c64 = arith.constant 64 : index
+    %c4096 = arith.constant 4096 : index
+
+    %a_view = pto.make_tensor_view %a_ptr,
+      shape = [%c1, %c1, %c1, %c64, %c64],
+      strides = [%c4096, %c4096, %c4096, %c64, %c1]
+      : !pto.tensor_view<1x1x1x64x64xui8>
+    %b_view = pto.make_tensor_view %b_ptr,
+      shape = [%c1, %c1, %c1, %c64, %c64],
+      strides = [%c4096, %c4096, %c4096, %c64, %c1]
+      : !pto.tensor_view<1x1x1x64x64xui8>
+
+    %a_part = pto.partition_view %a_view,
+      offsets = [%c0, %c0, %c0, %c0, %c0],
+      sizes = [%c1, %c1, %c1, %c60, %c60]
+      : !pto.tensor_view<1x1x1x64x64xui8> -> !pto.partition_tensor_view<1x1x1x60x60xui8>
+    %b_part = pto.partition_view %b_view,
+      offsets = [%c0, %c0, %c0, %c0, %c0],
+      sizes = [%c1, %c1, %c1, %c60, %c60]
+      : !pto.tensor_view<1x1x1x64x64xui8> -> !pto.partition_tensor_view<1x1x1x60x60xui8>
+
+    %a = pto.alloc_tile
+      : !pto.tile_buf
+    %b = pto.alloc_tile
+      : !pto.tile_buf
+
+    pto.tload ins(%a_part : !pto.partition_tensor_view<1x1x1x60x60xui8>)
+      outs(%a : !pto.tile_buf)
+
+    pto.tnot ins(%a : !pto.tile_buf)
+      outs(%b : !pto.tile_buf)
+
+    pto.tstore ins(%b : !pto.tile_buf)
+      outs(%b_part : !pto.partition_tensor_view<1x1x1x60x60xui8>)
+    return
+  }
+
+  // Wider variants (e.g. int16 64x64) are not defined in this smoke module.
+}
diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tor/CMakeLists.txt b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tor/CMakeLists.txt
new file mode 100644
index 000000000..9f356f661
--- /dev/null
+++
b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tor/CMakeLists.txt
@@ -0,0 +1,9 @@
+# Copyright (c) 2026 Huawei Technologies Co., Ltd.
+# This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+# CANN Open Software License Agreement Version 2.0 (the "License").
+# Please refer to the License for details. You may not use this file except in compliance with the License.
+# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+# See LICENSE in the root of the software repository for the full text of the License.
+
+pto_tilelang_vec_st(tor)
diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tor/cases.py b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tor/cases.py
new file mode 100644
index 000000000..902435721
--- /dev/null
+++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tor/cases.py
@@ -0,0 +1,49 @@
+# Copyright (c) 2026 Huawei Technologies Co., Ltd.
+# This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+# CANN Open Software License Agreement Version 2.0 (the "License").
+# Please refer to the License for details. You may not use this file except in compliance with the License.
+# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+# See LICENSE in the root of the software repository for the full text of the License.
+
+# coding=utf-8
+
+"""Single source of truth for tor ST test cases.
+
+Each case defines:
+  - name: case identifier, used as subdirectory name and by main.cpp kCases[].
+  - dtype: numpy dtype (e.g. np.int32).
+  - shape: (rows, cols) — allocated tile dimensions.
+  - valid_shape: (valid_rows, valid_cols) — effective computation region.
+  - eps: tolerance for numpy.allclose (atol and rtol).
+
+gen_data.py and compare.py both import this list to avoid redundant definitions.
+"""
+
+import numpy as np
+
+CASES = [
+    {
+        "name": "i32_16x64",
+        "dtype": np.int32,
+        "shape": (16, 64),
+        "valid_shape": (16, 64),
+        "eps": 0,
+    },
+    {
+        "name": "i32_32x32",
+        "dtype": np.int32,
+        "shape": (32, 32),
+        "valid_shape": (32, 32),
+        "eps": 0,
+    },
+]
+
+# Smoke subset: CASES is narrowed to the names below; an unknown name fails at import time.
+_SMOKE_CASE_NAMES = ['i32_16x64', 'i32_32x32']
+_SMOKE_CASE_NAME_SET = set(_SMOKE_CASE_NAMES)
+_KNOWN_CASE_NAMES = {case["name"] for case in CASES}  # built once, not once per smoke name
+_missing = [name for name in _SMOKE_CASE_NAMES if name not in _KNOWN_CASE_NAMES]
+if _missing:
+    raise RuntimeError("unknown smoke case(s): " + ", ".join(_missing))
+CASES = [case for case in CASES if case["name"] in _SMOKE_CASE_NAME_SET]
diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tor/compare.py b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tor/compare.py
new file mode 100644
index 000000000..6a4d5d1aa
--- /dev/null
+++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tor/compare.py
@@ -0,0 +1,48 @@
+# Copyright (c) 2026 Huawei Technologies Co., Ltd.
+# This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+# CANN Open Software License Agreement Version 2.0 (the "License").
+# Please refer to the License for details. You may not use this file except in compliance with the License.
+# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+# See LICENSE in the root of the software repository for the full text of the License.
+
+# coding=utf-8
+
+import os
+import sys
+import numpy as np
+
+from cases import CASES
+from st_common import result_cmp, style_fail, style_pass, validate_cases
+
+
+def main():
+    """Compare output.bin with golden.bin for each case (or only argv[1]); exit 2 on mismatch."""
+    validate_cases(CASES)
+    only = sys.argv[1] if len(sys.argv) > 1 else None
+
+    failures = 0
+    for spec in CASES:
+        label = spec["name"]
+        if only is not None and label != only:
+            continue
+
+        dims = spec["shape"]
+        rows, cols = spec["valid_shape"]
+        # Files are reshaped to the allocated tile shape; only the valid region is compared.
+        expected = np.fromfile(os.path.join(label, "golden.bin"), dtype=spec["dtype"]).reshape(dims)
+        actual = np.fromfile(os.path.join(label, "output.bin"), dtype=spec["dtype"]).reshape(dims)
+
+        if result_cmp(expected[:rows, :cols], actual[:rows, :cols], spec["eps"]):
+            print(style_pass(f"[INFO] {label}: compare passed"))
+            continue
+        print(style_fail(f"[ERROR] {label}: compare failed"))
+        failures += 1
+
+    if failures:
+        sys.exit(2)
+    print(style_pass("[INFO] all cases passed"))
+
+
+if __name__ == "__main__":
+    main()
diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tor/gen_data.py b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tor/gen_data.py
new file mode 100644
index 000000000..b9f083df2
--- /dev/null
+++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tor/gen_data.py
@@ -0,0 +1,32 @@
+# Copyright (c) 2026 Huawei Technologies Co., Ltd.
+# This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+# CANN Open Software License Agreement Version 2.0 (the "License").
+# Please refer to the License for details. You may not use this file except in compliance with the License.
+# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+# See LICENSE in the root of the software repository for the full text of the License.
+
+# coding=utf-8
+
+import numpy as np
+from cases import CASES
+from st_common import validate_cases, setup_case_rng, save_case_data
+
+validate_cases(CASES)
+
+for spec in CASES:
+    setup_case_rng(spec)
+
+    elem_type, tile_shape, valid = spec["dtype"], spec["shape"], spec["valid_shape"]
+
+    # Two operands in [0, 100); input1 is drawn before input2, so keep this order.
+    lhs = np.random.randint(0, 100, size=tile_shape).astype(elem_type)
+    rhs = np.random.randint(0, 100, size=tile_shape).astype(elem_type)
+
+    # Golden stays zero outside the valid region and holds lhs | rhs inside it.
+    expected = np.zeros(tile_shape, dtype=elem_type)
+    rows, cols = valid
+    expected[:rows, :cols] = (lhs[:rows, :cols] | rhs[:rows, :cols]).astype(elem_type, copy=False)
+
+    save_case_data(spec["name"], {"input1": lhs, "input2": rhs, "golden": expected})
+    print(f"[INFO] gen_data: {spec['name']} shape={tile_shape} valid_shape={valid} dtype={elem_type.__name__}")
diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tor/launch.cpp b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tor/launch.cpp
new file mode 100644
index 000000000..a8e2d97c4
--- /dev/null
+++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tor/launch.cpp
@@ -0,0 +1,28 @@
+// Copyright (c) 2026 Huawei Technologies Co., Ltd.
+// This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+// CANN Open Software License Agreement Version 2.0 (the "License").
+// Please refer to the License for details. You may not use this file except in compliance with the License.
+// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+// See LICENSE in the root of the software repository for the full text of the License.
+
+#include <cstdint>
+
+#ifndef AICORE
+#define AICORE [aicore]
+#endif
+
+// Device kernels for the i32 16x64 and i32 32x32 cases, launched by the wrappers below.
+
+extern "C" __global__ AICORE void TOR_i32_16x64(__gm__ int32_t *a, __gm__ int32_t *b, __gm__ int32_t *c);
+extern "C" __global__ AICORE void TOR_i32_32x32(__gm__ int32_t *a, __gm__ int32_t *b, __gm__ int32_t *c);
+
+void LaunchTOR_i32_16x64(int32_t *a, int32_t *b, int32_t *c, void *stream) {
+    TOR_i32_16x64<<<1, nullptr, stream>>>((__gm__ int32_t *)a, (__gm__ int32_t *)b, (__gm__ int32_t *)c);
+}
+
+
+
+void LaunchTOR_i32_32x32(int32_t *a, int32_t *b, int32_t *c, void *stream) {
+    TOR_i32_32x32<<<1, nullptr, stream>>>((__gm__ int32_t *)a, (__gm__ int32_t *)b, (__gm__ int32_t *)c);
+}
diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tor/main.cpp b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tor/main.cpp
new file mode 100644
index 000000000..f7aec545d
--- /dev/null
+++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tor/main.cpp
@@ -0,0 +1,180 @@
+// Copyright (c) 2026 Huawei Technologies Co., Ltd.
+// This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+// CANN Open Software License Agreement Version 2.0 (the "License").
+// Please refer to the License for details. You may not use this file except in compliance with the License.
+// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+// See LICENSE in the root of the software repository for the full text of the License.
+
+// Host driver for TileLang tor ST — case-table driven.
+// Each case launches a different kernel variant, reads/writes from per-case subdirectory.
+// Numerical comparison is done externally by compare.py.
+
+#include "acl/acl.h"
+#include "test_common.h"
+#include <cstddef>
+#include <cstdint>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include <string>
+
+using namespace PtoTestCommon;
+
+// Kernel launch wrappers (defined in launch.cpp)
+void LaunchTOR_i32_16x64(int32_t *a, int32_t *b, int32_t *c, void *stream);
+void LaunchTOR_i32_32x32(int32_t *a, int32_t *b, int32_t *c, void *stream);
+
+using LaunchFn = void (*)(int32_t *, int32_t *, int32_t *, void *);
+
+// One row of the case table: the kernel to launch and the size of its tile.
+struct TestCase {
+    const char *name;  // case identifier; also used as the per-case data subdirectory
+    LaunchFn launch;   // host-side launch wrapper for this case
+    size_t rows;       // allocated tile rows
+    size_t cols;       // allocated tile cols
+    size_t validRows;  // effective computation rows (<= rows)
+    size_t validCols;  // effective computation cols (<= cols)
+    size_t elemSize;   // bytes per element
+};
+
+static const TestCase kCases[] = {
+    {"i32_16x64", LaunchTOR_i32_16x64, 16, 64, 16, 64, sizeof(int32_t)},
+    {"i32_32x32", LaunchTOR_i32_32x32, 32, 32, 32, 32, sizeof(int32_t)},
+};
+static constexpr size_t kNumCases = sizeof(kCases) / sizeof(kCases[0]);
+
+// Logs a failed ACL call to stderr; returns true only when `err` is ACL_SUCCESS.
+static bool AclCallOk(aclError err, const char *what) {
+    if (err == ACL_SUCCESS) {
+        return true;
+    }
+    std::fprintf(stderr, "[ERROR] %s failed (aclError=%d)\n", what, static_cast<int>(err));
+    return false;
+}
+
+// Runs one case: loads <case>/input1.bin and <case>/input2.bin, launches the kernel on
+// `stream`, and writes the device result to <case>/output.bin.
+// Returns 0 on success and 1 on any allocation, file, copy or synchronization failure.
+// `deviceId` is not used by the body; it is kept so the signature stays unchanged.
+static int RunCase(const TestCase &tc, int deviceId, aclrtStream stream) {
+    (void)deviceId;
+    int rc = 0;
+    const size_t elemCount = tc.rows * tc.cols;
+    const size_t fileSize = elemCount * tc.elemSize;
+
+    std::printf("[INFO] === case: %s (shape=%zux%zu, valid=%zux%zu) ===\n",
+                tc.name, tc.rows, tc.cols, tc.validRows, tc.validCols);
+
+    // Per-case data directory
+    std::string caseDir = std::string("./") + tc.name;
+    size_t src0FileSize = fileSize;
+    size_t src1FileSize = fileSize;
+
+    int32_t *src0Host = nullptr, *src1Host = nullptr, *dstHost = nullptr;
+    int32_t *src0Device = nullptr, *src1Device = nullptr, *dstDevice = nullptr;
+
+    // || short-circuits, so nothing further is allocated after the first failure;
+    // the cleanup at the end of the function releases whatever was obtained.
+    if (!AclCallOk(aclrtMallocHost((void **)(&src0Host), fileSize), "aclrtMallocHost(src0)") ||
+        !AclCallOk(aclrtMallocHost((void **)(&src1Host), fileSize), "aclrtMallocHost(src1)") ||
+        !AclCallOk(aclrtMallocHost((void **)(&dstHost), fileSize), "aclrtMallocHost(dst)") ||
+        !AclCallOk(aclrtMalloc((void **)&src0Device, fileSize, ACL_MEM_MALLOC_HUGE_FIRST), "aclrtMalloc(src0)") ||
+        !AclCallOk(aclrtMalloc((void **)&src1Device, fileSize, ACL_MEM_MALLOC_HUGE_FIRST), "aclrtMalloc(src1)") ||
+        !AclCallOk(aclrtMalloc((void **)&dstDevice, fileSize, ACL_MEM_MALLOC_HUGE_FIRST), "aclrtMalloc(dst)")) {
+        rc = 1;
+    }
+
+    if (rc == 0 && !ReadFile((caseDir + "/input1.bin").c_str(), src0FileSize, src0Host, fileSize)) {
+        std::fprintf(stderr, "[ERROR] failed to read %s/input1.bin\n", caseDir.c_str());
+        rc = 1;
+    }
+    if (rc == 0 && !ReadFile((caseDir + "/input2.bin").c_str(), src1FileSize, src1Host, fileSize)) {
+        std::fprintf(stderr, "[ERROR] failed to read %s/input2.bin\n", caseDir.c_str());
+        rc = 1;
+    }
+
+    if (rc == 0 &&
+        (!AclCallOk(aclrtMemcpy(src0Device, fileSize, src0Host, fileSize, ACL_MEMCPY_HOST_TO_DEVICE),
+                    "aclrtMemcpy(src0 host->device)") ||
+         !AclCallOk(aclrtMemcpy(src1Device, fileSize, src1Host, fileSize, ACL_MEMCPY_HOST_TO_DEVICE),
+                    "aclrtMemcpy(src1 host->device)"))) {
+        rc = 1;
+    }
+
+    if (rc == 0) {
+        tc.launch(src0Device, src1Device, dstDevice, stream);
+
+        // Never write out a result whose synchronization or read-back failed.
+        if (!AclCallOk(aclrtSynchronizeStream(stream), "aclrtSynchronizeStream") ||
+            !AclCallOk(aclrtMemcpy(dstHost, fileSize, dstDevice, fileSize, ACL_MEMCPY_DEVICE_TO_HOST),
+                       "aclrtMemcpy(device->host)")) {
+            rc = 1;
+        }
+    }
+
+    if (rc == 0 && !WriteFile((caseDir + "/output.bin").c_str(), dstHost, fileSize)) {
+        std::fprintf(stderr, "[ERROR] failed to write %s/output.bin\n", caseDir.c_str());
+        rc = 1;
+    }
+
+    if (src0Device != nullptr)
+        aclrtFree(src0Device);
+    if (src1Device != nullptr)
+        aclrtFree(src1Device);
+    if (dstDevice != nullptr)
+        aclrtFree(dstDevice);
+    if (src0Host != nullptr)
+        aclrtFreeHost(src0Host);
+    if (src1Host != nullptr)
+        aclrtFreeHost(src1Host);
+    if (dstHost != nullptr)
+        aclrtFreeHost(dstHost);
+
+    if (rc == 0)
+        std::printf("[INFO] case %s done\n", tc.name);
+    return rc;
+}
+
+// Entry point: initializes ACL, runs every case (or only the one named by argv[1]),
+// then tears ACL down. Returns 0 when every selected case succeeds, 1 otherwise.
+int main(int argc, char *argv[]) {
+    // Optional case filter: ./tor [case_name]
+    const char *caseFilter = (argc > 1) ? argv[1] : nullptr;
+
+    int rc = 0;
+    int deviceId = 0;
+    aclrtStream stream = nullptr;
+
+    if (!AclCallOk(aclInit(nullptr), "aclInit")) {
+        return 1;
+    }
+    if (const char *envDevice = std::getenv("ACL_DEVICE_ID")) {
+        deviceId = std::atoi(envDevice);
+    }
+    // Without a device and a stream no case can run; fall through to teardown.
+    const bool deviceSet = AclCallOk(aclrtSetDevice(deviceId), "aclrtSetDevice");
+    if (!deviceSet || !AclCallOk(aclrtCreateStream(&stream), "aclrtCreateStream")) {
+        rc = 1;
+    }
+
+    for (size_t i = 0; rc == 0 && i < kNumCases; ++i) {
+        if (caseFilter != nullptr && std::strcmp(kCases[i].name, caseFilter) != 0) {
+            continue;
+        }
+        int ret = RunCase(kCases[i], deviceId, stream);
+        if (ret != 0) {
+            std::fprintf(stderr, "[ERROR] case %s failed\n", kCases[i].name);
+            rc = 1;
+            break;
+        }
+    }
+
+    if (stream != nullptr)
+        aclrtDestroyStream(stream);
+    if (deviceSet)
+        aclrtResetDevice(deviceId);
+    aclFinalize();
+
+    return rc;
+}
diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tor/tor.pto b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tor/tor.pto
new file mode 100644
index 000000000..fa593bde0
--- /dev/null
+++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tor/tor.pto
@@ -0,0 +1,124 @@
+// Copyright (c) 2026 Huawei Technologies Co., Ltd.
+// This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+// CANN Open Software License Agreement Version 2.0 (the "License").
+// Please refer to the License for details. You may not use this file except in compliance with the License.
+// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+// See LICENSE in the root of the software repository for the full text of the License.
+
+// TileLang ST kernels for pto.tor: tload(a) + tload(b) + tor(a,b)->c + tstore(c).
+// Multiple cases with different shapes in a single module.
+// Compiled by ptoas --enable-insert-sync --enable-tile-op-expand --pto-backend=vpto
+// to produce a fatobj object.
+ +module attributes {pto.target_arch = "a5", pto.kernel_kind = #pto.kernel_kind} { + // Case 0: i32 16x64 (1024 elements) + func.func @TOR_i32_16x64(%a_ptr: !pto.ptr, %b_ptr: !pto.ptr, %c_ptr: !pto.ptr) attributes {pto.kernel} { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c16 = arith.constant 16 : index + %c64 = arith.constant 64 : index + %c1024 = arith.constant 1024 : index + + %a_view = pto.make_tensor_view %a_ptr, + shape = [%c1, %c1, %c1, %c16, %c64], + strides = [%c1024, %c1024, %c1024, %c64, %c1] + : !pto.tensor_view<1x1x1x16x64xi32> + %b_view = pto.make_tensor_view %b_ptr, + shape = [%c1, %c1, %c1, %c16, %c64], + strides = [%c1024, %c1024, %c1024, %c64, %c1] + : !pto.tensor_view<1x1x1x16x64xi32> + %c_view = pto.make_tensor_view %c_ptr, + shape = [%c1, %c1, %c1, %c16, %c64], + strides = [%c1024, %c1024, %c1024, %c64, %c1] + : !pto.tensor_view<1x1x1x16x64xi32> + + %a_part = pto.partition_view %a_view, + offsets = [%c0, %c0, %c0, %c0, %c0], + sizes = [%c1, %c1, %c1, %c16, %c64] + : !pto.tensor_view<1x1x1x16x64xi32> -> !pto.partition_tensor_view<1x1x1x16x64xi32> + %b_part = pto.partition_view %b_view, + offsets = [%c0, %c0, %c0, %c0, %c0], + sizes = [%c1, %c1, %c1, %c16, %c64] + : !pto.tensor_view<1x1x1x16x64xi32> -> !pto.partition_tensor_view<1x1x1x16x64xi32> + %c_part = pto.partition_view %c_view, + offsets = [%c0, %c0, %c0, %c0, %c0], + sizes = [%c1, %c1, %c1, %c16, %c64] + : !pto.tensor_view<1x1x1x16x64xi32> -> !pto.partition_tensor_view<1x1x1x16x64xi32> + + %a = pto.alloc_tile + : !pto.tile_buf + %b = pto.alloc_tile + : !pto.tile_buf + %c = pto.alloc_tile + : !pto.tile_buf + + pto.tload ins(%a_part : !pto.partition_tensor_view<1x1x1x16x64xi32>) + outs(%a : !pto.tile_buf) + pto.tload ins(%b_part : !pto.partition_tensor_view<1x1x1x16x64xi32>) + outs(%b : !pto.tile_buf) + + pto.tor ins(%a, %b : !pto.tile_buf, + !pto.tile_buf) + outs(%c : !pto.tile_buf) + + pto.tstore ins(%c : !pto.tile_buf) + outs(%c_part : 
!pto.partition_tensor_view<1x1x1x16x64xi32>) + return + } + + // Case 1: i32 32x32 (1024 elements) + + func.func @TOR_i32_32x32(%a_ptr: !pto.ptr, %b_ptr: !pto.ptr, %c_ptr: !pto.ptr) attributes {pto.kernel} { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c32 = arith.constant 32 : index + %c1024 = arith.constant 1024 : index + + %a_view = pto.make_tensor_view %a_ptr, + shape = [%c1, %c1, %c1, %c32, %c32], + strides = [%c1024, %c1024, %c1024, %c32, %c1] + : !pto.tensor_view<1x1x1x32x32xi32> + %b_view = pto.make_tensor_view %b_ptr, + shape = [%c1, %c1, %c1, %c32, %c32], + strides = [%c1024, %c1024, %c1024, %c32, %c1] + : !pto.tensor_view<1x1x1x32x32xi32> + %c_view = pto.make_tensor_view %c_ptr, + shape = [%c1, %c1, %c1, %c32, %c32], + strides = [%c1024, %c1024, %c1024, %c32, %c1] + : !pto.tensor_view<1x1x1x32x32xi32> + + %a_part = pto.partition_view %a_view, + offsets = [%c0, %c0, %c0, %c0, %c0], + sizes = [%c1, %c1, %c1, %c32, %c32] + : !pto.tensor_view<1x1x1x32x32xi32> -> !pto.partition_tensor_view<1x1x1x32x32xi32> + %b_part = pto.partition_view %b_view, + offsets = [%c0, %c0, %c0, %c0, %c0], + sizes = [%c1, %c1, %c1, %c32, %c32] + : !pto.tensor_view<1x1x1x32x32xi32> -> !pto.partition_tensor_view<1x1x1x32x32xi32> + %c_part = pto.partition_view %c_view, + offsets = [%c0, %c0, %c0, %c0, %c0], + sizes = [%c1, %c1, %c1, %c32, %c32] + : !pto.tensor_view<1x1x1x32x32xi32> -> !pto.partition_tensor_view<1x1x1x32x32xi32> + + %a = pto.alloc_tile + : !pto.tile_buf + %b = pto.alloc_tile + : !pto.tile_buf + %c = pto.alloc_tile + : !pto.tile_buf + + pto.tload ins(%a_part : !pto.partition_tensor_view<1x1x1x32x32xi32>) + outs(%a : !pto.tile_buf) + pto.tload ins(%b_part : !pto.partition_tensor_view<1x1x1x32x32xi32>) + outs(%b : !pto.tile_buf) + + pto.tor ins(%a, %b : !pto.tile_buf, + !pto.tile_buf) + outs(%c : !pto.tile_buf) + + pto.tstore ins(%c : !pto.tile_buf) + outs(%c_part : !pto.partition_tensor_view<1x1x1x32x32xi32>) + return + } +} diff --git 
a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tors/CMakeLists.txt b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tors/CMakeLists.txt new file mode 100644 index 000000000..5decd02d7 --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tors/CMakeLists.txt @@ -0,0 +1,9 @@ +# Copyright (c) 2026 Huawei Technologies Co., Ltd. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. + +pto_tilelang_vec_st(tors) diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tors/cases.py b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tors/cases.py new file mode 100644 index 000000000..30222ef97 --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tors/cases.py @@ -0,0 +1,49 @@ +# Copyright (c) 2026 Huawei Technologies Co., Ltd. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. 
+ +# coding=utf-8 + +import numpy as np + +CASES = [ + { + "name": "i32_32x64", + "dtype": np.int32, + "shape": (32, 64), + "valid_shape": (32, 64), + "eps": 0, + }, + { + "name": "i16_63x64", + "dtype": np.int16, + "shape": (63, 64), + "valid_shape": (63, 64), + "eps": 0, + }, + { + "name": "i32_31x128", + "dtype": np.int32, + "shape": (31, 128), + "valid_shape": (31, 128), + "eps": 0, + }, + { + "name": "i16_15x192", + "dtype": np.int16, + "shape": (15, 192), + "valid_shape": (15, 192), + "eps": 0, + }, +] + +_SMOKE_CASE_NAMES = ['i32_32x64', 'i16_15x192'] +_SMOKE_CASE_NAME_SET = set(_SMOKE_CASE_NAMES) +_missing = [name for name in _SMOKE_CASE_NAMES if name not in {case["name"] for case in CASES}] +if _missing: + raise RuntimeError("unknown smoke case(s): " + ", ".join(_missing)) +CASES = [case for case in CASES if case["name"] in _SMOKE_CASE_NAME_SET] diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tors/compare.py b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tors/compare.py new file mode 100644 index 000000000..50186777e --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tors/compare.py @@ -0,0 +1,46 @@ +# Copyright (c) 2026 Huawei Technologies Co., Ltd. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. 
+ +# coding=utf-8 + +import os +import sys +import numpy as np + +from cases import CASES +from st_common import result_cmp, style_fail, style_pass, validate_cases + +def main(): + validate_cases(CASES) + case_filter = sys.argv[1] if len(sys.argv) > 1 else None + + all_passed = True + for case in CASES: + if case_filter is not None and case["name"] != case_filter: + continue + + case_dir = case["name"] + shape = case["shape"] + vr, vc = case["valid_shape"] + + golden = np.fromfile(os.path.join(case_dir, "golden.bin"), dtype=case["dtype"]).reshape(shape) + output = np.fromfile(os.path.join(case_dir, "output.bin"), dtype=case["dtype"]).reshape(shape) + + ok = result_cmp(golden[:vr, :vc], output[:vr, :vc], case["eps"]) + if ok: + print(style_pass(f"[INFO] {case['name']}: compare passed")) + else: + print(style_fail(f"[ERROR] {case['name']}: compare failed")) + all_passed = False + + if not all_passed: + sys.exit(2) + print(style_pass("[INFO] all cases passed")) + +if __name__ == "__main__": + main() diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tors/gen_data.py b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tors/gen_data.py new file mode 100644 index 000000000..c4c879dcd --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tors/gen_data.py @@ -0,0 +1,35 @@ +# Copyright (c) 2026 Huawei Technologies Co., Ltd. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. 
+ +# coding=utf-8 + +import numpy as np +from cases import CASES +from st_common import validate_cases, setup_case_rng, save_case_data + +# Scalar value for bitwise OR (must match launch.cpp) +SCALAR = 3 + +validate_cases(CASES) + +for case in CASES: + setup_case_rng(case) + + dtype = case["dtype"] + shape = case["shape"] + valid_shape = case["valid_shape"] + + input1 = np.random.randint(1, 10, size=shape).astype(dtype) + + golden = np.zeros(shape, dtype=dtype) + vr, vc = valid_shape + scalar_val = dtype(SCALAR) + golden[:vr, :vc] = (input1[:vr, :vc] | scalar_val).astype(dtype, copy=False) + + save_case_data(case["name"], {"input1": input1, "golden": golden}) + print(f"[INFO] gen_data: {case['name']} shape={shape} valid_shape={valid_shape} dtype={dtype.__name__} scalar={SCALAR}") diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tors/launch.cpp b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tors/launch.cpp new file mode 100644 index 000000000..335f9cdc7 --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tors/launch.cpp @@ -0,0 +1,32 @@ +// Copyright (c) 2026 Huawei Technologies Co., Ltd. +// This program is free software, you can redistribute it and/or modify it under the terms and conditions of +// CANN Open Software License Agreement Version 2.0 (the "License"). +// Please refer to the License for details. You may not use this file except in compliance with the License. +// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +// See LICENSE in the root of the software repository for the full text of the License. 
+ +#include + +#ifndef AICORE +#define AICORE [aicore] +#endif + +// Scalar value for bitwise OR (must match gen_data.py SCALAR) +static constexpr int32_t TORS_SCALAR_I32 = 3; +static constexpr int16_t TORS_SCALAR_I16 = 3; + +// Case 0: i32 32x64 + +extern "C" __global__ AICORE void TORS_i32_32x64(__gm__ int32_t *src, __gm__ int32_t *dst, int32_t scalar); +extern "C" __global__ AICORE void TORS_i16_15x192(__gm__ int16_t *src, __gm__ int16_t *dst, int16_t scalar); + +void LaunchTORS_i32_32x64(int32_t *src, int32_t *dst, void *stream) { + TORS_i32_32x64<<<1, nullptr, stream>>>((__gm__ int32_t *)src, (__gm__ int32_t *)dst, TORS_SCALAR_I32); +} + + + +void LaunchTORS_i16_15x192(int16_t *src, int16_t *dst, void *stream) { + TORS_i16_15x192<<<1, nullptr, stream>>>((__gm__ int16_t *)src, (__gm__ int16_t *)dst, TORS_SCALAR_I16); +} diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tors/main.cpp b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tors/main.cpp new file mode 100644 index 000000000..b242048fa --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tors/main.cpp @@ -0,0 +1,132 @@ +// Copyright (c) 2026 Huawei Technologies Co., Ltd. +// This program is free software, you can redistribute it and/or modify it under the terms and conditions of +// CANN Open Software License Agreement Version 2.0 (the "License"). +// Please refer to the License for details. You may not use this file except in compliance with the License. +// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +// See LICENSE in the root of the software repository for the full text of the License. + +// Host driver for TileLang tors ST — case-table driven. +// tors: dst = src | scalar (single input + scalar, bitwise OR). +// Numerical comparison is done externally by compare.py. 
+ +#include "acl/acl.h" +#include "test_common.h" +#include +#include +#include +#include +#include +#include + +using namespace PtoTestCommon; + +// Kernel launch wrappers (defined in launch.cpp) +void LaunchTORS_i32_32x64(int32_t *src, int32_t *dst, void *stream); +void LaunchTORS_i16_63x64(int16_t *src, int16_t *dst, void *stream); +void LaunchTORS_i16_15x192(int16_t *src, int16_t *dst, void *stream); + +struct TestCase { + const char *name; + void (*launch)(void *, void *, void *); // src, dst, stream + size_t rows; // allocated tile rows + size_t cols; // allocated tile cols + size_t validRows; // effective computation rows (<= rows) + size_t validCols; // effective computation cols (<= cols) + size_t elemSize; // bytes per element +}; + +static const TestCase kCases[] = { +{"i32_32x64", (void (*)(void*,void*,void*))LaunchTORS_i32_32x64, 32, 64, 32, 64, sizeof(int32_t)}, +{"i16_15x192", (void (*)(void*,void*,void*))LaunchTORS_i16_15x192, 15, 192, 15, 192, sizeof(int16_t)}, +}; +static constexpr size_t kNumCases = sizeof(kCases) / sizeof(kCases[0]); + +static int RunCase(const TestCase &tc, int deviceId, aclrtStream stream) { + int rc = 0; + const size_t elemCount = tc.rows * tc.cols; + const size_t fileSize = elemCount * tc.elemSize; + + std::printf("[INFO] === case: %s (shape=%zux%zu, valid=%zux%zu) ===\n", + tc.name, tc.rows, tc.cols, tc.validRows, tc.validCols); + + // Per-case data directory + std::string caseDir = std::string("./") + tc.name; + size_t srcFileSize = fileSize; + + void *srcHost = nullptr, *dstHost = nullptr; + void *srcDevice = nullptr, *dstDevice = nullptr; + + aclrtMallocHost(&srcHost, fileSize); + aclrtMallocHost(&dstHost, fileSize); + + aclrtMalloc(&srcDevice, fileSize, ACL_MEM_MALLOC_HUGE_FIRST); + aclrtMalloc(&dstDevice, fileSize, ACL_MEM_MALLOC_HUGE_FIRST); + + if (!ReadFile((caseDir + "/input1.bin").c_str(), srcFileSize, srcHost, fileSize)) { + std::fprintf(stderr, "[ERROR] failed to read %s/input1.bin\n", caseDir.c_str()); + rc = 
1; + } + + if (rc == 0) { + aclrtMemcpy(srcDevice, fileSize, srcHost, fileSize, ACL_MEMCPY_HOST_TO_DEVICE); + + tc.launch(srcDevice, dstDevice, stream); + + aclrtSynchronizeStream(stream); + aclrtMemcpy(dstHost, fileSize, dstDevice, fileSize, ACL_MEMCPY_DEVICE_TO_HOST); + } + + if (rc == 0 && !WriteFile((caseDir + "/output.bin").c_str(), dstHost, fileSize)) { + std::fprintf(stderr, "[ERROR] failed to write %s/output.bin\n", caseDir.c_str()); + rc = 1; + } + + if (srcDevice != nullptr) + aclrtFree(srcDevice); + if (dstDevice != nullptr) + aclrtFree(dstDevice); + if (srcHost != nullptr) + aclrtFreeHost(srcHost); + if (dstHost != nullptr) + aclrtFreeHost(dstHost); + + if (rc == 0) + std::printf("[INFO] case %s done\n", tc.name); + return rc; +} + +int main(int argc, char *argv[]) { + // Optional case filter: ./tors [case_name] + const char *caseFilter = (argc > 1) ? argv[1] : nullptr; + + int rc = 0; + int deviceId = 0; + aclrtStream stream = nullptr; + + aclInit(nullptr); + if (const char *envDevice = std::getenv("ACL_DEVICE_ID")) { + deviceId = std::atoi(envDevice); + } + aclrtSetDevice(deviceId); + aclrtCreateStream(&stream); + + for (size_t i = 0; i < kNumCases; ++i) { + if (caseFilter != nullptr && std::strcmp(kCases[i].name, caseFilter) != 0) { + continue; + } + int ret = RunCase(kCases[i], deviceId, stream); + if (ret != 0) { + std::fprintf(stderr, "[ERROR] case %s failed\n", kCases[i].name); + rc = 1; + break; + } + } + + if (stream != nullptr) + aclrtDestroyStream(stream); + aclrtResetDevice(deviceId); + aclFinalize(); + + return rc; +} diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tors/tors.pto b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tors/tors.pto new file mode 100644 index 000000000..6c25d7000 --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tors/tors.pto @@ -0,0 +1,96 @@ +// Copyright (c) 2026 Huawei Technologies Co., Ltd. 
+// This program is free software, you can redistribute it and/or modify it under the terms and conditions of +// CANN Open Software License Agreement Version 2.0 (the "License"). +// Please refer to the License for details. You may not use this file except in compliance with the License. +// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +// See LICENSE in the root of the software repository for the full text of the License. + +// TileLang ST kernels for pto.tors: tload(src) + tors(src, scalar)->dst + tstore(dst). +// Multiple cases with different shapes/dtypes in a single module. +// Compiled by ptoas --enable-insert-sync --enable-tile-op-expand --pto-backend=vpto +// to produce a fatobj object. + +module attributes {pto.target_arch = "a5", pto.kernel_kind = #pto.kernel_kind} { + + // Case 0: i32 32x64 (2048 elements) + func.func @TORS_i32_32x64(%src_ptr: !pto.ptr, %dst_ptr: !pto.ptr, %scalar: i32) attributes {pto.kernel} { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c32 = arith.constant 32 : index + %c64 = arith.constant 64 : index + %c2048 = arith.constant 2048 : index + + %src_view = pto.make_tensor_view %src_ptr, + shape = [%c1, %c1, %c1, %c32, %c64], + strides = [%c2048, %c2048, %c2048, %c64, %c1] + : !pto.tensor_view<1x1x1x32x64xi32> + %dst_view = pto.make_tensor_view %dst_ptr, + shape = [%c1, %c1, %c1, %c32, %c64], + strides = [%c2048, %c2048, %c2048, %c64, %c1] + : !pto.tensor_view<1x1x1x32x64xi32> + + %src_part = pto.partition_view %src_view, + offsets = [%c0, %c0, %c0, %c0, %c0], + sizes = [%c1, %c1, %c1, %c32, %c64] + : !pto.tensor_view<1x1x1x32x64xi32> -> !pto.partition_tensor_view<1x1x1x32x64xi32> + %dst_part = pto.partition_view %dst_view, + offsets = [%c0, %c0, %c0, %c0, %c0], + sizes = [%c1, %c1, %c1, %c32, %c64] + : !pto.tensor_view<1x1x1x32x64xi32> ->
!pto.partition_tensor_view<1x1x1x32x64xi32> + + %src = pto.alloc_tile + : !pto.tile_buf + %dst = pto.alloc_tile + : !pto.tile_buf + + pto.tload ins(%src_part : !pto.partition_tensor_view<1x1x1x32x64xi32>) + outs(%src : !pto.tile_buf) + pto.tors ins(%src, %scalar : !pto.tile_buf, i32) + outs(%dst : !pto.tile_buf) + pto.tstore ins(%dst : !pto.tile_buf) + outs(%dst_part : !pto.partition_tensor_view<1x1x1x32x64xi32>) + return + } + + // Case 1: i16 15x192 (2880 elements) + + func.func @TORS_i16_15x192(%src_ptr: !pto.ptr, %dst_ptr: !pto.ptr, %scalar: i16) attributes {pto.kernel} { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c15 = arith.constant 15 : index + %c192 = arith.constant 192 : index + %c2880 = arith.constant 2880 : index + + %src_view = pto.make_tensor_view %src_ptr, + shape = [%c1, %c1, %c1, %c15, %c192], + strides = [%c2880, %c2880, %c2880, %c192, %c1] + : !pto.tensor_view<1x1x1x15x192xi16> + %dst_view = pto.make_tensor_view %dst_ptr, + shape = [%c1, %c1, %c1, %c15, %c192], + strides = [%c2880, %c2880, %c2880, %c192, %c1] + : !pto.tensor_view<1x1x1x15x192xi16> + + %src_part = pto.partition_view %src_view, + offsets = [%c0, %c0, %c0, %c0, %c0], + sizes = [%c1, %c1, %c1, %c15, %c192] + : !pto.tensor_view<1x1x1x15x192xi16> -> !pto.partition_tensor_view<1x1x1x15x192xi16> + %dst_part = pto.partition_view %dst_view, + offsets = [%c0, %c0, %c0, %c0, %c0], + sizes = [%c1, %c1, %c1, %c15, %c192] + : !pto.tensor_view<1x1x1x15x192xi16> -> !pto.partition_tensor_view<1x1x1x15x192xi16> + + %src = pto.alloc_tile + : !pto.tile_buf + %dst = pto.alloc_tile + : !pto.tile_buf + + pto.tload ins(%src_part : !pto.partition_tensor_view<1x1x1x15x192xi16>) + outs(%src : !pto.tile_buf) + pto.tors ins(%src, %scalar : !pto.tile_buf, i16) + outs(%dst : !pto.tile_buf) + pto.tstore ins(%dst : !pto.tile_buf) + outs(%dst_part : !pto.partition_tensor_view<1x1x1x15x192xi16>) + return + } +} diff --git
a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tpartadd/CMakeLists.txt b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tpartadd/CMakeLists.txt new file mode 100644 index 000000000..2b317621d --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tpartadd/CMakeLists.txt @@ -0,0 +1,9 @@ +# Copyright (c) 2026 Huawei Technologies Co., Ltd. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. + +pto_tilelang_vec_st(tpartadd) diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tpartadd/cases.py b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tpartadd/cases.py new file mode 100644 index 000000000..0a510e65c --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tpartadd/cases.py @@ -0,0 +1,129 @@ +#!/usr/bin/python3 +# Copyright (c) 2026 Huawei Technologies Co., Ltd. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. + +# coding=utf-8 + +"""Single source of truth for tpartadd ST test cases. 
+ +Each case defines: + - name: case identifier, used as subdirectory name and by main.cpp kCases[]. + - dtype: numpy dtype (e.g. np.float32). + - shape: (rows, cols) — allocated tile dimensions (same for src0/src1/dst). + - valid_shape: (valid_rows, valid_cols) — src0 valid region (src0_eq_dst scenario). + - src1_vshape: (src1_valid_rows, src1_valid_cols) — src1 valid region. + May be smaller than dst valid region for partial add cases. + - dst_vshape: (dst_valid_rows, dst_valid_cols) — dst valid region. + - eps: tolerance for numpy.allclose (atol and rtol). + +tpartadd semantics: + - If src0_valid == dst_valid: dst[:src1_rows,:src1_cols] = src0[:src1_rows,:src1_cols] + src1[:src1_rows,:src1_cols] + dst[src1_rows:,:] = src0[src1_rows:,:] (copy remaining rows) + OR (for col_less) dst[:,:src1_cols] = src0[:,:src1_cols] + src1[:,:src1_cols] + dst[:,src1_cols:] = src0[:,src1_cols:] (copy remaining cols) + - If src1_valid == dst_valid: similar logic with src1 as the full operand. + +gen_data.py and compare.py both import this list to avoid redundant definitions. 
+""" + +import numpy as np + +CASES = [ + # float16 cases + { + "name": "f16_8x48_src0_col_less", + "dtype": np.float16, + "shape": (8, 48), + "valid_shape": (8, 16), # src0 valid region (col_less) + "src1_vshape": (8, 48), # src1 valid region (equals dst) + "dst_vshape": (8, 48), # dst valid region + "eps": 1e-3, + }, + # float32 cases + { + "name": "f32_64x64_full", + "dtype": np.float32, + "shape": (64, 64), + "valid_shape": (64, 64), # src0 valid region + "src1_vshape": (64, 64), # src1 valid region (same as dst) + "dst_vshape": (64, 64), # dst valid region + "eps": 1e-6, + }, + { + "name": "f32_64x64_src0_row_less", + "dtype": np.float32, + "shape": (64, 64), + "valid_shape": (8, 64), # src0 valid region (row_less) + "src1_vshape": (64, 64), # src1 valid region (equals dst) + "dst_vshape": (64, 64), # dst valid region + "eps": 1e-6, + }, + { + "name": "f32_64x64_src0_col_less", + "dtype": np.float32, + "shape": (64, 64), + "valid_shape": (64, 8), # src0 valid region (col_less) + "src1_vshape": (64, 64), # src1 valid region (equals dst) + "dst_vshape": (64, 64), # dst valid region + "eps": 1e-6, + }, + { + "name": "f32_64x64_src1_row_less", + "dtype": np.float32, + "shape": (64, 64), + "valid_shape": (64, 64), # src0 valid region (equals dst) + "src1_vshape": (8, 64), # src1 valid region (row_less) + "dst_vshape": (64, 64), # dst valid region + "eps": 1e-6, + }, + { + "name": "f32_64x64_src1_col_less", + "dtype": np.float32, + "shape": (64, 64), + "valid_shape": (64, 64), # src0 valid region (equals dst) + "src1_vshape": (64, 8), # src1 valid region (col_less) + "dst_vshape": (64, 64), # dst valid region + "eps": 1e-6, + }, + { + "name": "f16_8x768_src0_col_less", + "dtype": np.float16, + "shape": (8, 768), + "valid_shape": (8, 512), # src0 valid region (col_less) + "src1_vshape": (8, 768), # src1 valid region (equals dst) + "dst_vshape": (8, 768), # dst valid region + "eps": 1e-3, + }, + # int16 cases + { + "name": "i16_8x48_src1_col_less", + "dtype": 
np.int16, + "shape": (8, 48), + "valid_shape": (8, 48), # src0 valid region (equals dst) + "src1_vshape": (8, 16), # src1 valid region (col_less) + "dst_vshape": (8, 48), # dst valid region + "eps": 0, # exact match for int + }, + # int32 cases + { + "name": "i32_64x64_src0_row_less", + "dtype": np.int32, + "shape": (64, 64), + "valid_shape": (8, 64), # src0 valid region (row_less) + "src1_vshape": (64, 64), # src1 valid region (equals dst) + "dst_vshape": (64, 64), # dst valid region + "eps": 0, # exact match for int + }, +] + +_SMOKE_CASE_NAMES = ['f16_8x48_src0_col_less', 'i16_8x48_src1_col_less'] +_SMOKE_CASE_NAME_SET = set(_SMOKE_CASE_NAMES) +_missing = [name for name in _SMOKE_CASE_NAMES if name not in {case["name"] for case in CASES}] +if _missing: + raise RuntimeError("unknown smoke case(s): " + ", ".join(_missing)) +CASES = [case for case in CASES if case["name"] in _SMOKE_CASE_NAME_SET] diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tpartadd/compare.py b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tpartadd/compare.py new file mode 100644 index 000000000..296e28e22 --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tpartadd/compare.py @@ -0,0 +1,53 @@ +#!/usr/bin/python3 +# Copyright (c) 2026 Huawei Technologies Co., Ltd. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. 
+ +# coding=utf-8 + +import os +import sys +import numpy as np + +# Add parent directory to path for st_common import +sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) +from st_common import result_cmp, style_fail, style_pass + +from cases import CASES + + +def main(): + case_filter = sys.argv[1] if len(sys.argv) > 1 else None + + all_passed = True + for case in CASES: + if case_filter is not None and case["name"] != case_filter: + continue + + case_dir = case["name"] + shape = case["shape"] + dtype = case["dtype"] + dst_vr, dst_vc = case["dst_vshape"] + + golden = np.fromfile(os.path.join(case_dir, "golden.bin"), dtype=dtype).reshape(shape) + output = np.fromfile(os.path.join(case_dir, "output.bin"), dtype=dtype).reshape(shape) + + # Compare only the dst valid region + ok = result_cmp(golden[:dst_vr, :dst_vc], output[:dst_vr, :dst_vc], case["eps"]) + if ok: + print(style_pass(f"[INFO] {case['name']}: compare passed")) + else: + print(style_fail(f"[ERROR] {case['name']}: compare failed")) + all_passed = False + + if not all_passed: + sys.exit(2) + print(style_pass("[INFO] all cases passed")) + + +if __name__ == "__main__": + main() diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tpartadd/gen_data.py b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tpartadd/gen_data.py new file mode 100644 index 000000000..e5333735c --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tpartadd/gen_data.py @@ -0,0 +1,96 @@ +#!/usr/bin/python3 +# Copyright (c) 2026 Huawei Technologies Co., Ltd. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. 
+# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. + +# coding=utf-8 + +import numpy as np +import os +import sys + +# Add parent directory to path for st_common import +sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) +from st_common import setup_case_rng, save_case_data + +from cases import CASES + + +def _to_tuple(shape): + """Convert shape to tuple if needed.""" + if isinstance(shape, tuple): + return shape + return tuple(shape) + + +for case in CASES: + setup_case_rng(case) + + dtype = case["dtype"] + shape = _to_tuple(case["shape"]) + src0_valid = _to_tuple(case["valid_shape"]) + src1_valid = _to_tuple(case["src1_vshape"]) + dst_valid = _to_tuple(case["dst_vshape"]) + + rows, cols = shape + src0_vr, src0_vc = src0_valid + src1_vr, src1_vc = src1_valid + dst_vr, dst_vc = dst_valid + + input1 = np.random.randint(1, 10, size=shape).astype(dtype) + input2 = np.random.randint(1, 10, size=shape).astype(dtype) + + golden = np.zeros(shape, dtype=dtype) + + # Compute golden according to tpartadd semantics from template: + # If src0_valid == dst_valid: use tpart_op with src0 as full operand + # - If src1 row_less: add for src1 region, copy src0 for remaining rows + # - If src1 col_less: copy src0 full, then add for overlapping region + # If src1_valid == dst_valid: use tpart_op with src1 as full operand (swap src0/src1) + + src0_eq_dst = (src0_vr == dst_vr and src0_vc == dst_vc) + src1_eq_dst = (src1_vr == dst_vr and src1_vc == dst_vc) + + if src0_eq_dst: + # src0 is the full operand matching dst + src1_row_lt_dst = (src1_vr < dst_vr and src1_vc == dst_vc) + src1_col_lt_dst = (src1_vr <= dst_vr and src1_vc < dst_vc) + + if src1_eq_dst: + # Full add: dst[:] = src0[:] + src1[:] + golden[:dst_vr, 
:dst_vc] = (input1[:dst_vr, :dst_vc] + input2[:dst_vr, :dst_vc]).astype(dtype, copy=False) + elif src1_col_lt_dst: + # Col_less: first copy src0, then add in overlapping region + golden[:dst_vr, :dst_vc] = input1[:dst_vr, :dst_vc].copy() + if src1_vc > 0: + golden[:src1_vr, :src1_vc] = (input1[:src1_vr, :src1_vc] + input2[:src1_vr, :src1_vc]).astype(dtype, copy=False) + elif src1_row_lt_dst: + # Row_less: add for src1 region, copy src0 for remaining rows + if src1_vc > 0: + golden[:src1_vr, :src1_vc] = (input1[:src1_vr, :src1_vc] + input2[:src1_vr, :src1_vc]).astype(dtype, copy=False) + golden[src1_vr:dst_vr, :dst_vc] = input1[src1_vr:dst_vr, :dst_vc].copy() + elif src1_eq_dst: + # src1 is the full operand matching dst, swap src0/src1 in the logic + src0_row_lt_dst = (src0_vr < dst_vr and src0_vc == dst_vc) + src0_col_lt_dst = (src0_vr <= dst_vr and src0_vc < dst_vc) + + if src0_eq_dst: + # Full add: dst[:] = src0[:] + src1[:] + golden[:dst_vr, :dst_vc] = (input1[:dst_vr, :dst_vc] + input2[:dst_vr, :dst_vc]).astype(dtype, copy=False) + elif src0_col_lt_dst: + # Col_less: first copy src1, then add in overlapping region + golden[:dst_vr, :dst_vc] = input2[:dst_vr, :dst_vc].copy() + if src0_vc > 0: + golden[:src0_vr, :src0_vc] = (input1[:src0_vr, :src0_vc] + input2[:src0_vr, :src0_vc]).astype(dtype, copy=False) + elif src0_row_lt_dst: + # Row_less: add for src0 region, copy src1 for remaining rows + if src0_vc > 0: + golden[:src0_vr, :src0_vc] = (input1[:src0_vr, :src0_vc] + input2[:src0_vr, :src0_vc]).astype(dtype, copy=False) + golden[src0_vr:dst_vr, :dst_vc] = input2[src0_vr:dst_vr, :dst_vc].copy() + + save_case_data(case["name"], {"input1": input1, "input2": input2, "golden": golden}) + print(f"[INFO] gen_data: {case['name']} shape={shape} src0_valid={src0_valid} src1_valid={src1_valid} dst_valid={dst_valid} dtype={dtype.__name__}") diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tpartadd/launch.cpp 
b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tpartadd/launch.cpp new file mode 100644 index 000000000..560a0a4a1 --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tpartadd/launch.cpp @@ -0,0 +1,28 @@ +// Copyright (c) 2026 Huawei Technologies Co., Ltd. +// This program is free software, you can redistribute it and/or modify it under the terms and conditions of +// CANN Open Software License Agreement Version 2.0 (the "License"). +// Please refer to the License for details. You may not use this file except in compliance with the License. +// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +// See LICENSE in the root of the software repository for the full text of the License. + +#include + +#ifndef AICORE +#define AICORE [aicore] +#endif + +// Smoke kernels: f16 8x48 src0 col_less and i16 8x48 src1 col_less + +extern "C" __global__ AICORE void TPARTADD_f16_8x48_src0_col_less(__gm__ uint16_t *a, __gm__ uint16_t *b, __gm__ uint16_t *c); +extern "C" __global__ AICORE void TPARTADD_i16_8x48_src1_col_less(__gm__ int16_t *a, __gm__ int16_t *b, __gm__ int16_t *c); + +void LaunchTPARTADD_f16_8x48_src0_col_less(uint16_t *a, uint16_t *b, uint16_t *c, void *stream) { + TPARTADD_f16_8x48_src0_col_less<<<1, nullptr, stream>>>((__gm__ uint16_t *)a, (__gm__ uint16_t *)b, (__gm__ uint16_t *)c); +} + + + +void LaunchTPARTADD_i16_8x48_src1_col_less(int16_t *a, int16_t *b, int16_t *c, void *stream) { + TPARTADD_i16_8x48_src1_col_less<<<1, nullptr, stream>>>((__gm__ int16_t *)a, (__gm__ int16_t *)b, (__gm__ int16_t *)c); +} diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tpartadd/main.cpp b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tpartadd/main.cpp new file mode 100644 index 000000000..7235d212c --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tpartadd/main.cpp @@ -0,0 +1,152 @@ +// Copyright (c) 2026 Huawei
Technologies Co., Ltd. +// This program is free software, you can redistribute it and/or modify it under the terms and conditions of +// CANN Open Software License Agreement Version 2.0 (the "License"). +// Please refer to the License for details. You may not use this file except in compliance with the License. +// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +// See LICENSE in the root of the software repository for the full text of the License. + +// Host driver for TileLang tpartadd ST — case-table driven. +// Each case launches a different kernel variant, reads/writes from per-case subdirectory. +// Numerical comparison is done externally by compare.py. + +#include "acl/acl.h" +#include "test_common.h" +#include +#include +#include +#include +#include +#include + +using namespace PtoTestCommon; + +// Kernel launch wrappers (defined in launch.cpp). NOTE: the smoke launch.cpp defines only the f16/i16 8x48 wrappers; the f32 declarations are not referenced by kCases. +void LaunchTPARTADD_f32_64x64_src0_row_less(float *a, float *b, float *c, void *stream); +void LaunchTPARTADD_f32_64x64_src1_row_less(float *a, float *b, float *c, void *stream); +void LaunchTPARTADD_f16_8x48_src0_col_less(uint16_t *a, uint16_t *b, uint16_t *c, void *stream); +void LaunchTPARTADD_i16_8x48_src1_col_less(int16_t *a, int16_t *b, int16_t *c, void *stream); + +using LaunchFn = void (*)(void *, void *, void *, void *); + +struct TestCase { + const char *name; + LaunchFn launch; + size_t rows; // allocated tile rows + size_t cols; // allocated tile cols + size_t src0ValidRows; // src0 effective rows + size_t src0ValidCols; // src0 effective cols + size_t src1ValidRows; // src1 effective rows + size_t src1ValidCols; // src1 effective cols + size_t dstValidRows; // dst effective rows + size_t dstValidCols; // dst effective cols + size_t elemSize; // bytes per element +}; + +static const TestCase kCases[] = { +{"f16_8x48_src0_col_less",
reinterpret_cast(LaunchTPARTADD_f16_8x48_src0_col_less), 8, 48, 8, 16, 8, 48, 8, 48, sizeof(uint16_t)}, +{"i16_8x48_src1_col_less", reinterpret_cast(LaunchTPARTADD_i16_8x48_src1_col_less), 8, 48, 8, 48, 8, 16, 8, 48, sizeof(int16_t)}, +}; +static constexpr size_t kNumCases = sizeof(kCases) / sizeof(kCases[0]); + +static int RunCase(const TestCase &tc, int deviceId, aclrtStream stream) { + int rc = 0; + const size_t elemCount = tc.rows * tc.cols; + const size_t fileSize = elemCount * tc.elemSize; + + std::printf("[INFO] === case: %s (shape=%zux%zu, src0_valid=%zux%zu, src1_valid=%zux%zu, dst_valid=%zux%zu) ===\n", + tc.name, tc.rows, tc.cols, tc.src0ValidRows, tc.src0ValidCols, + tc.src1ValidRows, tc.src1ValidCols, tc.dstValidRows, tc.dstValidCols); + + // Per-case data directory + std::string caseDir = std::string("./") + tc.name; + size_t src0FileSize = fileSize; + size_t src1FileSize = fileSize; + + void *src0Host = nullptr, *src1Host = nullptr, *dstHost = nullptr; + void *src0Device = nullptr, *src1Device = nullptr, *dstDevice = nullptr; + + aclrtMallocHost((void **)(&src0Host), fileSize); + aclrtMallocHost((void **)(&src1Host), fileSize); + aclrtMallocHost((void **)(&dstHost), fileSize); + + aclrtMalloc((void **)&src0Device, fileSize, ACL_MEM_MALLOC_HUGE_FIRST); + aclrtMalloc((void **)&src1Device, fileSize, ACL_MEM_MALLOC_HUGE_FIRST); + aclrtMalloc((void **)&dstDevice, fileSize, ACL_MEM_MALLOC_HUGE_FIRST); + + if (!ReadFile((caseDir + "/input1.bin").c_str(), src0FileSize, src0Host, fileSize)) { + std::fprintf(stderr, "[ERROR] failed to read %s/input1.bin\n", caseDir.c_str()); + rc = 1; + } + if (rc == 0 && !ReadFile((caseDir + "/input2.bin").c_str(), src1FileSize, src1Host, fileSize)) { + std::fprintf(stderr, "[ERROR] failed to read %s/input2.bin\n", caseDir.c_str()); + rc = 1; + } + + if (rc == 0) { + aclrtMemcpy(src0Device, fileSize, src0Host, fileSize, ACL_MEMCPY_HOST_TO_DEVICE); + aclrtMemcpy(src1Device, fileSize, src1Host, fileSize, 
ACL_MEMCPY_HOST_TO_DEVICE); + + tc.launch(src0Device, src1Device, dstDevice, stream); + + aclrtSynchronizeStream(stream); + aclrtMemcpy(dstHost, fileSize, dstDevice, fileSize, ACL_MEMCPY_DEVICE_TO_HOST); + } + + if (rc == 0 && !WriteFile((caseDir + "/output.bin").c_str(), dstHost, fileSize)) { + std::fprintf(stderr, "[ERROR] failed to write %s/output.bin\n", caseDir.c_str()); + rc = 1; + } + + if (src0Device != nullptr) + aclrtFree(src0Device); + if (src1Device != nullptr) + aclrtFree(src1Device); + if (dstDevice != nullptr) + aclrtFree(dstDevice); + if (src0Host != nullptr) + aclrtFreeHost(src0Host); + if (src1Host != nullptr) + aclrtFreeHost(src1Host); + if (dstHost != nullptr) + aclrtFreeHost(dstHost); + + if (rc == 0) + std::printf("[INFO] case %s done\n", tc.name); + return rc; +} + +int main(int argc, char *argv[]) { + // Optional case filter: ./tpartadd [case_name] + const char *caseFilter = (argc > 1) ? argv[1] : nullptr; + + int rc = 0; + int deviceId = 0; + aclrtStream stream = nullptr; + + aclInit(nullptr); + if (const char *envDevice = std::getenv("ACL_DEVICE_ID")) { + deviceId = std::atoi(envDevice); + } + aclrtSetDevice(deviceId); + aclrtCreateStream(&stream); + + for (size_t i = 0; i < kNumCases; ++i) { + if (caseFilter != nullptr && std::strcmp(kCases[i].name, caseFilter) != 0) { + continue; + } + int ret = RunCase(kCases[i], deviceId, stream); + if (ret != 0) { + std::fprintf(stderr, "[ERROR] case %s failed\n", kCases[i].name); + rc = 1; + break; + } + } + + if (stream != nullptr) + aclrtDestroyStream(stream); + aclrtResetDevice(deviceId); + aclFinalize(); + + return rc; +} diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tpartadd/tpartadd.pto b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tpartadd/tpartadd.pto new file mode 100644 index 000000000..c23085457 --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tpartadd/tpartadd.pto @@ -0,0 +1,135 @@ +// Copyright (c) 2026 Huawei Technologies Co., Ltd. 
+// This program is free software, you can redistribute it and/or modify it under the terms and conditions of +// CANN Open Software License Agreement Version 2.0 (the "License"). +// Please refer to the License for details. You may not use this file except in compliance with the License. +// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +// See LICENSE in the root of the software repository for the full text of the License. + +// TileLang ST kernels for pto.tpartadd: partial elementwise add with valid region handling. +// Multiple cases with different valid_shape combinations in a single module. +// Compiled by ptoas --enable-insert-sync --enable-tile-op-expand --pto-backend=vpto +// to produce a fatobj object. + +module attributes {pto.target_arch = "a5", pto.kernel_kind = #pto.kernel_kind} { + // Case: f16 8x48 src0 col less (src0 valid 8x16, src1/dst valid 8x48) + func.func @TPARTADD_f16_8x48_src0_col_less(%a_ptr: !pto.ptr, %b_ptr: !pto.ptr, %c_ptr: !pto.ptr) attributes {pto.kernel} { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c8 = arith.constant 8 : index + %c16 = arith.constant 16 : index + %c48 = arith.constant 48 : index + %c384 = arith.constant 384 : index + + %a_view = pto.make_tensor_view %a_ptr, + shape = [%c1, %c1, %c1, %c8, %c48], + strides = [%c384, %c384, %c384, %c48, %c1] + : !pto.tensor_view<1x1x1x8x48xf16> + %b_view = pto.make_tensor_view %b_ptr, + shape = [%c1, %c1, %c1, %c8, %c48], + strides = [%c384, %c384, %c384, %c48, %c1] + : !pto.tensor_view<1x1x1x8x48xf16> + %c_view = pto.make_tensor_view %c_ptr, + shape = [%c1, %c1, %c1, %c8, %c48], + strides = [%c384, %c384, %c384, %c48, %c1] + : !pto.tensor_view<1x1x1x8x48xf16> + + %a_part = pto.partition_view %a_view, + offsets = [%c0, %c0, %c0, %c0, %c0], + sizes = [%c1, %c1, %c1, %c8, %c48] + :
!pto.tensor_view<1x1x1x8x48xf16> -> !pto.partition_tensor_view<1x1x1x8x48xf16> + %b_part = pto.partition_view %b_view, + offsets = [%c0, %c0, %c0, %c0, %c0], + sizes = [%c1, %c1, %c1, %c8, %c48] + : !pto.tensor_view<1x1x1x8x48xf16> -> !pto.partition_tensor_view<1x1x1x8x48xf16> + %c_part = pto.partition_view %c_view, + offsets = [%c0, %c0, %c0, %c0, %c0], + sizes = [%c1, %c1, %c1, %c8, %c48] + : !pto.tensor_view<1x1x1x8x48xf16> -> !pto.partition_tensor_view<1x1x1x8x48xf16> + + // src0: partial valid region (8,16) + %a = pto.alloc_tile + : !pto.tile_buf + // src1: full valid region (8,48) + %b = pto.alloc_tile + : !pto.tile_buf + // dst: full valid region (8,48) + %c = pto.alloc_tile + : !pto.tile_buf + + pto.tload ins(%a_part : !pto.partition_tensor_view<1x1x1x8x48xf16>) + outs(%a : !pto.tile_buf) + pto.tload ins(%b_part : !pto.partition_tensor_view<1x1x1x8x48xf16>) + outs(%b : !pto.tile_buf) + + pto.tpartadd ins(%a, %b : !pto.tile_buf, + !pto.tile_buf) + outs(%c : !pto.tile_buf) + + pto.tstore ins(%c : !pto.tile_buf) + outs(%c_part : !pto.partition_tensor_view<1x1x1x8x48xf16>) + return + } + + // Case: i16 8x48 src1 col less (src0/dst valid 8x48, src1 valid 8x16) + + func.func @TPARTADD_i16_8x48_src1_col_less(%a_ptr: !pto.ptr, %b_ptr: !pto.ptr, %c_ptr: !pto.ptr) attributes {pto.kernel} { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c8 = arith.constant 8 : index + %c16 = arith.constant 16 : index + %c48 = arith.constant 48 : index + %c384 = arith.constant 384 : index + + %a_view = pto.make_tensor_view %a_ptr, + shape = [%c1, %c1, %c1, %c8, %c48], + strides = [%c384, %c384, %c384, %c48, %c1] + : !pto.tensor_view<1x1x1x8x48xi16> + %b_view = pto.make_tensor_view %b_ptr, + shape = [%c1, %c1, %c1, %c8, %c48], + strides = [%c384, %c384, %c384, %c48, %c1] + : !pto.tensor_view<1x1x1x8x48xi16> + %c_view = pto.make_tensor_view %c_ptr, + shape = [%c1, %c1, %c1, %c8, %c48], + strides = [%c384, %c384, %c384, %c48, %c1] + :
!pto.tensor_view<1x1x1x8x48xi16> + + %a_part = pto.partition_view %a_view, + offsets = [%c0, %c0, %c0, %c0, %c0], + sizes = [%c1, %c1, %c1, %c8, %c48] + : !pto.tensor_view<1x1x1x8x48xi16> -> !pto.partition_tensor_view<1x1x1x8x48xi16> + %b_part = pto.partition_view %b_view, + offsets = [%c0, %c0, %c0, %c0, %c0], + sizes = [%c1, %c1, %c1, %c8, %c48] + : !pto.tensor_view<1x1x1x8x48xi16> -> !pto.partition_tensor_view<1x1x1x8x48xi16> + %c_part = pto.partition_view %c_view, + offsets = [%c0, %c0, %c0, %c0, %c0], + sizes = [%c1, %c1, %c1, %c8, %c48] + : !pto.tensor_view<1x1x1x8x48xi16> -> !pto.partition_tensor_view<1x1x1x8x48xi16> + + // src0: full valid region (8,48) + %a = pto.alloc_tile + : !pto.tile_buf + // src1: partial valid region (8,16) + %b = pto.alloc_tile + : !pto.tile_buf + // dst: full valid region (8,48) + %c = pto.alloc_tile + : !pto.tile_buf + + pto.tload ins(%a_part : !pto.partition_tensor_view<1x1x1x8x48xi16>) + outs(%a : !pto.tile_buf) + pto.tload ins(%b_part : !pto.partition_tensor_view<1x1x1x8x48xi16>) + outs(%b : !pto.tile_buf) + + pto.tpartadd ins(%a, %b : !pto.tile_buf, + !pto.tile_buf) + outs(%c : !pto.tile_buf) + + pto.tstore ins(%c : !pto.tile_buf) + outs(%c_part : !pto.partition_tensor_view<1x1x1x8x48xi16>) + return + } + + // NOTE: no further kernels; the i32 64x64 src0 row less case is not part of this smoke module. +} diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tpartmax/CMakeLists.txt b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tpartmax/CMakeLists.txt new file mode 100644 index 000000000..f7cabd4af --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tpartmax/CMakeLists.txt @@ -0,0 +1,9 @@ +# Copyright (c) 2026 Huawei Technologies Co., Ltd. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License.
+# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. + +pto_tilelang_vec_st(tpartmax) diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tpartmax/cases.py b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tpartmax/cases.py new file mode 100644 index 000000000..f1d4a70ba --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tpartmax/cases.py @@ -0,0 +1,160 @@ +#!/usr/bin/python3 +# Copyright (c) 2026 Huawei Technologies Co., Ltd. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. + +# coding=utf-8 + +"""Single source of truth for tpartmax ST test cases. + +Each case defines: + - name: case identifier, used as subdirectory name and by main.cpp kCases[]. + - dtype: numpy dtype (e.g. np.float32). + - shape: (rows, cols) — allocated tile dimensions (same for src0/src1/dst). + - valid_shape: (valid_rows, valid_cols) — src0 valid region (src0_eq_dst scenario). + - src1_vshape: (src1_valid_rows, src1_valid_cols) — src1 valid region. + May be smaller than dst valid region for partial max cases. + - dst_vshape: (dst_valid_rows, dst_valid_cols) — dst valid region. + - eps: tolerance for numpy.allclose (atol and rtol). 
+ +tpartmax semantics: + - If src0_valid == dst_valid: dst[:src1_rows,:src1_cols] = max(src0[:src1_rows,:src1_cols], src1[:src1_rows,:src1_cols]) + dst[src1_rows:,:] = src0[src1_rows:,:] (copy remaining rows) + OR (for col_less) dst[:,:src1_cols] = max(src0[:,:src1_cols], src1[:,:src1_cols]) + dst[:,src1_cols:] = src0[:,src1_cols:] (copy remaining cols) + - If src1_valid == dst_valid: similar logic with src1 as the full operand. + +gen_data.py and compare.py both import this list to avoid redundant definitions. +""" + +import numpy as np + +CASES = [ + { + "name": "f32_2x24_src1_col_less", + "dtype": np.float32, + "shape": (2, 24), + "valid_shape": (2, 24), # src0 valid region (equals dst) + "src1_vshape": (2, 8), # src1 valid region (col_less) + "dst_vshape": (2, 24), # dst valid region + "eps": 1e-6, + }, + # float32 cases from pto-isa + { + "name": "f32_64x64_full", + "dtype": np.float32, + "shape": (64, 64), + "valid_shape": (64, 64), # src0 valid region + "src1_vshape": (64, 64), # src1 valid region (same as dst) + "dst_vshape": (64, 64), # dst valid region + "eps": 1e-6, + }, + { + "name": "f32_128x64_src1_row_less", + "dtype": np.float32, + "shape": (128, 64), + "valid_shape": (128, 64), # src0 valid region (equals dst) + "src1_vshape": (96, 64), # src1 valid region (row_less) + "dst_vshape": (128, 64), # dst valid region + "eps": 1e-6, + }, + { + "name": "f32_95x95_full", + "dtype": np.float32, + "shape": (95, 95), + "valid_shape": (95, 95), # src0 valid region + "src1_vshape": (95, 95), # src1 valid region (same as dst) + "dst_vshape": (95, 95), # dst valid region + "eps": 1e-6, + }, + { + "name": "f32_122x123_complex", + "dtype": np.float32, + "shape": (122, 123), + "valid_shape": (104, 123), # src0 valid region + "src1_vshape": (122, 110), # src1 valid region + "dst_vshape": (122, 123), # dst valid region (src1 rows, src0 cols) + "eps": 1e-6, + }, + # float16 cases from pto-isa + { + "name": "f16_122x123_complex", + "dtype": np.float16, + "shape": (122, 
123), + "valid_shape": (104, 123), # src0 valid region + "src1_vshape": (122, 110), # src1 valid region + "dst_vshape": (122, 123), # dst valid region + "eps": 1e-3, + }, + # int16 cases from pto-isa + { + "name": "i16_122x123_complex", + "dtype": np.int16, + "shape": (122, 123), + "valid_shape": (104, 123), # src0 valid region + "src1_vshape": (122, 110), # src1 valid region + "dst_vshape": (122, 123), # dst valid region + "eps": 0, + }, + # int32 cases from pto-isa + { + "name": "i32_122x123_complex", + "dtype": np.int32, + "shape": (122, 123), + "valid_shape": (104, 123), # src0 valid region + "src1_vshape": (122, 110), # src1 valid region + "dst_vshape": (122, 123), # dst valid region + "eps": 0, + }, + # uint16 cases from pto-isa + { + "name": "u16_122x123_complex", + "dtype": np.uint16, + "shape": (122, 123), + "valid_shape": (104, 123), # src0 valid region + "src1_vshape": (122, 110), # src1 valid region + "dst_vshape": (122, 123), # dst valid region + "eps": 0, + }, + # uint32 cases from pto-isa + { + "name": "u32_122x123_complex", + "dtype": np.uint32, + "shape": (122, 123), + "valid_shape": (104, 123), # src0 valid region + "src1_vshape": (122, 110), # src1 valid region + "dst_vshape": (122, 123), # dst valid region + "eps": 0, + }, + # int8 cases from pto-isa + { + "name": "i8_122x123_complex", + "dtype": np.int8, + "shape": (122, 123), + "valid_shape": (104, 123), # src0 valid region + "src1_vshape": (122, 110), # src1 valid region + "dst_vshape": (122, 123), # dst valid region + "eps": 0, + }, + # uint8 cases from pto-isa + { + "name": "u8_122x123_complex", + "dtype": np.uint8, + "shape": (122, 123), + "valid_shape": (104, 123), # src0 valid region + "src1_vshape": (122, 110), # src1 valid region + "dst_vshape": (122, 123), # dst valid region + "eps": 0, + }, +] + +_SMOKE_CASE_NAMES = ['f32_2x24_src1_col_less', 'f32_64x64_full'] +_SMOKE_CASE_NAME_SET = set(_SMOKE_CASE_NAMES) +_missing = [name for name in _SMOKE_CASE_NAMES if name not in {case["name"] 
for case in CASES}] +if _missing: + raise RuntimeError("unknown smoke case(s): " + ", ".join(_missing)) +CASES = [case for case in CASES if case["name"] in _SMOKE_CASE_NAME_SET] diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tpartmax/compare.py b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tpartmax/compare.py new file mode 100644 index 000000000..296e28e22 --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tpartmax/compare.py @@ -0,0 +1,53 @@ +#!/usr/bin/python3 +# Copyright (c) 2026 Huawei Technologies Co., Ltd. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. 
+ +# coding=utf-8 + +import os +import sys +import numpy as np + +# Add parent directory to path for st_common import +sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) +from st_common import result_cmp, style_fail, style_pass + +from cases import CASES + + +def main(): + case_filter = sys.argv[1] if len(sys.argv) > 1 else None + + all_passed = True + for case in CASES: + if case_filter is not None and case["name"] != case_filter: + continue + + case_dir = case["name"] + shape = case["shape"] + dtype = case["dtype"] + dst_vr, dst_vc = case["dst_vshape"] + + golden = np.fromfile(os.path.join(case_dir, "golden.bin"), dtype=dtype).reshape(shape) + output = np.fromfile(os.path.join(case_dir, "output.bin"), dtype=dtype).reshape(shape) + + # Compare only the dst valid region + ok = result_cmp(golden[:dst_vr, :dst_vc], output[:dst_vr, :dst_vc], case["eps"]) + if ok: + print(style_pass(f"[INFO] {case['name']}: compare passed")) + else: + print(style_fail(f"[ERROR] {case['name']}: compare failed")) + all_passed = False + + if not all_passed: + sys.exit(2) + print(style_pass("[INFO] all cases passed")) + + +if __name__ == "__main__": + main() diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tpartmax/gen_data.py b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tpartmax/gen_data.py new file mode 100644 index 000000000..572c1a65b --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tpartmax/gen_data.py @@ -0,0 +1,127 @@ +#!/usr/bin/python3 +# Copyright (c) 2026 Huawei Technologies Co., Ltd. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. 
+# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. + +# coding=utf-8 + +import numpy as np +import os +import sys + +# Add parent directory to path for st_common import +sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) +from st_common import setup_case_rng, save_case_data + +from cases import CASES + + +def _to_tuple(shape): + """Convert shape to tuple if needed.""" + if isinstance(shape, tuple): + return shape + return tuple(shape) + + +for case in CASES: + setup_case_rng(case) + + dtype = case["dtype"] + shape = _to_tuple(case["shape"]) + src0_valid = _to_tuple(case["valid_shape"]) + src1_valid = _to_tuple(case["src1_vshape"]) + dst_valid = _to_tuple(case["dst_vshape"]) + + rows, cols = shape + src0_vr, src0_vc = src0_valid + src1_vr, src1_vc = src1_valid + dst_vr, dst_vc = dst_valid + + input1 = np.random.randint(1, 10, size=shape).astype(dtype) + input2 = np.random.randint(1, 10, size=shape).astype(dtype) + + golden = np.zeros(shape, dtype=dtype) + + # tpartmax semantics (based on pto-isa TPartBinOps.hpp TCopyPadOp): + # Algorithm: + # 1. dst[:] = Min (padding for max operation) + # 2. dst[0:src0_vr, 0:src0_vc] = src0[0:src0_vr, 0:src0_vc] (copy src0 to dst) + # 3. 
dst[0:src1_vr, 0:src1_vc] = max(dst[0:src1_vr, 0:src1_vc], src1[0:src1_vr, 0:src1_vc]) + # (apply max in src1 valid region) + + src0_eq_dst = (src0_vr == dst_vr and src0_vc == dst_vc) + src1_eq_dst = (src1_vr == dst_vr and src1_vc == dst_vc) + + if src0_eq_dst and src1_eq_dst: + # Full max: both src0 and src1 cover entire dst + golden[:dst_vr, :dst_vc] = np.maximum(input1[:dst_vr, :dst_vc], input2[:dst_vr, :dst_vc]).astype(dtype, copy=False) + elif src0_eq_dst: + # src0 covers dst, src1 is partial + # dst = src0 (copy), then max(dst, src1) in src1 region = max(src0, src1) in src1 region, src0 in rest + golden[:src1_vr, :src1_vc] = np.maximum(input1[:src1_vr, :src1_vc], input2[:src1_vr, :src1_vc]).astype(dtype, copy=False) + if src1_vc < dst_vc: + golden[:src1_vr, src1_vc:dst_vc] = input1[:src1_vr, src1_vc:dst_vc].copy() + if src1_vr < dst_vr: + golden[src1_vr:dst_vr, :dst_vc] = input1[src1_vr:dst_vr, :dst_vc].copy() + elif src1_eq_dst: + # src1 covers dst, src0 is partial + # dst = Min, then copy src0 in src0 region, then max(dst, src1) in src1 region + golden[:src0_vr, :src0_vc] = np.maximum(input1[:src0_vr, :src0_vc], input2[:src0_vr, :src0_vc]).astype(dtype, copy=False) + if src0_vc < dst_vc: + golden[:src0_vr, src0_vc:dst_vc] = input2[:src0_vr, src0_vc:dst_vc].copy() + if src0_vr < dst_vr: + golden[src0_vr:dst_vr, :dst_vc] = input2[src0_vr:dst_vr, :dst_vc].copy() + else: + min_vr = min(src0_vr, src1_vr) + min_vc = min(src0_vc, src1_vc) + + # Region 1: [0:min_vr, 0:min_vc] - overlapping region (both src0 and src1 valid) + golden[:min_vr, :min_vc] = np.maximum(input1[:min_vr, :min_vc], input2[:min_vr, :min_vc]).astype(dtype, copy=False) + + # Region 2: [0:src0_vr, min_vc:src0_vc] if src0_vc > min_vc + if src0_vc > min_vc: + golden[:src0_vr, min_vc:src0_vc] = input1[:src0_vr, min_vc:src0_vc].copy() + + # Region 3: [min_vr:src1_vr, 0:min_vc] if src1_vr > min_vr + if src1_vr > min_vr: + golden[min_vr:src1_vr, :min_vc] = input2[min_vr:src1_vr, :min_vc].copy() + + # 
Region 4: [min_vr:src1_vr, min_vc:src1_vc] if src1_vr > min_vr AND src1_vc > min_vc + if src1_vr > min_vr and src1_vc > min_vc: + golden[min_vr:src1_vr, min_vc:src1_vc] = input2[min_vr:src1_vr, min_vc:src1_vc].copy() + + # Region 5: [0:min_vr, src1_vc:src0_vc] if src0_vc > src1_vc + if src0_vc > src1_vc and min_vr > 0: + # Already handled in Region 2 if rows are [0:src0_vr] + pass # Region 2 covers this + + if src1_vr > src0_vr and src0_vc > src1_vc: + # Region [src0_vr:src1_vr, src1_vc:src0_vc] = Min (neither covers) + # This is correct for tpartmax - padding value is Min + # For floats, use the most negative finite value (np.finfo(dtype).min, not -np.inf). For integers, use np.iinfo(dtype).min. + if dtype == np.float32: + min_val = np.finfo(np.float32).min + elif dtype == np.float16: + min_val = np.finfo(np.float16).min + elif dtype == np.int8: + min_val = np.iinfo(np.int8).min + elif dtype == np.uint8: + min_val = np.iinfo(np.uint8).min + elif dtype == np.int16: + min_val = np.iinfo(np.int16).min + elif dtype == np.uint16: + min_val = np.iinfo(np.uint16).min + elif dtype == np.int32: + min_val = np.iinfo(np.int32).min + elif dtype == np.uint32: + min_val = np.iinfo(np.uint32).min + else: + min_val = np.iinfo(dtype).min + golden[src0_vr:src1_vr, src1_vc:src0_vc] = min_val + + save_case_data(case["name"], {"input1": input1, "input2": input2, "golden": golden}) + print(f"[INFO] gen_data: {case['name']} shape={shape} src0_valid={src0_valid} src1_valid={src1_valid} dst_valid={dst_valid} dtype={dtype.__name__}") diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tpartmax/launch.cpp b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tpartmax/launch.cpp new file mode 100644 index 000000000..c448c25e6 --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tpartmax/launch.cpp @@ -0,0 +1,28 @@ +// Copyright (c) 2026 Huawei Technologies Co., Ltd.
+// This program is free software, you can redistribute it and/or modify it under the terms and conditions of +// CANN Open Software License Agreement Version 2.0 (the "License"). +// Please refer to the License for details. You may not use this file except in compliance with the License. +// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +// See LICENSE in the root of the software repository for the full text of the License. + +#include + +#ifndef AICORE +#define AICORE [aicore] +#endif + +// Case: f32 64x64 full + +extern "C" __global__ AICORE void TPARTMAX_f32_64x64_full(__gm__ float *a, __gm__ float *b, __gm__ float *c); +extern "C" __global__ AICORE void TPARTMAX_f32_2x24_src1_col_less(__gm__ float *a, __gm__ float *b, __gm__ float *c); + +void LaunchTPARTMAX_f32_2x24_src1_col_less(float *a, float *b, float *c, void *stream) { + TPARTMAX_f32_2x24_src1_col_less<<<1, nullptr, stream>>>((__gm__ float *)a, (__gm__ float *)b, (__gm__ float *)c); +} + + + +void LaunchTPARTMAX_f32_64x64_full(float *a, float *b, float *c, void *stream) { + TPARTMAX_f32_64x64_full<<<1, nullptr, stream>>>((__gm__ float *)a, (__gm__ float *)b, (__gm__ float *)c); +} diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tpartmax/main.cpp b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tpartmax/main.cpp new file mode 100644 index 000000000..87901b9ec --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tpartmax/main.cpp @@ -0,0 +1,215 @@ +// Copyright (c) 2026 Huawei Technologies Co., Ltd. +// This program is free software, you can redistribute it and/or modify it under the terms and conditions of +// CANN Open Software License Agreement Version 2.0 (the "License"). +// Please refer to the License for details. You may not use this file except in compliance with the License. 
+// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +// See LICENSE in the root of the software repository for the full text of the License. + +// Host driver for TileLang tpartmax ST — case-table driven. +// Each case launches a different kernel variant, reads/writes from per-case subdirectory. +// Numerical comparison is done externally by compare.py. + +#include "acl/acl.h" +#include "test_common.h" +#include +#include +#include +#include +#include +#include + +using namespace PtoTestCommon; + +// Kernel launch wrappers (defined in launch.cpp) +void LaunchTPARTMAX_f32_64x64_full(float *a, float *b, float *c, void *stream); +void LaunchTPARTMAX_f32_2x24_src1_col_less(float *a, float *b, float *c, void *stream); +void LaunchTPARTMAX_f32_95x95_full(float *a, float *b, float *c, void *stream); +void LaunchTPARTMAX_f16_122x123_complex(uint16_t *a, uint16_t *b, uint16_t *c, void *stream); +void LaunchTPARTMAX_i32_122x123_complex(int32_t *a, int32_t *b, int32_t *c, void *stream); +void LaunchTPARTMAX_u32_122x123_complex(uint32_t *a, uint32_t *b, uint32_t *c, void *stream); +void LaunchTPARTMAX_u8_122x123_complex(uint8_t *a, uint8_t *b, uint8_t *c, void *stream); + +using LaunchFn = void (*)(void *, void *, void *, void *); + +struct TestCase { + const char *name; + LaunchFn launch; + size_t rows; // allocated tile rows + size_t cols; // allocated tile cols (valid cols) + size_t src0ValidRows; // src0 effective rows + size_t src0ValidCols; // src0 effective cols + size_t src1ValidRows; // src1 effective rows + size_t src1ValidCols; // src1 effective cols + size_t dstValidRows; // dst effective rows + size_t dstValidCols; // dst effective cols + size_t elemSize; // bytes per element +}; + +static const TestCase kCases[] = { +{"f32_2x24_src1_col_less", reinterpret_cast(LaunchTPARTMAX_f32_2x24_src1_col_less), 2, 
24, 2, 24, 2, 8, 2, 24, sizeof(float)}, +{"f32_64x64_full", reinterpret_cast(LaunchTPARTMAX_f32_64x64_full), 64, 64, 64, 64, 64, 64, 64, 64, sizeof(float)}, +}; +static constexpr size_t kNumCases = sizeof(kCases) / sizeof(kCases[0]); + +// Calculate aligned cols for 32-byte alignment +static size_t CalcAlignedCols(size_t cols, size_t elemSize) { + size_t totalBytes = cols * elemSize; + size_t alignedBytes = ((totalBytes + 31) / 32) * 32; + return alignedBytes / elemSize; +} + +// Helper to pad data with stride +static void PadDataWithStride(const void *src, void *dst, size_t rows, size_t cols, + size_t alignedCols, size_t elemSize) { + const char *srcPtr = static_cast(src); + char *dstPtr = static_cast(dst); + for (size_t r = 0; r < rows; ++r) { + memcpy(dstPtr + r * alignedCols * elemSize, + srcPtr + r * cols * elemSize, + cols * elemSize); + // Zero-fill padding region (optional, data will be overwritten by kernel) + memset(dstPtr + r * alignedCols * elemSize + cols * elemSize, + 0, + (alignedCols - cols) * elemSize); + } +} + +// Helper to unpad data (extract valid cols) +static void UnpadDataWithStride(const void *src, void *dst, size_t rows, size_t cols, + size_t alignedCols, size_t elemSize) { + const char *srcPtr = static_cast(src); + char *dstPtr = static_cast(dst); + for (size_t r = 0; r < rows; ++r) { + memcpy(dstPtr + r * cols * elemSize, + srcPtr + r * alignedCols * elemSize, + cols * elemSize); + } +} + +static int RunCase(const TestCase &tc, int deviceId, aclrtStream stream) { + int rc = 0; + const size_t elemCount = tc.rows * tc.cols; + const size_t fileSize = elemCount * tc.elemSize; + const size_t alignedCols = CalcAlignedCols(tc.cols, tc.elemSize); + const size_t paddedSize = tc.rows * alignedCols * tc.elemSize; + + std::printf("[INFO] === case: %s (shape=%zux%zu, src0_valid=%zux%zu, src1_valid=%zux%zu, dst_valid=%zux%zu, alignedCols=%zu) ===\n", + tc.name, tc.rows, tc.cols, tc.src0ValidRows, tc.src0ValidCols, + tc.src1ValidRows, tc.src1ValidCols, 
tc.dstValidRows, tc.dstValidCols, alignedCols); + + // Per-case data directory + std::string caseDir = std::string("./") + tc.name; + + void *src0HostOrig = nullptr, *src1HostOrig = nullptr, *dstHostOrig = nullptr; + void *src0Host = nullptr, *src1Host = nullptr, *dstHost = nullptr; + void *src0Device = nullptr, *src1Device = nullptr, *dstDevice = nullptr; + + // Allocate host buffers for original data (contiguous) + aclrtMallocHost((void **)(&src0HostOrig), fileSize); + aclrtMallocHost((void **)(&src1HostOrig), fileSize); + aclrtMallocHost((void **)(&dstHostOrig), fileSize); + + // Allocate host buffers for padded data + aclrtMallocHost((void **)(&src0Host), paddedSize); + aclrtMallocHost((void **)(&src1Host), paddedSize); + aclrtMallocHost((void **)(&dstHost), paddedSize); + + // Allocate device buffers with padded size + aclrtMalloc((void **)&src0Device, paddedSize, ACL_MEM_MALLOC_HUGE_FIRST); + aclrtMalloc((void **)&src1Device, paddedSize, ACL_MEM_MALLOC_HUGE_FIRST); + aclrtMalloc((void **)&dstDevice, paddedSize, ACL_MEM_MALLOC_HUGE_FIRST); + + if (rc == 0) { + size_t src0FileSize = fileSize; + size_t src1FileSize = fileSize; + if (!ReadFile((caseDir + "/input1.bin").c_str(), src0FileSize, src0HostOrig, fileSize)) { + std::fprintf(stderr, "[ERROR] failed to read %s/input1.bin\n", caseDir.c_str()); + rc = 1; + } + if (rc == 0 && !ReadFile((caseDir + "/input2.bin").c_str(), src1FileSize, src1HostOrig, fileSize)) { + std::fprintf(stderr, "[ERROR] failed to read %s/input2.bin\n", caseDir.c_str()); + rc = 1; + } + } + + if (rc == 0) { + // Pad input data with stride + PadDataWithStride(src0HostOrig, src0Host, tc.rows, tc.cols, alignedCols, tc.elemSize); + PadDataWithStride(src1HostOrig, src1Host, tc.rows, tc.cols, alignedCols, tc.elemSize); + + aclrtMemcpy(src0Device, paddedSize, src0Host, paddedSize, ACL_MEMCPY_HOST_TO_DEVICE); + aclrtMemcpy(src1Device, paddedSize, src1Host, paddedSize, ACL_MEMCPY_HOST_TO_DEVICE); + + tc.launch(src0Device, src1Device, dstDevice, 
stream); + + aclrtSynchronizeStream(stream); + aclrtMemcpy(dstHost, paddedSize, dstDevice, paddedSize, ACL_MEMCPY_DEVICE_TO_HOST); + + // Unpad output data + UnpadDataWithStride(dstHost, dstHostOrig, tc.rows, tc.cols, alignedCols, tc.elemSize); + } + + if (rc == 0 && !WriteFile((caseDir + "/output.bin").c_str(), dstHostOrig, fileSize)) { + std::fprintf(stderr, "[ERROR] failed to write %s/output.bin\n", caseDir.c_str()); + rc = 1; + } + + if (src0Device != nullptr) + aclrtFree(src0Device); + if (src1Device != nullptr) + aclrtFree(src1Device); + if (dstDevice != nullptr) + aclrtFree(dstDevice); + if (src0Host != nullptr) + aclrtFreeHost(src0Host); + if (src1Host != nullptr) + aclrtFreeHost(src1Host); + if (dstHost != nullptr) + aclrtFreeHost(dstHost); + if (src0HostOrig != nullptr) + aclrtFreeHost(src0HostOrig); + if (src1HostOrig != nullptr) + aclrtFreeHost(src1HostOrig); + if (dstHostOrig != nullptr) + aclrtFreeHost(dstHostOrig); + + if (rc == 0) + std::printf("[INFO] case %s done\n", tc.name); + return rc; +} + +int main(int argc, char *argv[]) { + // Optional case filter: ./tpartmax [case_name] + const char *caseFilter = (argc > 1) ? 
argv[1] : nullptr; + + int rc = 0; + int deviceId = 0; + aclrtStream stream = nullptr; + + aclInit(nullptr); + if (const char *envDevice = std::getenv("ACL_DEVICE_ID")) { + deviceId = std::atoi(envDevice); + } + aclrtSetDevice(deviceId); + aclrtCreateStream(&stream); + + for (size_t i = 0; i < kNumCases; ++i) { + if (caseFilter != nullptr && std::strcmp(kCases[i].name, caseFilter) != 0) { + continue; + } + int ret = RunCase(kCases[i], deviceId, stream); + if (ret != 0) { + std::fprintf(stderr, "[ERROR] case %s failed\n", kCases[i].name); + rc = 1; + break; + } + } + + if (stream != nullptr) + aclrtDestroyStream(stream); + aclrtResetDevice(deviceId); + aclFinalize(); + + return rc; +} diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tpartmax/tpartmax.pto b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tpartmax/tpartmax.pto new file mode 100644 index 000000000..4e8b51816 --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tpartmax/tpartmax.pto @@ -0,0 +1,132 @@ +// Copyright (c) 2026 Huawei Technologies Co., Ltd. +// This program is free software, you can redistribute it and/or modify it under the terms and conditions of +// CANN Open Software License Agreement Version 2.0 (the "License"). +// Please refer to the License for details. You may not use this file except in compliance with the License. +// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +// See LICENSE in the root of the software repository for the full text of the License. + +// TileLang ST kernels for pto.tpartmax: partial elementwise max with valid region handling. +// Multiple cases with different valid_shape combinations in a single module. +// Compiled by ptoas --enable-insert-sync --enable-tile-op-expand --pto-backend=vpto +// to produce a fatobj object. 
+ +module attributes {pto.target_arch = "a5", pto.kernel_kind = #pto.kernel_kind} { + // Case: f32_2x24_src1_col_less (src0 valid 2x24, src1 valid 2x8, dst valid 2x24) + func.func @TPARTMAX_f32_2x24_src1_col_less(%a_ptr: !pto.ptr, %b_ptr: !pto.ptr, %c_ptr: !pto.ptr) attributes {pto.kernel} { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c2 = arith.constant 2 : index + %c24 = arith.constant 24 : index + %c48 = arith.constant 48 : index + + %a_view = pto.make_tensor_view %a_ptr, + shape = [%c1, %c1, %c1, %c2, %c24], + strides = [%c48, %c48, %c48, %c24, %c1] + : !pto.tensor_view<1x1x1x2x24xf32> + %b_view = pto.make_tensor_view %b_ptr, + shape = [%c1, %c1, %c1, %c2, %c24], + strides = [%c48, %c48, %c48, %c24, %c1] + : !pto.tensor_view<1x1x1x2x24xf32> + %c_view = pto.make_tensor_view %c_ptr, + shape = [%c1, %c1, %c1, %c2, %c24], + strides = [%c48, %c48, %c48, %c24, %c1] + : !pto.tensor_view<1x1x1x2x24xf32> + + %a_part = pto.partition_view %a_view, + offsets = [%c0, %c0, %c0, %c0, %c0], + sizes = [%c1, %c1, %c1, %c2, %c24] + : !pto.tensor_view<1x1x1x2x24xf32> -> !pto.partition_tensor_view<1x1x1x2x24xf32> + %b_part = pto.partition_view %b_view, + offsets = [%c0, %c0, %c0, %c0, %c0], + sizes = [%c1, %c1, %c1, %c2, %c24] + : !pto.tensor_view<1x1x1x2x24xf32> -> !pto.partition_tensor_view<1x1x1x2x24xf32> + %c_part = pto.partition_view %c_view, + offsets = [%c0, %c0, %c0, %c0, %c0], + sizes = [%c1, %c1, %c1, %c2, %c24] + : !pto.tensor_view<1x1x1x2x24xf32> -> !pto.partition_tensor_view<1x1x1x2x24xf32> + + // src0: valid region (2,24) + %a = pto.alloc_tile + : !pto.tile_buf + // src1: valid region (2,8) + %b = pto.alloc_tile + : !pto.tile_buf + // dst: valid region (2,24) + %c = pto.alloc_tile + : !pto.tile_buf + + pto.tload ins(%a_part : !pto.partition_tensor_view<1x1x1x2x24xf32>) + outs(%a : !pto.tile_buf) + pto.tload ins(%b_part : !pto.partition_tensor_view<1x1x1x2x24xf32>) + outs(%b : !pto.tile_buf) + + pto.tpartmax ins(%a, %b : !pto.tile_buf, + 
!pto.tile_buf) + outs(%c : !pto.tile_buf) + + pto.tstore ins(%c : !pto.tile_buf) + outs(%c_part : !pto.partition_tensor_view<1x1x1x2x24xf32>) + return + } + + // Case: f32_64x64_full (src0 valid 64x64, src1 valid 64x64, dst valid 64x64) + + func.func @TPARTMAX_f32_64x64_full(%a_ptr: !pto.ptr, %b_ptr: !pto.ptr, %c_ptr: !pto.ptr) attributes {pto.kernel} { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c64 = arith.constant 64 : index + %c4096 = arith.constant 4096 : index + + %a_view = pto.make_tensor_view %a_ptr, + shape = [%c1, %c1, %c1, %c64, %c64], + strides = [%c4096, %c4096, %c4096, %c64, %c1] + : !pto.tensor_view<1x1x1x64x64xf32> + %b_view = pto.make_tensor_view %b_ptr, + shape = [%c1, %c1, %c1, %c64, %c64], + strides = [%c4096, %c4096, %c4096, %c64, %c1] + : !pto.tensor_view<1x1x1x64x64xf32> + %c_view = pto.make_tensor_view %c_ptr, + shape = [%c1, %c1, %c1, %c64, %c64], + strides = [%c4096, %c4096, %c4096, %c64, %c1] + : !pto.tensor_view<1x1x1x64x64xf32> + + %a_part = pto.partition_view %a_view, + offsets = [%c0, %c0, %c0, %c0, %c0], + sizes = [%c1, %c1, %c1, %c64, %c64] + : !pto.tensor_view<1x1x1x64x64xf32> -> !pto.partition_tensor_view<1x1x1x64x64xf32> + %b_part = pto.partition_view %b_view, + offsets = [%c0, %c0, %c0, %c0, %c0], + sizes = [%c1, %c1, %c1, %c64, %c64] + : !pto.tensor_view<1x1x1x64x64xf32> -> !pto.partition_tensor_view<1x1x1x64x64xf32> + %c_part = pto.partition_view %c_view, + offsets = [%c0, %c0, %c0, %c0, %c0], + sizes = [%c1, %c1, %c1, %c64, %c64] + : !pto.tensor_view<1x1x1x64x64xf32> -> !pto.partition_tensor_view<1x1x1x64x64xf32> + + // src0: valid region (64,64) + %a = pto.alloc_tile + : !pto.tile_buf + // src1: valid region (64,64) + %b = pto.alloc_tile + : !pto.tile_buf + // dst: valid region (64,64) + %c = pto.alloc_tile + : !pto.tile_buf + + pto.tload ins(%a_part : !pto.partition_tensor_view<1x1x1x64x64xf32>) + outs(%a : !pto.tile_buf) + pto.tload ins(%b_part : 
!pto.partition_tensor_view<1x1x1x64x64xf32>) + outs(%b : !pto.tile_buf) + + pto.tpartmax ins(%a, %b : !pto.tile_buf, + !pto.tile_buf) + outs(%c : !pto.tile_buf) + + pto.tstore ins(%c : !pto.tile_buf) + outs(%c_part : !pto.partition_tensor_view<1x1x1x64x64xf32>) + return + } + + // Case: f32_2x24_src1_col_less (src0 valid 2x24, src1 valid 2x8, dst valid 2x24) +} diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tpartmin/CMakeLists.txt b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tpartmin/CMakeLists.txt new file mode 100644 index 000000000..e1cc35877 --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tpartmin/CMakeLists.txt @@ -0,0 +1,9 @@ +# Copyright (c) 2026 Huawei Technologies Co., Ltd. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. + +pto_tilelang_vec_st(tpartmin) diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tpartmin/cases.py b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tpartmin/cases.py new file mode 100644 index 000000000..d570b8a8a --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tpartmin/cases.py @@ -0,0 +1,160 @@ +#!/usr/bin/python3 +# Copyright (c) 2026 Huawei Technologies Co., Ltd. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. 
+# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. + +# coding=utf-8 + +"""Single source of truth for tpartmin ST test cases. + +Each case defines: + - name: case identifier, used as subdirectory name and by main.cpp kCases[]. + - dtype: numpy dtype (e.g. np.float32). + - shape: (rows, cols) — allocated tile dimensions (same for src0/src1/dst). + - valid_shape: (valid_rows, valid_cols) — src0 valid region (src0_eq_dst scenario). + - src1_vshape: (src1_valid_rows, src1_valid_cols) — src1 valid region. + May be smaller than dst valid region for partial min cases. + - dst_vshape: (dst_valid_rows, dst_valid_cols) — dst valid region. + - eps: tolerance for numpy.allclose (atol and rtol). + +tpartmin semantics: + - If src0_valid == dst_valid: dst[:src1_rows,:src1_cols] = min(src0[:src1_rows,:src1_cols], src1[:src1_rows,:src1_cols]) + dst[src1_rows:,:] = src0[src1_rows:,:] (copy remaining rows) + OR (for col_less) dst[:,:src1_cols] = min(src0[:,:src1_cols], src1[:,:src1_cols]) + dst[:,src1_cols:] = src0[:,src1_cols:] (copy remaining cols) + - If src1_valid == dst_valid: similar logic with src1 as the full operand. + +gen_data.py and compare.py both import this list to avoid redundant definitions. 
+""" + +import numpy as np + +CASES = [ + { + "name": "f32_2x24_src1_col_less", + "dtype": np.float32, + "shape": (2, 24), + "valid_shape": (2, 24), # src0 valid region (equals dst) + "src1_vshape": (2, 8), # src1 valid region (col_less) + "dst_vshape": (2, 24), # dst valid region + "eps": 1e-6, + }, + # float32 cases from pto-isa + { + "name": "f32_64x64_full", + "dtype": np.float32, + "shape": (64, 64), + "valid_shape": (64, 64), # src0 valid region + "src1_vshape": (64, 64), # src1 valid region (same as dst) + "dst_vshape": (64, 64), # dst valid region + "eps": 1e-6, + }, + { + "name": "f32_128x64_src1_row_less", + "dtype": np.float32, + "shape": (128, 64), + "valid_shape": (128, 64), # src0 valid region (equals dst) + "src1_vshape": (96, 64), # src1 valid region (row_less) + "dst_vshape": (128, 64), # dst valid region + "eps": 1e-6, + }, + { + "name": "f32_95x95_full", + "dtype": np.float32, + "shape": (95, 95), + "valid_shape": (95, 95), # src0 valid region + "src1_vshape": (95, 95), # src1 valid region (same as dst) + "dst_vshape": (95, 95), # dst valid region + "eps": 1e-6, + }, + { + "name": "f32_122x123_complex", + "dtype": np.float32, + "shape": (122, 123), + "valid_shape": (104, 123), # src0 valid region + "src1_vshape": (122, 110), # src1 valid region + "dst_vshape": (122, 123), # dst valid region (src1 rows, src0 cols) + "eps": 1e-6, + }, + # float16 cases from pto-isa + { + "name": "f16_122x123_complex", + "dtype": np.float16, + "shape": (122, 123), + "valid_shape": (104, 123), # src0 valid region + "src1_vshape": (122, 110), # src1 valid region + "dst_vshape": (122, 123), # dst valid region + "eps": 1e-3, + }, + # int16 cases from pto-isa + { + "name": "i16_122x123_complex", + "dtype": np.int16, + "shape": (122, 123), + "valid_shape": (104, 123), # src0 valid region + "src1_vshape": (122, 110), # src1 valid region + "dst_vshape": (122, 123), # dst valid region + "eps": 0, + }, + # int32 cases from pto-isa + { + "name": "i32_122x123_complex", + 
"dtype": np.int32, + "shape": (122, 123), + "valid_shape": (104, 123), # src0 valid region + "src1_vshape": (122, 110), # src1 valid region + "dst_vshape": (122, 123), # dst valid region + "eps": 0, + }, + # uint16 cases from pto-isa + { + "name": "u16_122x123_complex", + "dtype": np.uint16, + "shape": (122, 123), + "valid_shape": (104, 123), # src0 valid region + "src1_vshape": (122, 110), # src1 valid region + "dst_vshape": (122, 123), # dst valid region + "eps": 0, + }, + # uint32 cases from pto-isa + { + "name": "u32_122x123_complex", + "dtype": np.uint32, + "shape": (122, 123), + "valid_shape": (104, 123), # src0 valid region + "src1_vshape": (122, 110), # src1 valid region + "dst_vshape": (122, 123), # dst valid region + "eps": 0, + }, + # int8 cases from pto-isa + { + "name": "i8_122x123_complex", + "dtype": np.int8, + "shape": (122, 123), + "valid_shape": (104, 123), # src0 valid region + "src1_vshape": (122, 110), # src1 valid region + "dst_vshape": (122, 123), # dst valid region + "eps": 0, + }, + # uint8 cases from pto-isa + { + "name": "u8_122x123_complex", + "dtype": np.uint8, + "shape": (122, 123), + "valid_shape": (104, 123), # src0 valid region + "src1_vshape": (122, 110), # src1 valid region + "dst_vshape": (122, 123), # dst valid region + "eps": 0, + }, +] + +_SMOKE_CASE_NAMES = ['f32_2x24_src1_col_less', 'f32_64x64_full'] +_SMOKE_CASE_NAME_SET = set(_SMOKE_CASE_NAMES) +_missing = [name for name in _SMOKE_CASE_NAMES if name not in {case["name"] for case in CASES}] +if _missing: + raise RuntimeError("unknown smoke case(s): " + ", ".join(_missing)) +CASES = [case for case in CASES if case["name"] in _SMOKE_CASE_NAME_SET] diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tpartmin/compare.py b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tpartmin/compare.py new file mode 100644 index 000000000..296e28e22 --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tpartmin/compare.py @@ -0,0 +1,53 @@ +#!/usr/bin/python3 +# Copyright (c) 
2026 Huawei Technologies Co., Ltd. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. + +# coding=utf-8 + +import os +import sys +import numpy as np + +# Add parent directory to path for st_common import +sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) +from st_common import result_cmp, style_fail, style_pass + +from cases import CASES + + +def main(): + case_filter = sys.argv[1] if len(sys.argv) > 1 else None + + all_passed = True + for case in CASES: + if case_filter is not None and case["name"] != case_filter: + continue + + case_dir = case["name"] + shape = case["shape"] + dtype = case["dtype"] + dst_vr, dst_vc = case["dst_vshape"] + + golden = np.fromfile(os.path.join(case_dir, "golden.bin"), dtype=dtype).reshape(shape) + output = np.fromfile(os.path.join(case_dir, "output.bin"), dtype=dtype).reshape(shape) + + # Compare only the dst valid region + ok = result_cmp(golden[:dst_vr, :dst_vc], output[:dst_vr, :dst_vc], case["eps"]) + if ok: + print(style_pass(f"[INFO] {case['name']}: compare passed")) + else: + print(style_fail(f"[ERROR] {case['name']}: compare failed")) + all_passed = False + + if not all_passed: + sys.exit(2) + print(style_pass("[INFO] all cases passed")) + + +if __name__ == "__main__": + main() diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tpartmin/gen_data.py b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tpartmin/gen_data.py new file mode 100644 index 
000000000..2dcb2866d --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tpartmin/gen_data.py @@ -0,0 +1,127 @@ +#!/usr/bin/python3 +# Copyright (c) 2026 Huawei Technologies Co., Ltd. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. + +# coding=utf-8 + +import numpy as np +import os +import sys + +# Add parent directory to path for st_common import +sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) +from st_common import setup_case_rng, save_case_data + +from cases import CASES + + +def _to_tuple(shape): + """Convert shape to tuple if needed.""" + if isinstance(shape, tuple): + return shape + return tuple(shape) + + +for case in CASES: + setup_case_rng(case) + + dtype = case["dtype"] + shape = _to_tuple(case["shape"]) + src0_valid = _to_tuple(case["valid_shape"]) + src1_valid = _to_tuple(case["src1_vshape"]) + dst_valid = _to_tuple(case["dst_vshape"]) + + rows, cols = shape + src0_vr, src0_vc = src0_valid + src1_vr, src1_vc = src1_valid + dst_vr, dst_vc = dst_valid + + input1 = np.random.randint(1, 10, size=shape).astype(dtype) + input2 = np.random.randint(1, 10, size=shape).astype(dtype) + + golden = np.zeros(shape, dtype=dtype) + + # tpartmin semantics (based on pto-isa TPartBinOps.hpp TCopyPadOp): + # Algorithm: + # 1. dst[:] = Max (padding for min operation) + # 2. dst[0:src0_vr, 0:src0_vc] = src0[0:src0_vr, 0:src0_vc] (copy src0 to dst) + # 3. 
dst[0:src1_vr, 0:src1_vc] = min(dst[0:src1_vr, 0:src1_vc], src1[0:src1_vr, 0:src1_vc]) + # (apply min in src1 valid region) + + src0_eq_dst = (src0_vr == dst_vr and src0_vc == dst_vc) + src1_eq_dst = (src1_vr == dst_vr and src1_vc == dst_vc) + + if src0_eq_dst and src1_eq_dst: + # Full min: both src0 and src1 cover entire dst + golden[:dst_vr, :dst_vc] = np.minimum(input1[:dst_vr, :dst_vc], input2[:dst_vr, :dst_vc]).astype(dtype, copy=False) + elif src0_eq_dst: + # src0 covers dst, src1 is partial + # dst = src0 (copy), then min(dst, src1) in src1 region = min(src0, src1) in src1 region, src0 in rest + golden[:src1_vr, :src1_vc] = np.minimum(input1[:src1_vr, :src1_vc], input2[:src1_vr, :src1_vc]).astype(dtype, copy=False) + if src1_vc < dst_vc: + golden[:src1_vr, src1_vc:dst_vc] = input1[:src1_vr, src1_vc:dst_vc].copy() + if src1_vr < dst_vr: + golden[src1_vr:dst_vr, :dst_vc] = input1[src1_vr:dst_vr, :dst_vc].copy() + elif src1_eq_dst: + # src1 covers dst, src0 is partial + # dst = Max, then copy src0 in src0 region, then min(dst, src1) in src1 region + golden[:src0_vr, :src0_vc] = np.minimum(input1[:src0_vr, :src0_vc], input2[:src0_vr, :src0_vc]).astype(dtype, copy=False) + if src0_vc < dst_vc: + golden[:src0_vr, src0_vc:dst_vc] = input2[:src0_vr, src0_vc:dst_vc].copy() + if src0_vr < dst_vr: + golden[src0_vr:dst_vr, :dst_vc] = input2[src0_vr:dst_vr, :dst_vc].copy() + else: + min_vr = min(src0_vr, src1_vr) + min_vc = min(src0_vc, src1_vc) + + # Region 1: [0:min_vr, 0:min_vc] - overlapping region (both src0 and src1 valid) + golden[:min_vr, :min_vc] = np.minimum(input1[:min_vr, :min_vc], input2[:min_vr, :min_vc]).astype(dtype, copy=False) + + # Region 2: [0:src0_vr, min_vc:src0_vc] if src0_vc > min_vc + if src0_vc > min_vc: + golden[:src0_vr, min_vc:src0_vc] = input1[:src0_vr, min_vc:src0_vc].copy() + + # Region 3: [min_vr:src1_vr, 0:min_vc] if src1_vr > min_vr + if src1_vr > min_vr: + golden[min_vr:src1_vr, :min_vc] = input2[min_vr:src1_vr, :min_vc].copy() + + # 
Region 4: [min_vr:src1_vr, min_vc:src1_vc] if src1_vr > min_vr AND src1_vc > min_vc + if src1_vr > min_vr and src1_vc > min_vc: + golden[min_vr:src1_vr, min_vc:src1_vc] = input2[min_vr:src1_vr, min_vc:src1_vc].copy() + + # Region 5: [0:min_vr, src1_vc:src0_vc] if src0_vc > src1_vc + if src0_vc > src1_vc and min_vr > 0: + # Already handled in Region 2 if rows are [0:src0_vr] + pass # Region 2 covers this + + if src1_vr > src0_vr and src0_vc > src1_vc: + # Region [src0_vr:src1_vr, src1_vc:src0_vc] = Max (neither covers) + # This is correct for tpartmin - padding value is Max + # For floats, use the largest finite value (np.finfo(dtype).max, not np.inf). For integers, use the dtype max. + if dtype == np.float32: + max_val = np.finfo(np.float32).max + elif dtype == np.float16: + max_val = np.finfo(np.float16).max + elif dtype == np.int8: + max_val = np.iinfo(np.int8).max + elif dtype == np.uint8: + max_val = np.iinfo(np.uint8).max + elif dtype == np.int16: + max_val = np.iinfo(np.int16).max + elif dtype == np.uint16: + max_val = np.iinfo(np.uint16).max + elif dtype == np.int32: + max_val = np.iinfo(np.int32).max + elif dtype == np.uint32: + max_val = np.iinfo(np.uint32).max + else: + max_val = np.iinfo(dtype).max + golden[src0_vr:src1_vr, src1_vc:src0_vc] = max_val + + save_case_data(case["name"], {"input1": input1, "input2": input2, "golden": golden}) + print(f"[INFO] gen_data: {case['name']} shape={shape} src0_valid={src0_valid} src1_valid={src1_valid} dst_valid={dst_valid} dtype={dtype.__name__}") diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tpartmin/launch.cpp b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tpartmin/launch.cpp new file mode 100644 index 000000000..cf5752e0d --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tpartmin/launch.cpp @@ -0,0 +1,28 @@ +// Copyright (c) 2026 Huawei Technologies Co., Ltd. +// This program is free software, you can redistribute it and/or modify it under the terms and conditions of +// CANN Open Software License Agreement Version 2.0 (the "License"). 
+// Please refer to the License for details. You may not use this file except in compliance with the License. +// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +// See LICENSE in the root of the software repository for the full text of the License. + +#include + +#ifndef AICORE +#define AICORE [aicore] +#endif + +// Case: f32 64x64 full + +extern "C" __global__ AICORE void TPARTMIN_f32_64x64_full(__gm__ float *a, __gm__ float *b, __gm__ float *c); +extern "C" __global__ AICORE void TPARTMIN_f32_2x24_src1_col_less(__gm__ float *a, __gm__ float *b, __gm__ float *c); + +void LaunchTPARTMIN_f32_64x64_full(float *a, float *b, float *c, void *stream) { + TPARTMIN_f32_64x64_full<<<1, nullptr, stream>>>((__gm__ float *)a, (__gm__ float *)b, (__gm__ float *)c); +} + + + +void LaunchTPARTMIN_f32_2x24_src1_col_less(float *a, float *b, float *c, void *stream) { + TPARTMIN_f32_2x24_src1_col_less<<<1, nullptr, stream>>>((__gm__ float *)a, (__gm__ float *)b, (__gm__ float *)c); +} diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tpartmin/main.cpp b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tpartmin/main.cpp new file mode 100644 index 000000000..ff068c9a4 --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tpartmin/main.cpp @@ -0,0 +1,215 @@ +// Copyright (c) 2026 Huawei Technologies Co., Ltd. +// This program is free software, you can redistribute it and/or modify it under the terms and conditions of +// CANN Open Software License Agreement Version 2.0 (the "License"). +// Please refer to the License for details. You may not use this file except in compliance with the License. +// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. 
+// See LICENSE in the root of the software repository for the full text of the License. + +// Host driver for TileLang tpartmin ST — case-table driven. +// Each case launches a different kernel variant, reads/writes from per-case subdirectory. +// Numerical comparison is done externally by compare.py. + +#include "acl/acl.h" +#include "test_common.h" +#include +#include +#include +#include +#include +#include + +using namespace PtoTestCommon; + +// Kernel launch wrappers (defined in launch.cpp) +void LaunchTPARTMIN_f32_64x64_full(float *a, float *b, float *c, void *stream); +void LaunchTPARTMIN_f32_2x24_src1_col_less(float *a, float *b, float *c, void *stream); +void LaunchTPARTMIN_f32_95x95_full(float *a, float *b, float *c, void *stream); +void LaunchTPARTMIN_f16_122x123_complex(uint16_t *a, uint16_t *b, uint16_t *c, void *stream); +void LaunchTPARTMIN_i32_122x123_complex(int32_t *a, int32_t *b, int32_t *c, void *stream); +void LaunchTPARTMIN_u32_122x123_complex(uint32_t *a, uint32_t *b, uint32_t *c, void *stream); +void LaunchTPARTMIN_u8_122x123_complex(uint8_t *a, uint8_t *b, uint8_t *c, void *stream); + +using LaunchFn = void (*)(void *, void *, void *, void *); + +struct TestCase { + const char *name; + LaunchFn launch; + size_t rows; // allocated tile rows + size_t cols; // allocated tile cols (valid cols) + size_t src0ValidRows; // src0 effective rows + size_t src0ValidCols; // src0 effective cols + size_t src1ValidRows; // src1 effective rows + size_t src1ValidCols; // src1 effective cols + size_t dstValidRows; // dst effective rows + size_t dstValidCols; // dst effective cols + size_t elemSize; // bytes per element +}; + +static const TestCase kCases[] = { +{"f32_2x24_src1_col_less", reinterpret_cast(LaunchTPARTMIN_f32_2x24_src1_col_less), 2, 24, 2, 24, 2, 8, 2, 24, sizeof(float)}, +{"f32_64x64_full", reinterpret_cast(LaunchTPARTMIN_f32_64x64_full), 64, 64, 64, 64, 64, 64, 64, 64, sizeof(float)}, +}; +static constexpr size_t kNumCases = sizeof(kCases) / 
sizeof(kCases[0]); + +// Calculate aligned cols for 32-byte alignment +static size_t CalcAlignedCols(size_t cols, size_t elemSize) { + size_t totalBytes = cols * elemSize; + size_t alignedBytes = ((totalBytes + 31) / 32) * 32; + return alignedBytes / elemSize; +} + +// Helper to pad data with stride +static void PadDataWithStride(const void *src, void *dst, size_t rows, size_t cols, + size_t alignedCols, size_t elemSize) { + const char *srcPtr = static_cast(src); + char *dstPtr = static_cast(dst); + for (size_t r = 0; r < rows; ++r) { + memcpy(dstPtr + r * alignedCols * elemSize, + srcPtr + r * cols * elemSize, + cols * elemSize); + // Zero-fill padding region (optional, data will be overwritten by kernel) + memset(dstPtr + r * alignedCols * elemSize + cols * elemSize, + 0, + (alignedCols - cols) * elemSize); + } +} + +// Helper to unpad data (extract valid cols) +static void UnpadDataWithStride(const void *src, void *dst, size_t rows, size_t cols, + size_t alignedCols, size_t elemSize) { + const char *srcPtr = static_cast(src); + char *dstPtr = static_cast(dst); + for (size_t r = 0; r < rows; ++r) { + memcpy(dstPtr + r * cols * elemSize, + srcPtr + r * alignedCols * elemSize, + cols * elemSize); + } +} + +static int RunCase(const TestCase &tc, int deviceId, aclrtStream stream) { + int rc = 0; + const size_t elemCount = tc.rows * tc.cols; + const size_t fileSize = elemCount * tc.elemSize; + const size_t alignedCols = CalcAlignedCols(tc.cols, tc.elemSize); + const size_t paddedSize = tc.rows * alignedCols * tc.elemSize; + + std::printf("[INFO] === case: %s (shape=%zux%zu, src0_valid=%zux%zu, src1_valid=%zux%zu, dst_valid=%zux%zu, alignedCols=%zu) ===\n", + tc.name, tc.rows, tc.cols, tc.src0ValidRows, tc.src0ValidCols, + tc.src1ValidRows, tc.src1ValidCols, tc.dstValidRows, tc.dstValidCols, alignedCols); + + // Per-case data directory + std::string caseDir = std::string("./") + tc.name; + + void *src0HostOrig = nullptr, *src1HostOrig = nullptr, *dstHostOrig = nullptr; 
+ void *src0Host = nullptr, *src1Host = nullptr, *dstHost = nullptr; + void *src0Device = nullptr, *src1Device = nullptr, *dstDevice = nullptr; + + // Allocate host buffers for original data (contiguous) + aclrtMallocHost((void **)(&src0HostOrig), fileSize); + aclrtMallocHost((void **)(&src1HostOrig), fileSize); + aclrtMallocHost((void **)(&dstHostOrig), fileSize); + + // Allocate host buffers for padded data + aclrtMallocHost((void **)(&src0Host), paddedSize); + aclrtMallocHost((void **)(&src1Host), paddedSize); + aclrtMallocHost((void **)(&dstHost), paddedSize); + + // Allocate device buffers with padded size + aclrtMalloc((void **)&src0Device, paddedSize, ACL_MEM_MALLOC_HUGE_FIRST); + aclrtMalloc((void **)&src1Device, paddedSize, ACL_MEM_MALLOC_HUGE_FIRST); + aclrtMalloc((void **)&dstDevice, paddedSize, ACL_MEM_MALLOC_HUGE_FIRST); + + if (rc == 0) { + size_t src0FileSize = fileSize; + size_t src1FileSize = fileSize; + if (!ReadFile((caseDir + "/input1.bin").c_str(), src0FileSize, src0HostOrig, fileSize)) { + std::fprintf(stderr, "[ERROR] failed to read %s/input1.bin\n", caseDir.c_str()); + rc = 1; + } + if (rc == 0 && !ReadFile((caseDir + "/input2.bin").c_str(), src1FileSize, src1HostOrig, fileSize)) { + std::fprintf(stderr, "[ERROR] failed to read %s/input2.bin\n", caseDir.c_str()); + rc = 1; + } + } + + if (rc == 0) { + // Pad input data with stride + PadDataWithStride(src0HostOrig, src0Host, tc.rows, tc.cols, alignedCols, tc.elemSize); + PadDataWithStride(src1HostOrig, src1Host, tc.rows, tc.cols, alignedCols, tc.elemSize); + + aclrtMemcpy(src0Device, paddedSize, src0Host, paddedSize, ACL_MEMCPY_HOST_TO_DEVICE); + aclrtMemcpy(src1Device, paddedSize, src1Host, paddedSize, ACL_MEMCPY_HOST_TO_DEVICE); + + tc.launch(src0Device, src1Device, dstDevice, stream); + + aclrtSynchronizeStream(stream); + aclrtMemcpy(dstHost, paddedSize, dstDevice, paddedSize, ACL_MEMCPY_DEVICE_TO_HOST); + + // Unpad output data + UnpadDataWithStride(dstHost, dstHostOrig, tc.rows, tc.cols, 
alignedCols, tc.elemSize); + } + + if (rc == 0 && !WriteFile((caseDir + "/output.bin").c_str(), dstHostOrig, fileSize)) { + std::fprintf(stderr, "[ERROR] failed to write %s/output.bin\n", caseDir.c_str()); + rc = 1; + } + + if (src0Device != nullptr) + aclrtFree(src0Device); + if (src1Device != nullptr) + aclrtFree(src1Device); + if (dstDevice != nullptr) + aclrtFree(dstDevice); + if (src0Host != nullptr) + aclrtFreeHost(src0Host); + if (src1Host != nullptr) + aclrtFreeHost(src1Host); + if (dstHost != nullptr) + aclrtFreeHost(dstHost); + if (src0HostOrig != nullptr) + aclrtFreeHost(src0HostOrig); + if (src1HostOrig != nullptr) + aclrtFreeHost(src1HostOrig); + if (dstHostOrig != nullptr) + aclrtFreeHost(dstHostOrig); + + if (rc == 0) + std::printf("[INFO] case %s done\n", tc.name); + return rc; +} + +int main(int argc, char *argv[]) { + // Optional case filter: ./tpartmin [case_name] + const char *caseFilter = (argc > 1) ? argv[1] : nullptr; + + int rc = 0; + int deviceId = 0; + aclrtStream stream = nullptr; + + aclInit(nullptr); + if (const char *envDevice = std::getenv("ACL_DEVICE_ID")) { + deviceId = std::atoi(envDevice); + } + aclrtSetDevice(deviceId); + aclrtCreateStream(&stream); + + for (size_t i = 0; i < kNumCases; ++i) { + if (caseFilter != nullptr && std::strcmp(kCases[i].name, caseFilter) != 0) { + continue; + } + int ret = RunCase(kCases[i], deviceId, stream); + if (ret != 0) { + std::fprintf(stderr, "[ERROR] case %s failed\n", kCases[i].name); + rc = 1; + break; + } + } + + if (stream != nullptr) + aclrtDestroyStream(stream); + aclrtResetDevice(deviceId); + aclFinalize(); + + return rc; +} diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tpartmin/tpartmin.pto b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tpartmin/tpartmin.pto new file mode 100644 index 000000000..54ceb5e27 --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tpartmin/tpartmin.pto @@ -0,0 +1,132 @@ +// Copyright (c) 2026 Huawei Technologies Co., Ltd. 
+// This program is free software, you can redistribute it and/or modify it under the terms and conditions of +// CANN Open Software License Agreement Version 2.0 (the "License"). +// Please refer to the License for details. You may not use this file except in compliance with the License. +// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +// See LICENSE in the root of the software repository for the full text of the License. + +// TileLang ST kernels for pto.tpartmin: partial elementwise min with valid region handling. +// Multiple cases with different valid_shape combinations in a single module. +// Compiled by ptoas --enable-insert-sync --enable-tile-op-expand --pto-backend=vpto +// to produce a fatobj object. + +module attributes {pto.target_arch = "a5", pto.kernel_kind = #pto.kernel_kind} { + // Case: f32_2x24_src1_col_less (src0 valid 2x24, src1 valid 2x8, dst valid 2x24) + func.func @TPARTMIN_f32_2x24_src1_col_less(%a_ptr: !pto.ptr, %b_ptr: !pto.ptr, %c_ptr: !pto.ptr) attributes {pto.kernel} { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c2 = arith.constant 2 : index + %c24 = arith.constant 24 : index + %c48 = arith.constant 48 : index + + %a_view = pto.make_tensor_view %a_ptr, + shape = [%c1, %c1, %c1, %c2, %c24], + strides = [%c48, %c48, %c48, %c24, %c1] + : !pto.tensor_view<1x1x1x2x24xf32> + %b_view = pto.make_tensor_view %b_ptr, + shape = [%c1, %c1, %c1, %c2, %c24], + strides = [%c48, %c48, %c48, %c24, %c1] + : !pto.tensor_view<1x1x1x2x24xf32> + %c_view = pto.make_tensor_view %c_ptr, + shape = [%c1, %c1, %c1, %c2, %c24], + strides = [%c48, %c48, %c48, %c24, %c1] + : !pto.tensor_view<1x1x1x2x24xf32> + + %a_part = pto.partition_view %a_view, + offsets = [%c0, %c0, %c0, %c0, %c0], + sizes = [%c1, %c1, %c1, %c2, %c24] + : !pto.tensor_view<1x1x1x2x24xf32> ->
!pto.partition_tensor_view<1x1x1x2x24xf32> + %b_part = pto.partition_view %b_view, + offsets = [%c0, %c0, %c0, %c0, %c0], + sizes = [%c1, %c1, %c1, %c2, %c24] + : !pto.tensor_view<1x1x1x2x24xf32> -> !pto.partition_tensor_view<1x1x1x2x24xf32> + %c_part = pto.partition_view %c_view, + offsets = [%c0, %c0, %c0, %c0, %c0], + sizes = [%c1, %c1, %c1, %c2, %c24] + : !pto.tensor_view<1x1x1x2x24xf32> -> !pto.partition_tensor_view<1x1x1x2x24xf32> + + // src0: valid region (2,24) + %a = pto.alloc_tile + : !pto.tile_buf + // src1: valid region (2,8) + %b = pto.alloc_tile + : !pto.tile_buf + // dst: valid region (2,24) + %c = pto.alloc_tile + : !pto.tile_buf + + pto.tload ins(%a_part : !pto.partition_tensor_view<1x1x1x2x24xf32>) + outs(%a : !pto.tile_buf) + pto.tload ins(%b_part : !pto.partition_tensor_view<1x1x1x2x24xf32>) + outs(%b : !pto.tile_buf) + + pto.tpartmin ins(%a, %b : !pto.tile_buf, + !pto.tile_buf) + outs(%c : !pto.tile_buf) + + pto.tstore ins(%c : !pto.tile_buf) + outs(%c_part : !pto.partition_tensor_view<1x1x1x2x24xf32>) + return + } + + // Case: f32_128x64_src1_row_less (src0 valid 128x64, src1 valid 96x64, dst valid 128x64) + + func.func @TPARTMIN_f32_64x64_full(%a_ptr: !pto.ptr, %b_ptr: !pto.ptr, %c_ptr: !pto.ptr) attributes {pto.kernel} { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c64 = arith.constant 64 : index + %c4096 = arith.constant 4096 : index + + %a_view = pto.make_tensor_view %a_ptr, + shape = [%c1, %c1, %c1, %c64, %c64], + strides = [%c4096, %c4096, %c4096, %c64, %c1] + : !pto.tensor_view<1x1x1x64x64xf32> + %b_view = pto.make_tensor_view %b_ptr, + shape = [%c1, %c1, %c1, %c64, %c64], + strides = [%c4096, %c4096, %c4096, %c64, %c1] + : !pto.tensor_view<1x1x1x64x64xf32> + %c_view = pto.make_tensor_view %c_ptr, + shape = [%c1, %c1, %c1, %c64, %c64], + strides = [%c4096, %c4096, %c4096, %c64, %c1] + : !pto.tensor_view<1x1x1x64x64xf32> + + %a_part = pto.partition_view %a_view, + offsets = [%c0, %c0, %c0, %c0, %c0], + sizes = 
[%c1, %c1, %c1, %c64, %c64] + : !pto.tensor_view<1x1x1x64x64xf32> -> !pto.partition_tensor_view<1x1x1x64x64xf32> + %b_part = pto.partition_view %b_view, + offsets = [%c0, %c0, %c0, %c0, %c0], + sizes = [%c1, %c1, %c1, %c64, %c64] + : !pto.tensor_view<1x1x1x64x64xf32> -> !pto.partition_tensor_view<1x1x1x64x64xf32> + %c_part = pto.partition_view %c_view, + offsets = [%c0, %c0, %c0, %c0, %c0], + sizes = [%c1, %c1, %c1, %c64, %c64] + : !pto.tensor_view<1x1x1x64x64xf32> -> !pto.partition_tensor_view<1x1x1x64x64xf32> + + // src0: valid region (64,64) + %a = pto.alloc_tile + : !pto.tile_buf + // src1: valid region (64,64) + %b = pto.alloc_tile + : !pto.tile_buf + // dst: valid region (64,64) + %c = pto.alloc_tile + : !pto.tile_buf + + pto.tload ins(%a_part : !pto.partition_tensor_view<1x1x1x64x64xf32>) + outs(%a : !pto.tile_buf) + pto.tload ins(%b_part : !pto.partition_tensor_view<1x1x1x64x64xf32>) + outs(%b : !pto.tile_buf) + + pto.tpartmin ins(%a, %b : !pto.tile_buf, + !pto.tile_buf) + outs(%c : !pto.tile_buf) + + pto.tstore ins(%c : !pto.tile_buf) + outs(%c_part : !pto.partition_tensor_view<1x1x1x64x64xf32>) + return + } + + // Case: f32_2x24_src1_col_less (src0 valid 2x24, src1 valid 2x8, dst valid 2x24) +} diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tpartmul/CMakeLists.txt b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tpartmul/CMakeLists.txt new file mode 100644 index 000000000..d64c803dd --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tpartmul/CMakeLists.txt @@ -0,0 +1,9 @@ +# Copyright (c) 2026 Huawei Technologies Co., Ltd. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. 
+# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. + +pto_tilelang_vec_st(tpartmul) diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tpartmul/cases.py b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tpartmul/cases.py new file mode 100644 index 000000000..a6c5f6e01 --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tpartmul/cases.py @@ -0,0 +1,129 @@ +#!/usr/bin/python3 +# Copyright (c) 2026 Huawei Technologies Co., Ltd. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. + +# coding=utf-8 + +"""Single source of truth for tpartmul ST test cases. + +Each case defines: + - name: case identifier, used as subdirectory name and by main.cpp kCases[]. + - dtype: numpy dtype (e.g. np.float32). + - shape: (rows, cols) — allocated tile dimensions (same for src0/src1/dst). + - valid_shape: (valid_rows, valid_cols) — src0 valid region (src0_eq_dst scenario). + - src1_vshape: (src1_valid_rows, src1_valid_cols) — src1 valid region. + May be smaller than dst valid region for partial mul cases. + - dst_vshape: (dst_valid_rows, dst_valid_cols) — dst valid region. + - eps: tolerance for numpy.allclose (atol and rtol). 
+ +tpartmul semantics: + - If src0_valid == dst_valid: dst[:src1_rows,:src1_cols] = src0[:src1_rows,:src1_cols] * src1[:src1_rows,:src1_cols] + dst[src1_rows:,:] = src0[src1_rows:,:] (copy remaining rows) + OR (for col_less) dst[:,:src1_cols] = src0[:,:src1_cols] * src1[:,:src1_cols] + dst[:,src1_cols:] = src0[:,src1_cols:] (copy remaining cols) + - If src1_valid == dst_valid: similar logic with src1 as the full operand. + +gen_data.py and compare.py both import this list to avoid redundant definitions. +""" + +import numpy as np + +CASES = [ + # float16 cases + { + "name": "f16_8x48_src0_col_less", + "dtype": np.float16, + "shape": (8, 48), + "valid_shape": (8, 16), # src0 valid region (col_less) + "src1_vshape": (8, 48), # src1 valid region (equals dst) + "dst_vshape": (8, 48), # dst valid region + "eps": 1e-3, + }, + # float32 cases + { + "name": "f32_64x64_full", + "dtype": np.float32, + "shape": (64, 64), + "valid_shape": (64, 64), # src0 valid region + "src1_vshape": (64, 64), # src1 valid region (same as dst) + "dst_vshape": (64, 64), # dst valid region + "eps": 1e-6, + }, + { + "name": "f32_64x64_src0_row_less", + "dtype": np.float32, + "shape": (64, 64), + "valid_shape": (8, 64), # src0 valid region (row_less) + "src1_vshape": (64, 64), # src1 valid region (equals dst) + "dst_vshape": (64, 64), # dst valid region + "eps": 1e-6, + }, + { + "name": "f32_64x64_src0_col_less", + "dtype": np.float32, + "shape": (64, 64), + "valid_shape": (64, 8), # src0 valid region (col_less) + "src1_vshape": (64, 64), # src1 valid region (equals dst) + "dst_vshape": (64, 64), # dst valid region + "eps": 1e-6, + }, + { + "name": "f32_64x64_src1_row_less", + "dtype": np.float32, + "shape": (64, 64), + "valid_shape": (64, 64), # src0 valid region (equals dst) + "src1_vshape": (8, 64), # src1 valid region (row_less) + "dst_vshape": (64, 64), # dst valid region + "eps": 1e-6, + }, + { + "name": "f32_64x64_src1_col_less", + "dtype": np.float32, + "shape": (64, 64), + "valid_shape": 
(64, 64), # src0 valid region (equals dst) + "src1_vshape": (64, 8), # src1 valid region (col_less) + "dst_vshape": (64, 64), # dst valid region + "eps": 1e-6, + }, + { + "name": "f16_8x768_src0_col_less", + "dtype": np.float16, + "shape": (8, 768), + "valid_shape": (8, 512), # src0 valid region (col_less) + "src1_vshape": (8, 768), # src1 valid region (equals dst) + "dst_vshape": (8, 768), # dst valid region + "eps": 1e-3, + }, + # int16 cases + { + "name": "i16_8x48_src1_col_less", + "dtype": np.int16, + "shape": (8, 48), + "valid_shape": (8, 48), # src0 valid region (equals dst) + "src1_vshape": (8, 16), # src1 valid region (col_less) + "dst_vshape": (8, 48), # dst valid region + "eps": 0, # exact match for int + }, + # int32 cases + { + "name": "i32_64x64_src0_row_less", + "dtype": np.int32, + "shape": (64, 64), + "valid_shape": (8, 64), # src0 valid region (row_less) + "src1_vshape": (64, 64), # src1 valid region (equals dst) + "dst_vshape": (64, 64), # dst valid region + "eps": 0, # exact match for int + }, +] + +_SMOKE_CASE_NAMES = ['f16_8x48_src0_col_less', 'i16_8x48_src1_col_less'] +_SMOKE_CASE_NAME_SET = set(_SMOKE_CASE_NAMES) +_missing = [name for name in _SMOKE_CASE_NAMES if name not in {case["name"] for case in CASES}] +if _missing: + raise RuntimeError("unknown smoke case(s): " + ", ".join(_missing)) +CASES = [case for case in CASES if case["name"] in _SMOKE_CASE_NAME_SET] diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tpartmul/compare.py b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tpartmul/compare.py new file mode 100644 index 000000000..296e28e22 --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tpartmul/compare.py @@ -0,0 +1,53 @@ +#!/usr/bin/python3 +# Copyright (c) 2026 Huawei Technologies Co., Ltd. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). 
+# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. + +# coding=utf-8 + +import os +import sys +import numpy as np + +# Add parent directory to path for st_common import +sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) +from st_common import result_cmp, style_fail, style_pass + +from cases import CASES + + +def main(): + case_filter = sys.argv[1] if len(sys.argv) > 1 else None + + all_passed = True + for case in CASES: + if case_filter is not None and case["name"] != case_filter: + continue + + case_dir = case["name"] + shape = case["shape"] + dtype = case["dtype"] + dst_vr, dst_vc = case["dst_vshape"] + + golden = np.fromfile(os.path.join(case_dir, "golden.bin"), dtype=dtype).reshape(shape) + output = np.fromfile(os.path.join(case_dir, "output.bin"), dtype=dtype).reshape(shape) + + # Compare only the dst valid region + ok = result_cmp(golden[:dst_vr, :dst_vc], output[:dst_vr, :dst_vc], case["eps"]) + if ok: + print(style_pass(f"[INFO] {case['name']}: compare passed")) + else: + print(style_fail(f"[ERROR] {case['name']}: compare failed")) + all_passed = False + + if not all_passed: + sys.exit(2) + print(style_pass("[INFO] all cases passed")) + + +if __name__ == "__main__": + main() diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tpartmul/gen_data.py b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tpartmul/gen_data.py new file mode 100644 index 000000000..3727e5673 --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tpartmul/gen_data.py @@ -0,0 +1,96 @@ +#!/usr/bin/python3 +# Copyright (c) 2026 Huawei Technologies Co., Ltd. 
+# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. + +# coding=utf-8 + +import numpy as np +import os +import sys + +# Add parent directory to path for st_common import +sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) +from st_common import setup_case_rng, save_case_data + +from cases import CASES + + +def _to_tuple(shape): + """Convert shape to tuple if needed.""" + if isinstance(shape, tuple): + return shape + return tuple(shape) + + +for case in CASES: + setup_case_rng(case) + + dtype = case["dtype"] + shape = _to_tuple(case["shape"]) + src0_valid = _to_tuple(case["valid_shape"]) + src1_valid = _to_tuple(case["src1_vshape"]) + dst_valid = _to_tuple(case["dst_vshape"]) + + rows, cols = shape + src0_vr, src0_vc = src0_valid + src1_vr, src1_vc = src1_valid + dst_vr, dst_vc = dst_valid + + input1 = np.random.randint(1, 10, size=shape).astype(dtype) + input2 = np.random.randint(1, 10, size=shape).astype(dtype) + + golden = np.zeros(shape, dtype=dtype) + + # Compute golden according to tpartmul semantics from template: + # If src0_valid == dst_valid: use tpart_op with src0 as full operand + # - If src1 row_less: mul for src1 region, copy src0 for remaining rows + # - If src1 col_less: copy src0 full, then mul for overlapping region + # If src1_valid == dst_valid: use tpart_op with src1 as full operand (swap src0/src1) + + src0_eq_dst = (src0_vr == dst_vr and src0_vc == dst_vc) + src1_eq_dst = (src1_vr == 
dst_vr and src1_vc == dst_vc) + + if src0_eq_dst: + # src0 is the full operand matching dst + src1_row_lt_dst = (src1_vr < dst_vr and src1_vc == dst_vc) + src1_col_lt_dst = (src1_vr <= dst_vr and src1_vc < dst_vc) + + if src1_eq_dst: + # Full mul: dst[:] = src0[:] * src1[:] + golden[:dst_vr, :dst_vc] = (input1[:dst_vr, :dst_vc] * input2[:dst_vr, :dst_vc]).astype(dtype, copy=False) + elif src1_col_lt_dst: + # Col_less: first copy src0, then mul in overlapping region + golden[:dst_vr, :dst_vc] = input1[:dst_vr, :dst_vc].copy() + if src1_vc > 0: + golden[:src1_vr, :src1_vc] = (input1[:src1_vr, :src1_vc] * input2[:src1_vr, :src1_vc]).astype(dtype, copy=False) + elif src1_row_lt_dst: + # Row_less: mul for src1 region, copy src0 for remaining rows + if src1_vc > 0: + golden[:src1_vr, :src1_vc] = (input1[:src1_vr, :src1_vc] * input2[:src1_vr, :src1_vc]).astype(dtype, copy=False) + golden[src1_vr:dst_vr, :dst_vc] = input1[src1_vr:dst_vr, :dst_vc].copy() + elif src1_eq_dst: + # src1 is the full operand matching dst, swap src0/src1 in the logic + src0_row_lt_dst = (src0_vr < dst_vr and src0_vc == dst_vc) + src0_col_lt_dst = (src0_vr <= dst_vr and src0_vc < dst_vc) + + if src0_eq_dst: + # Full mul: dst[:] = src0[:] * src1[:] + golden[:dst_vr, :dst_vc] = (input1[:dst_vr, :dst_vc] * input2[:dst_vr, :dst_vc]).astype(dtype, copy=False) + elif src0_col_lt_dst: + # Col_less: first copy src1, then mul in overlapping region + golden[:dst_vr, :dst_vc] = input2[:dst_vr, :dst_vc].copy() + if src0_vc > 0: + golden[:src0_vr, :src0_vc] = (input1[:src0_vr, :src0_vc] * input2[:src0_vr, :src0_vc]).astype(dtype, copy=False) + elif src0_row_lt_dst: + # Row_less: mul for src0 region, copy src1 for remaining rows + if src0_vc > 0: + golden[:src0_vr, :src0_vc] = (input1[:src0_vr, :src0_vc] * input2[:src0_vr, :src0_vc]).astype(dtype, copy=False) + golden[src0_vr:dst_vr, :dst_vc] = input2[src0_vr:dst_vr, :dst_vc].copy() + + save_case_data(case["name"], {"input1": input1, "input2": input2, "golden": 
golden}) + print(f"[INFO] gen_data: {case['name']} shape={shape} src0_valid={src0_valid} src1_valid={src1_valid} dst_valid={dst_valid} dtype={dtype.__name__}") diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tpartmul/launch.cpp b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tpartmul/launch.cpp new file mode 100644 index 000000000..23b5ba1ab --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tpartmul/launch.cpp @@ -0,0 +1,28 @@ +// Copyright (c) 2026 Huawei Technologies Co., Ltd. +// This program is free software, you can redistribute it and/or modify it under the terms and conditions of +// CANN Open Software License Agreement Version 2.0 (the "License"). +// Please refer to the License for details. You may not use this file except in compliance with the License. +// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +// See LICENSE in the root of the software repository for the full text of the License. 
+ +#include + +#ifndef AICORE +#define AICORE [aicore] +#endif + +// Cases: f16_8x48_src0_col_less, i16_8x48_src1_col_less + +extern "C" __global__ AICORE void TPARTMUL_f16_8x48_src0_col_less(__gm__ uint16_t *a, __gm__ uint16_t *b, __gm__ uint16_t *c); +extern "C" __global__ AICORE void TPARTMUL_i16_8x48_src1_col_less(__gm__ int16_t *a, __gm__ int16_t *b, __gm__ int16_t *c); + +void LaunchTPARTMUL_f16_8x48_src0_col_less(uint16_t *a, uint16_t *b, uint16_t *c, void *stream) { + TPARTMUL_f16_8x48_src0_col_less<<<1, nullptr, stream>>>((__gm__ uint16_t *)a, (__gm__ uint16_t *)b, (__gm__ uint16_t *)c); +} + + + +void LaunchTPARTMUL_i16_8x48_src1_col_less(int16_t *a, int16_t *b, int16_t *c, void *stream) { + TPARTMUL_i16_8x48_src1_col_less<<<1, nullptr, stream>>>((__gm__ int16_t *)a, (__gm__ int16_t *)b, (__gm__ int16_t *)c); +} diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tpartmul/main.cpp b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tpartmul/main.cpp new file mode 100644 index 000000000..a648ff5c3 --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tpartmul/main.cpp @@ -0,0 +1,152 @@ +// Copyright (c) 2026 Huawei Technologies Co., Ltd. +// This program is free software, you can redistribute it and/or modify it under the terms and conditions of +// CANN Open Software License Agreement Version 2.0 (the "License"). +// Please refer to the License for details. You may not use this file except in compliance with the License. +// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +// See LICENSE in the root of the software repository for the full text of the License. + +// Host driver for TileLang tpartmul ST — case-table driven. +// Each case launches a different kernel variant, reads/writes from per-case subdirectory. +// Numerical comparison is done externally by compare.py.
+ +#include "acl/acl.h" +#include "test_common.h" +#include +#include +#include +#include +#include +#include + +using namespace PtoTestCommon; + +// Kernel launch wrappers (defined in launch.cpp) +void LaunchTPARTMUL_f32_64x64_src0_row_less(float *a, float *b, float *c, void *stream); +void LaunchTPARTMUL_f32_64x64_src1_row_less(float *a, float *b, float *c, void *stream); +void LaunchTPARTMUL_f16_8x48_src0_col_less(uint16_t *a, uint16_t *b, uint16_t *c, void *stream); +void LaunchTPARTMUL_i16_8x48_src1_col_less(int16_t *a, int16_t *b, int16_t *c, void *stream); + +using LaunchFn = void (*)(void *, void *, void *, void *); + +struct TestCase { + const char *name; + LaunchFn launch; + size_t rows; // allocated tile rows + size_t cols; // allocated tile cols + size_t src0ValidRows; // src0 effective rows + size_t src0ValidCols; // src0 effective cols + size_t src1ValidRows; // src1 effective rows + size_t src1ValidCols; // src1 effective cols + size_t dstValidRows; // dst effective rows + size_t dstValidCols; // dst effective cols + size_t elemSize; // bytes per element +}; + +static const TestCase kCases[] = { +{"f16_8x48_src0_col_less", reinterpret_cast(LaunchTPARTMUL_f16_8x48_src0_col_less), 8, 48, 8, 16, 8, 48, 8, 48, sizeof(uint16_t)}, +{"i16_8x48_src1_col_less", reinterpret_cast(LaunchTPARTMUL_i16_8x48_src1_col_less), 8, 48, 8, 48, 8, 16, 8, 48, sizeof(int16_t)}, +}; +static constexpr size_t kNumCases = sizeof(kCases) / sizeof(kCases[0]); + +static int RunCase(const TestCase &tc, int deviceId, aclrtStream stream) { + int rc = 0; + const size_t elemCount = tc.rows * tc.cols; + const size_t fileSize = elemCount * tc.elemSize; + + std::printf("[INFO] === case: %s (shape=%zux%zu, src0_valid=%zux%zu, src1_valid=%zux%zu, dst_valid=%zux%zu) ===\n", + tc.name, tc.rows, tc.cols, tc.src0ValidRows, tc.src0ValidCols, + tc.src1ValidRows, tc.src1ValidCols, tc.dstValidRows, tc.dstValidCols); + + // Per-case data directory + std::string caseDir = std::string("./") + tc.name; 
+ size_t src0FileSize = fileSize; + size_t src1FileSize = fileSize; + + void *src0Host = nullptr, *src1Host = nullptr, *dstHost = nullptr; + void *src0Device = nullptr, *src1Device = nullptr, *dstDevice = nullptr; + + aclrtMallocHost((void **)(&src0Host), fileSize); + aclrtMallocHost((void **)(&src1Host), fileSize); + aclrtMallocHost((void **)(&dstHost), fileSize); + + aclrtMalloc((void **)&src0Device, fileSize, ACL_MEM_MALLOC_HUGE_FIRST); + aclrtMalloc((void **)&src1Device, fileSize, ACL_MEM_MALLOC_HUGE_FIRST); + aclrtMalloc((void **)&dstDevice, fileSize, ACL_MEM_MALLOC_HUGE_FIRST); + + if (!ReadFile((caseDir + "/input1.bin").c_str(), src0FileSize, src0Host, fileSize)) { + std::fprintf(stderr, "[ERROR] failed to read %s/input1.bin\n", caseDir.c_str()); + rc = 1; + } + if (rc == 0 && !ReadFile((caseDir + "/input2.bin").c_str(), src1FileSize, src1Host, fileSize)) { + std::fprintf(stderr, "[ERROR] failed to read %s/input2.bin\n", caseDir.c_str()); + rc = 1; + } + + if (rc == 0) { + aclrtMemcpy(src0Device, fileSize, src0Host, fileSize, ACL_MEMCPY_HOST_TO_DEVICE); + aclrtMemcpy(src1Device, fileSize, src1Host, fileSize, ACL_MEMCPY_HOST_TO_DEVICE); + + tc.launch(src0Device, src1Device, dstDevice, stream); + + aclrtSynchronizeStream(stream); + aclrtMemcpy(dstHost, fileSize, dstDevice, fileSize, ACL_MEMCPY_DEVICE_TO_HOST); + } + + if (rc == 0 && !WriteFile((caseDir + "/output.bin").c_str(), dstHost, fileSize)) { + std::fprintf(stderr, "[ERROR] failed to write %s/output.bin\n", caseDir.c_str()); + rc = 1; + } + + if (src0Device != nullptr) + aclrtFree(src0Device); + if (src1Device != nullptr) + aclrtFree(src1Device); + if (dstDevice != nullptr) + aclrtFree(dstDevice); + if (src0Host != nullptr) + aclrtFreeHost(src0Host); + if (src1Host != nullptr) + aclrtFreeHost(src1Host); + if (dstHost != nullptr) + aclrtFreeHost(dstHost); + + if (rc == 0) + std::printf("[INFO] case %s done\n", tc.name); + return rc; +} + +int main(int argc, char *argv[]) { + // Optional case filter: 
./tpartmul [case_name] + const char *caseFilter = (argc > 1) ? argv[1] : nullptr; + + int rc = 0; + int deviceId = 0; + aclrtStream stream = nullptr; + + aclInit(nullptr); + if (const char *envDevice = std::getenv("ACL_DEVICE_ID")) { + deviceId = std::atoi(envDevice); + } + aclrtSetDevice(deviceId); + aclrtCreateStream(&stream); + + for (size_t i = 0; i < kNumCases; ++i) { + if (caseFilter != nullptr && std::strcmp(kCases[i].name, caseFilter) != 0) { + continue; + } + int ret = RunCase(kCases[i], deviceId, stream); + if (ret != 0) { + std::fprintf(stderr, "[ERROR] case %s failed\n", kCases[i].name); + rc = 1; + break; + } + } + + if (stream != nullptr) + aclrtDestroyStream(stream); + aclrtResetDevice(deviceId); + aclFinalize(); + + return rc; +} diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tpartmul/tpartmul.pto b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tpartmul/tpartmul.pto new file mode 100644 index 000000000..d578564f5 --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tpartmul/tpartmul.pto @@ -0,0 +1,135 @@ +// Copyright (c) 2026 Huawei Technologies Co., Ltd. +// This program is free software, you can redistribute it and/or modify it under the terms and conditions of +// CANN Open Software License Agreement Version 2.0 (the "License"). +// Please refer to the License for details. You may not use this file except in compliance with the License. +// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +// See LICENSE in the root of the software repository for the full text of the License. + +// TileLang ST kernels for pto.tpartmul: partial elementwise mul with valid region handling. +// Multiple cases with different valid_shape combinations in a single module. 
+// Compiled by ptoas --enable-insert-sync --enable-tile-op-expand --pto-backend=vpto +// to produce a fatobj object. + +module attributes {pto.target_arch = "a5", pto.kernel_kind = #pto.kernel_kind} { + // Case 0: f16 8x48 src0 col less (src0 valid 8x16, src1/dst valid 8x48) + func.func @TPARTMUL_f16_8x48_src0_col_less(%a_ptr: !pto.ptr, %b_ptr: !pto.ptr, %c_ptr: !pto.ptr) attributes {pto.kernel} { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c8 = arith.constant 8 : index + %c16 = arith.constant 16 : index + %c48 = arith.constant 48 : index + %c384 = arith.constant 384 : index + + %a_view = pto.make_tensor_view %a_ptr, + shape = [%c1, %c1, %c1, %c8, %c48], + strides = [%c384, %c384, %c384, %c48, %c1] + : !pto.tensor_view<1x1x1x8x48xf16> + %b_view = pto.make_tensor_view %b_ptr, + shape = [%c1, %c1, %c1, %c8, %c48], + strides = [%c384, %c384, %c384, %c48, %c1] + : !pto.tensor_view<1x1x1x8x48xf16> + %c_view = pto.make_tensor_view %c_ptr, + shape = [%c1, %c1, %c1, %c8, %c48], + strides = [%c384, %c384, %c384, %c48, %c1] + : !pto.tensor_view<1x1x1x8x48xf16> + + %a_part = pto.partition_view %a_view, + offsets = [%c0, %c0, %c0, %c0, %c0], + sizes = [%c1, %c1, %c1, %c8, %c48] + : !pto.tensor_view<1x1x1x8x48xf16> -> !pto.partition_tensor_view<1x1x1x8x48xf16> + %b_part = pto.partition_view %b_view, + offsets = [%c0, %c0, %c0, %c0, %c0], + sizes = [%c1, %c1, %c1, %c8, %c48] + : !pto.tensor_view<1x1x1x8x48xf16> -> !pto.partition_tensor_view<1x1x1x8x48xf16> + %c_part = pto.partition_view %c_view, + offsets = [%c0, %c0, %c0, %c0, %c0], + sizes = [%c1, %c1, %c1, %c8, %c48] + : !pto.tensor_view<1x1x1x8x48xf16> -> !pto.partition_tensor_view<1x1x1x8x48xf16> + + // src0: partial valid region (8,16) + %a = pto.alloc_tile + : !pto.tile_buf + // src1: full valid region (8,48) + %b = pto.alloc_tile + : !pto.tile_buf + // dst: full valid region (8,48) + %c = pto.alloc_tile + : !pto.tile_buf + + pto.tload ins(%a_part : !pto.partition_tensor_view<1x1x1x8x48xf16>) + 
outs(%a : !pto.tile_buf) + pto.tload ins(%b_part : !pto.partition_tensor_view<1x1x1x8x48xf16>) + outs(%b : !pto.tile_buf) + + pto.tpartmul ins(%a, %b : !pto.tile_buf, + !pto.tile_buf) + outs(%c : !pto.tile_buf) + + pto.tstore ins(%c : !pto.tile_buf) + outs(%c_part : !pto.partition_tensor_view<1x1x1x8x48xf16>) + return + } + + // Case 1: i16 8x48 src1 col less (src1 valid 8x16, src0/dst valid 8x48) + + func.func @TPARTMUL_i16_8x48_src1_col_less(%a_ptr: !pto.ptr, %b_ptr: !pto.ptr, %c_ptr: !pto.ptr) attributes {pto.kernel} { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c8 = arith.constant 8 : index + %c16 = arith.constant 16 : index + %c48 = arith.constant 48 : index + %c384 = arith.constant 384 : index + + %a_view = pto.make_tensor_view %a_ptr, + shape = [%c1, %c1, %c1, %c8, %c48], + strides = [%c384, %c384, %c384, %c48, %c1] + : !pto.tensor_view<1x1x1x8x48xi16> + %b_view = pto.make_tensor_view %b_ptr, + shape = [%c1, %c1, %c1, %c8, %c48], + strides = [%c384, %c384, %c384, %c48, %c1] + : !pto.tensor_view<1x1x1x8x48xi16> + %c_view = pto.make_tensor_view %c_ptr, + shape = [%c1, %c1, %c1, %c8, %c48], + strides = [%c384, %c384, %c384, %c48, %c1] + : !pto.tensor_view<1x1x1x8x48xi16> + + %a_part = pto.partition_view %a_view, + offsets = [%c0, %c0, %c0, %c0, %c0], + sizes = [%c1, %c1, %c1, %c8, %c48] + : !pto.tensor_view<1x1x1x8x48xi16> -> !pto.partition_tensor_view<1x1x1x8x48xi16> + %b_part = pto.partition_view %b_view, + offsets = [%c0, %c0, %c0, %c0, %c0], + sizes = [%c1, %c1, %c1, %c8, %c48] + : !pto.tensor_view<1x1x1x8x48xi16> -> !pto.partition_tensor_view<1x1x1x8x48xi16> + %c_part = pto.partition_view %c_view, + offsets = [%c0, %c0, %c0, %c0, %c0], + sizes = [%c1, %c1, %c1, %c8, %c48] + : !pto.tensor_view<1x1x1x8x48xi16> -> !pto.partition_tensor_view<1x1x1x8x48xi16> + + // src0: full valid region (8,48) + %a = pto.alloc_tile + : !pto.tile_buf + // src1: partial valid region (8,16) + %b = pto.alloc_tile + : !pto.tile_buf + // dst: full valid 
region (8,48) + %c = pto.alloc_tile + : !pto.tile_buf + + pto.tload ins(%a_part : !pto.partition_tensor_view<1x1x1x8x48xi16>) + outs(%a : !pto.tile_buf) + pto.tload ins(%b_part : !pto.partition_tensor_view<1x1x1x8x48xi16>) + outs(%b : !pto.tile_buf) + + pto.tpartmul ins(%a, %b : !pto.tile_buf, + !pto.tile_buf) + outs(%c : !pto.tile_buf) + + pto.tstore ins(%c : !pto.tile_buf) + outs(%c_part : !pto.partition_tensor_view<1x1x1x8x48xi16>) + return + } + + // Case 8: i32 64x64 src0 row less (src0 valid 8x64, src1/dst valid 64x64) +} diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tprelu/CMakeLists.txt b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tprelu/CMakeLists.txt new file mode 100644 index 000000000..87d9a4d74 --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tprelu/CMakeLists.txt @@ -0,0 +1,9 @@ +# Copyright (c) 2026 Huawei Technologies Co., Ltd. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. + +pto_tilelang_vec_st(tprelu) diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tprelu/cases.py b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tprelu/cases.py new file mode 100644 index 000000000..eb980240d --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tprelu/cases.py @@ -0,0 +1,90 @@ +#!/usr/bin/python3 +# Copyright (c) 2026 Huawei Technologies Co., Ltd. 
+# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. + +# coding=utf-8 + +"""Single source of truth for tprelu ST test cases. + +Each case defines: + - name: case identifier, used as subdirectory name and by main.cpp kCases[]. + - dtype: numpy dtype (e.g. np.float16, np.float32). + - shape: (rows, cols) — allocated tile dimensions. + - valid_shape: (valid_rows, valid_cols) — effective computation region. + - eps: tolerance for numpy.allclose (atol and rtol). + +gen_data.py and compare.py both import this list to avoid redundant definitions. 
+""" + +import numpy as np + +CASES = [ + { + "name": "f16_64x64", + "dtype": np.float16, + "shape": (64, 64), + "valid_shape": (64, 64), + "eps": 1e-3, + }, + { + "name": "f16_63x63", + "dtype": np.float16, + "shape": (64, 64), + "valid_shape": (63, 63), + "eps": 1e-3, + }, + { + "name": "f16_1x16384", + "dtype": np.float16, + "shape": (1, 16384), + "valid_shape": (1, 16384), + "eps": 1e-3, + }, + { + "name": "f16_2048x16", + "dtype": np.float16, + "shape": (2048, 16), + "valid_shape": (2048, 16), + "eps": 1e-3, + }, + { + "name": "f32_64x64", + "dtype": np.float32, + "shape": (64, 64), + "valid_shape": (64, 64), + "eps": 1e-6, + }, + { + "name": "f32_63x63", + "dtype": np.float32, + "shape": (64, 64), + "valid_shape": (63, 63), + "eps": 1e-6, + }, + { + "name": "f32_1x16384", + "dtype": np.float32, + "shape": (1, 16384), + "valid_shape": (1, 16384), + "eps": 1e-6, + }, + { + "name": "f32_2048x8", + "dtype": np.float32, + "shape": (2048, 8), + "valid_shape": (2048, 8), + "eps": 1e-6, + }, +] + +_SMOKE_CASE_NAMES = ['f16_63x63', 'f32_64x64'] +_SMOKE_CASE_NAME_SET = set(_SMOKE_CASE_NAMES) +_missing = [name for name in _SMOKE_CASE_NAMES if name not in {case["name"] for case in CASES}] +if _missing: + raise RuntimeError("unknown smoke case(s): " + ", ".join(_missing)) +CASES = [case for case in CASES if case["name"] in _SMOKE_CASE_NAME_SET] diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tprelu/compare.py b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tprelu/compare.py new file mode 100644 index 000000000..428604929 --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tprelu/compare.py @@ -0,0 +1,49 @@ +#!/usr/bin/python3 +# Copyright (c) 2026 Huawei Technologies Co., Ltd. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. 
You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. + +# coding=utf-8 + +import os +import sys +import numpy as np + +from cases import CASES +from st_common import result_cmp, style_fail, style_pass, validate_cases + + +def main(): + validate_cases(CASES) + case_filter = sys.argv[1] if len(sys.argv) > 1 else None + + all_passed = True + for case in CASES: + if case_filter is not None and case["name"] != case_filter: + continue + + case_dir = case["name"] + shape = case["shape"] + vr, vc = case["valid_shape"] + + golden = np.fromfile(os.path.join(case_dir, "golden.bin"), dtype=case["dtype"]).reshape(shape) + output = np.fromfile(os.path.join(case_dir, "output.bin"), dtype=case["dtype"]).reshape(shape) + + ok = result_cmp(golden[:vr, :vc], output[:vr, :vc], case["eps"]) + if ok: + print(style_pass(f"[INFO] {case['name']}: compare passed")) + else: + print(style_fail(f"[ERROR] {case['name']}: compare failed")) + all_passed = False + + if not all_passed: + sys.exit(2) + print(style_pass("[INFO] all cases passed")) + + +if __name__ == "__main__": + main() diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tprelu/gen_data.py b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tprelu/gen_data.py new file mode 100644 index 000000000..3a3ece4d0 --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tprelu/gen_data.py @@ -0,0 +1,40 @@ +#!/usr/bin/python3 +# Copyright (c) 2026 Huawei Technologies Co., Ltd. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. 
You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. + +# coding=utf-8 + +import numpy as np +from cases import CASES +from st_common import validate_cases, setup_case_rng, save_case_data + +validate_cases(CASES) + +for case in CASES: + setup_case_rng(case) + + dtype = case["dtype"] + shape = case["shape"] + valid_shape = case["valid_shape"] + + rows, cols = shape + vr, vc = valid_shape + + input0 = np.random.uniform(-8, high=8, size=(rows, cols)).astype(dtype) + input1 = np.random.uniform(-8, high=8, size=(rows, cols)).astype(dtype) + + golden = np.zeros((rows, cols), dtype=dtype) + for i in range(vr): + for j in range(vc): + if input0[i, j] > 0: + golden[i, j] = input0[i, j] + else: + golden[i, j] = dtype(input0[i, j] * input1[i, j]) + + save_case_data(case["name"], {"input0": input0, "input1": input1, "golden": golden}) + print(f"[INFO] gen_data: {case['name']} shape={shape} valid_shape={valid_shape} dtype={dtype.__name__}") diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tprelu/launch.cpp b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tprelu/launch.cpp new file mode 100644 index 000000000..c777384ae --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tprelu/launch.cpp @@ -0,0 +1,28 @@ +// Copyright (c) 2026 Huawei Technologies Co., Ltd. +// This program is free software, you can redistribute it and/or modify it under the terms and conditions of +// CANN Open Software License Agreement Version 2.0 (the "License"). +// Please refer to the License for details. You may not use this file except in compliance with the License. 
+// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +// See LICENSE in the root of the software repository for the full text of the License. + +#include + +#ifndef AICORE +#define AICORE [aicore] +#endif + +// Case 0: f16 64x64 + +extern "C" __global__ AICORE void TPRELU_f16_63x63(__gm__ uint16_t *src0, __gm__ uint16_t *src1, __gm__ uint16_t *dst); +extern "C" __global__ AICORE void TPRELU_f32_64x64(__gm__ float *src0, __gm__ float *src1, __gm__ float *dst); + +void LaunchTPRELU_f16_63x63(uint16_t *src0, uint16_t *src1, uint16_t *dst, void *stream) { + TPRELU_f16_63x63<<<1, nullptr, stream>>>((__gm__ uint16_t *)src0, (__gm__ uint16_t *)src1, (__gm__ uint16_t *)dst); +} + + + +void LaunchTPRELU_f32_64x64(float *src0, float *src1, float *dst, void *stream) { + TPRELU_f32_64x64<<<1, nullptr, stream>>>((__gm__ float *)src0, (__gm__ float *)src1, (__gm__ float *)dst); +} diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tprelu/main.cpp b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tprelu/main.cpp new file mode 100644 index 000000000..8ec4f2208 --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tprelu/main.cpp @@ -0,0 +1,189 @@ +// Copyright (c) 2026 Huawei Technologies Co., Ltd. +// This program is free software, you can redistribute it and/or modify it under the terms and conditions of +// CANN Open Software License Agreement Version 2.0 (the "License"). +// Please refer to the License for details. You may not use this file except in compliance with the License. +// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +// See LICENSE in the root of the software repository for the full text of the License. 
+ +// Host driver for TileLang tprelu ST — case-table driven. +// Each case launches a different kernel variant, reads/writes from per-case subdirectory. +// Numerical comparison is done externally by compare.py. + +#include "acl/acl.h" +#include "test_common.h" +#include +#include +#include +#include +#include +#include + +using namespace PtoTestCommon; + +// Kernel launch wrappers (defined in launch.cpp) +void LaunchTPRELU_f16_63x63(uint16_t *src0, uint16_t *src1, uint16_t *dst, void *stream); +void LaunchTPRELU_f16_2048x16(uint16_t *src0, uint16_t *src1, uint16_t *dst, void *stream); +void LaunchTPRELU_f32_64x64(float *src0, float *src1, float *dst, void *stream); +void LaunchTPRELU_f32_63x63(float *src0, float *src1, float *dst, void *stream); +void LaunchTPRELU_f32_2048x8(float *src0, float *src1, float *dst, void *stream); + +enum DataType { F16, F32 }; + +struct TestCase { + const char *name; + DataType dtype; + void * launch; + size_t rows; + size_t cols; + size_t validRows; + size_t validCols; +}; + +static const TestCase kCases[] = { +{"f16_63x63", F16, (void*)LaunchTPRELU_f16_63x63, 64, 64, 63, 63}, +{"f32_64x64", F32, (void*)LaunchTPRELU_f32_64x64, 64, 64, 64, 64}, +}; +static constexpr size_t kNumCases = sizeof(kCases) / sizeof(kCases[0]); + +template +using LaunchFn = void (*)(T *, T *, T *, void *); + +static int RunCase(const TestCase &tc, int deviceId, aclrtStream stream) { + int rc = 0; + const size_t elemCount = tc.rows * tc.cols; + const size_t elemSize = (tc.dtype == F16) ? sizeof(uint16_t) : sizeof(float); + size_t fileSize = elemCount * elemSize; + + std::printf("[INFO] === case: %s (shape=%zux%zu, valid=%zux%zu, dtype=%s) ===\n", + tc.name, tc.rows, tc.cols, tc.validRows, tc.validCols, + (tc.dtype == F16) ? 
"f16" : "f32"); + + std::string caseDir = std::string("./") + tc.name; + + if (tc.dtype == F16) { + uint16_t *src0Host = nullptr, *src1Host = nullptr, *dstHost = nullptr; + uint16_t *src0Device = nullptr, *src1Device = nullptr, *dstDevice = nullptr; + + aclrtMallocHost((void **)(&src0Host), fileSize); + aclrtMallocHost((void **)(&src1Host), fileSize); + aclrtMallocHost((void **)(&dstHost), fileSize); + + aclrtMalloc((void **)&src0Device, fileSize, ACL_MEM_MALLOC_HUGE_FIRST); + aclrtMalloc((void **)&src1Device, fileSize, ACL_MEM_MALLOC_HUGE_FIRST); + aclrtMalloc((void **)&dstDevice, fileSize, ACL_MEM_MALLOC_HUGE_FIRST); + + if (!ReadFile((caseDir + "/input0.bin").c_str(), fileSize, src0Host, fileSize)) { + std::fprintf(stderr, "[ERROR] failed to read %s/input0.bin\n", caseDir.c_str()); + rc = 1; + } + if (rc == 0 && !ReadFile((caseDir + "/input1.bin").c_str(), fileSize, src1Host, fileSize)) { + std::fprintf(stderr, "[ERROR] failed to read %s/input1.bin\n", caseDir.c_str()); + rc = 1; + } + + if (rc == 0) { + aclrtMemcpy(src0Device, fileSize, src0Host, fileSize, ACL_MEMCPY_HOST_TO_DEVICE); + aclrtMemcpy(src1Device, fileSize, src1Host, fileSize, ACL_MEMCPY_HOST_TO_DEVICE); + + LaunchFn launch = (LaunchFn)tc.launch; + launch(src0Device, src1Device, dstDevice, stream); + + aclrtSynchronizeStream(stream); + aclrtMemcpy(dstHost, fileSize, dstDevice, fileSize, ACL_MEMCPY_DEVICE_TO_HOST); + } + + if (rc == 0 && !WriteFile((caseDir + "/output.bin").c_str(), dstHost, fileSize)) { + std::fprintf(stderr, "[ERROR] failed to write %s/output.bin\n", caseDir.c_str()); + rc = 1; + } + + if (src0Device != nullptr) aclrtFree(src0Device); + if (src1Device != nullptr) aclrtFree(src1Device); + if (dstDevice != nullptr) aclrtFree(dstDevice); + if (src0Host != nullptr) aclrtFreeHost(src0Host); + if (src1Host != nullptr) aclrtFreeHost(src1Host); + if (dstHost != nullptr) aclrtFreeHost(dstHost); + } else { + float *src0Host = nullptr, *src1Host = nullptr, *dstHost = nullptr; + float 
*src0Device = nullptr, *src1Device = nullptr, *dstDevice = nullptr; + + aclrtMallocHost((void **)(&src0Host), fileSize); + aclrtMallocHost((void **)(&src1Host), fileSize); + aclrtMallocHost((void **)(&dstHost), fileSize); + + aclrtMalloc((void **)&src0Device, fileSize, ACL_MEM_MALLOC_HUGE_FIRST); + aclrtMalloc((void **)&src1Device, fileSize, ACL_MEM_MALLOC_HUGE_FIRST); + aclrtMalloc((void **)&dstDevice, fileSize, ACL_MEM_MALLOC_HUGE_FIRST); + + if (!ReadFile((caseDir + "/input0.bin").c_str(), fileSize, src0Host, fileSize)) { + std::fprintf(stderr, "[ERROR] failed to read %s/input0.bin\n", caseDir.c_str()); + rc = 1; + } + if (rc == 0 && !ReadFile((caseDir + "/input1.bin").c_str(), fileSize, src1Host, fileSize)) { + std::fprintf(stderr, "[ERROR] failed to read %s/input1.bin\n", caseDir.c_str()); + rc = 1; + } + + if (rc == 0) { + aclrtMemcpy(src0Device, fileSize, src0Host, fileSize, ACL_MEMCPY_HOST_TO_DEVICE); + aclrtMemcpy(src1Device, fileSize, src1Host, fileSize, ACL_MEMCPY_HOST_TO_DEVICE); + + LaunchFn launch = (LaunchFn)tc.launch; + launch(src0Device, src1Device, dstDevice, stream); + + aclrtSynchronizeStream(stream); + aclrtMemcpy(dstHost, fileSize, dstDevice, fileSize, ACL_MEMCPY_DEVICE_TO_HOST); + } + + if (rc == 0 && !WriteFile((caseDir + "/output.bin").c_str(), dstHost, fileSize)) { + std::fprintf(stderr, "[ERROR] failed to write %s/output.bin\n", caseDir.c_str()); + rc = 1; + } + + if (src0Device != nullptr) aclrtFree(src0Device); + if (src1Device != nullptr) aclrtFree(src1Device); + if (dstDevice != nullptr) aclrtFree(dstDevice); + if (src0Host != nullptr) aclrtFreeHost(src0Host); + if (src1Host != nullptr) aclrtFreeHost(src1Host); + if (dstHost != nullptr) aclrtFreeHost(dstHost); + } + + if (rc == 0) + std::printf("[INFO] case %s done\n", tc.name); + return rc; +} + +int main(int argc, char *argv[]) { + const char *caseFilter = (argc > 1) ? 
argv[1] : nullptr; + + int rc = 0; + int deviceId = 0; + aclrtStream stream = nullptr; + + aclInit(nullptr); + if (const char *envDevice = std::getenv("ACL_DEVICE_ID")) { + deviceId = std::atoi(envDevice); + } + aclrtSetDevice(deviceId); + aclrtCreateStream(&stream); + + for (size_t i = 0; i < kNumCases; ++i) { + if (caseFilter != nullptr && std::strcmp(kCases[i].name, caseFilter) != 0) { + continue; + } + int ret = RunCase(kCases[i], deviceId, stream); + if (ret != 0) { + std::fprintf(stderr, "[ERROR] case %s failed\n", kCases[i].name); + rc = 1; + break; + } + } + + if (stream != nullptr) + aclrtDestroyStream(stream); + aclrtResetDevice(deviceId); + aclFinalize(); + + return rc; +} diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tprelu/tprelu.pto b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tprelu/tprelu.pto new file mode 100644 index 000000000..f5b262c6a --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tprelu/tprelu.pto @@ -0,0 +1,143 @@ +// Copyright (c) 2026 Huawei Technologies Co., Ltd. +// This program is free software, you can redistribute it and/or modify it under the terms and conditions of +// CANN Open Software License Agreement Version 2.0 (the "License"). +// Please refer to the License for details. You may not use this file except in compliance with the License. +// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +// See LICENSE in the root of the software repository for the full text of the License. + +// TileLang ST kernels for pto.tprelu: tload(src0) + tload(src1) + tprelu(src0,src1,tmp)->dst + tstore(dst). +// Multiple cases with different shapes in a single module. +// Compiled by ptoas --enable-insert-sync --enable-tile-op-expand --pto-backend=vpto +// to produce a fatobj object. 
+ +module attributes {pto.target_arch = "a5", pto.kernel_kind = #pto.kernel_kind} { + // Case 0: f16 63x63 partition of a 64x64 tensor view (partial valid_shape) + func.func @TPRELU_f16_63x63(%src0_ptr: !pto.ptr, %src1_ptr: !pto.ptr, %dst_ptr: !pto.ptr) attributes {pto.kernel} { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c63 = arith.constant 63 : index + %c64 = arith.constant 64 : index + %c4096 = arith.constant 4096 : index + + %src0_view = pto.make_tensor_view %src0_ptr, + shape = [%c1, %c1, %c1, %c64, %c64], + strides = [%c4096, %c4096, %c4096, %c64, %c1] + : !pto.tensor_view<1x1x1x64x64xf16> + %src1_view = pto.make_tensor_view %src1_ptr, + shape = [%c1, %c1, %c1, %c64, %c64], + strides = [%c4096, %c4096, %c4096, %c64, %c1] + : !pto.tensor_view<1x1x1x64x64xf16> + %dst_view = pto.make_tensor_view %dst_ptr, + shape = [%c1, %c1, %c1, %c64, %c64], + strides = [%c4096, %c4096, %c4096, %c64, %c1] + : !pto.tensor_view<1x1x1x64x64xf16> + + %src0_part = pto.partition_view %src0_view, + offsets = [%c0, %c0, %c0, %c0, %c0], + sizes = [%c1, %c1, %c1, %c63, %c63] + : !pto.tensor_view<1x1x1x64x64xf16> -> !pto.partition_tensor_view<1x1x1x63x63xf16> + %src1_part = pto.partition_view %src1_view, + offsets = [%c0, %c0, %c0, %c0, %c0], + sizes = [%c1, %c1, %c1, %c63, %c63] + : !pto.tensor_view<1x1x1x64x64xf16> -> !pto.partition_tensor_view<1x1x1x63x63xf16> + %dst_part = pto.partition_view %dst_view, + offsets = [%c0, %c0, %c0, %c0, %c0], + sizes = [%c1, %c1, %c1, %c63, %c63] + : !pto.tensor_view<1x1x1x64x64xf16> -> !pto.partition_tensor_view<1x1x1x63x63xf16> + + %src0_tile = pto.alloc_tile + : !pto.tile_buf + %src1_tile = pto.alloc_tile + : !pto.tile_buf + %tmp_tile = pto.alloc_tile + : !pto.tile_buf + %dst_tile = pto.alloc_tile + : !pto.tile_buf + + pto.tload ins(%src0_part : !pto.partition_tensor_view<1x1x1x63x63xf16>) + outs(%src0_tile : !pto.tile_buf) + pto.tload ins(%src1_part : !pto.partition_tensor_view<1x1x1x63x63xf16>) + outs(%src1_tile : !pto.tile_buf) + + pto.tprelu ins(%src0_tile, 
%src1_tile, %tmp_tile : !pto.tile_buf, + !pto.tile_buf, + !pto.tile_buf) + outs(%dst_tile : !pto.tile_buf) + + pto.tstore ins(%dst_tile : !pto.tile_buf) + outs(%dst_part : !pto.partition_tensor_view<1x1x1x63x63xf16>) + return + } + + // Case 1: f32 64x64 + + func.func @TPRELU_f32_64x64(%src0_ptr: !pto.ptr, %src1_ptr: !pto.ptr, %dst_ptr: !pto.ptr) attributes {pto.kernel} { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c64 = arith.constant 64 : index + %c4096 = arith.constant 4096 : index + + %src0_view = pto.make_tensor_view %src0_ptr, + shape = [%c1, %c1, %c1, %c64, %c64], + strides = [%c4096, %c4096, %c4096, %c64, %c1] + : !pto.tensor_view<1x1x1x64x64xf32> + %src1_view = pto.make_tensor_view %src1_ptr, + shape = [%c1, %c1, %c1, %c64, %c64], + strides = [%c4096, %c4096, %c4096, %c64, %c1] + : !pto.tensor_view<1x1x1x64x64xf32> + %dst_view = pto.make_tensor_view %dst_ptr, + shape = [%c1, %c1, %c1, %c64, %c64], + strides = [%c4096, %c4096, %c4096, %c64, %c1] + : !pto.tensor_view<1x1x1x64x64xf32> + + %src0_part = pto.partition_view %src0_view, + offsets = [%c0, %c0, %c0, %c0, %c0], + sizes = [%c1, %c1, %c1, %c64, %c64] + : !pto.tensor_view<1x1x1x64x64xf32> -> !pto.partition_tensor_view<1x1x1x64x64xf32> + %src1_part = pto.partition_view %src1_view, + offsets = [%c0, %c0, %c0, %c0, %c0], + sizes = [%c1, %c1, %c1, %c64, %c64] + : !pto.tensor_view<1x1x1x64x64xf32> -> !pto.partition_tensor_view<1x1x1x64x64xf32> + %dst_part = pto.partition_view %dst_view, + offsets = [%c0, %c0, %c0, %c0, %c0], + sizes = [%c1, %c1, %c1, %c64, %c64] + : !pto.tensor_view<1x1x1x64x64xf32> -> !pto.partition_tensor_view<1x1x1x64x64xf32> + + %src0_tile = pto.alloc_tile + : !pto.tile_buf + %src1_tile = pto.alloc_tile + : !pto.tile_buf + %tmp_tile = pto.alloc_tile + : !pto.tile_buf + %dst_tile = pto.alloc_tile + : !pto.tile_buf + + pto.tload ins(%src0_part : !pto.partition_tensor_view<1x1x1x64x64xf32>) + outs(%src0_tile : !pto.tile_buf) + pto.tload ins(%src1_part : 
!pto.partition_tensor_view<1x1x1x64x64xf32>) + outs(%src1_tile : !pto.tile_buf) + + pto.tprelu ins(%src0_tile, %src1_tile, %tmp_tile : !pto.tile_buf, + !pto.tile_buf, + !pto.tile_buf) + outs(%dst_tile : !pto.tile_buf) + + pto.tstore ins(%dst_tile : !pto.tile_buf) + outs(%dst_part : !pto.partition_tensor_view<1x1x1x64x64xf32>) + return + } + + // Case 5: f32 63x63 (partial valid_shape) +} diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/trandom/CMakeLists.txt b/test/tilelang_st/npu/a5/src/st/smoke/testcase/trandom/CMakeLists.txt new file mode 100644 index 000000000..130b20fce --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/trandom/CMakeLists.txt @@ -0,0 +1,9 @@ +# Copyright (c) 2026 Huawei Technologies Co., Ltd. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. + +pto_tilelang_vec_st(trandom) diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/trandom/cases.py b/test/tilelang_st/npu/a5/src/st/smoke/testcase/trandom/cases.py new file mode 100644 index 000000000..79e320e92 --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/trandom/cases.py @@ -0,0 +1,51 @@ +#!/usr/bin/python3 +# Copyright (c) 2026 Huawei Technologies Co., Ltd. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. 
You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. + +# coding=utf-8 + +"""Single source of truth for trandom ST test cases. + +Each case defines: + - name: case identifier, used as subdirectory name and by main.cpp kCases[]. + - dtype: numpy dtype (np.int32 or np.uint32). + - shape: (rows, cols) — allocated tile dimensions. + - valid_shape: (valid_rows, valid_cols) — effective computation region. + - rounds: Philox rounds (7 or 10). + - eps: tolerance for comparison (0 for exact match). + +gen_data.py and compare.py both import this list to avoid redundant definitions. +""" + +import numpy as np + +CASES = [ + { + "name": "int32_1x256", + "dtype": np.int32, + "shape": (1, 256), + "valid_shape": (1, 256), + "rounds": 10, + "eps": 0, + }, + { + "name": "int32_4x256", + "dtype": np.int32, + "shape": (4, 256), + "valid_shape": (4, 256), + "rounds": 10, + "eps": 0, + }, +] + +_SMOKE_CASE_NAMES = ['int32_1x256', 'int32_4x256'] +_SMOKE_CASE_NAME_SET = set(_SMOKE_CASE_NAMES) +_missing = [name for name in _SMOKE_CASE_NAMES if name not in {case["name"] for case in CASES}] +if _missing: + raise RuntimeError("unknown smoke case(s): " + ", ".join(_missing)) +CASES = [case for case in CASES if case["name"] in _SMOKE_CASE_NAME_SET] diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/trandom/compare.py b/test/tilelang_st/npu/a5/src/st/smoke/testcase/trandom/compare.py new file mode 100644 index 000000000..155c4d1e8 --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/trandom/compare.py @@ -0,0 +1,82 @@ +#!/usr/bin/python3 +# Copyright (c) 2026 Huawei Technologies Co., Ltd. 
# coding=utf-8

"""Compare golden reference with NPU output for trandom test cases."""

import os
import sys
import numpy as np

from cases import CASES
from st_common import result_cmp, style_fail, style_pass, validate_cases
from gen_data import trandom_generate


def _load_valid_region(path, dtype, shape, valid_shape):
    """Return the valid region stored in *path*, or None if its size is unusable.

    The file may hold either the full ``shape`` tile or only the
    ``valid_shape`` region (the golden regenerated in ``main`` is written at
    valid-shape size), so both element counts are accepted.
    """
    rows, cols = shape
    vr, vc = valid_shape
    data = np.fromfile(path, dtype=dtype)
    if data.size == rows * cols:
        return data.reshape(shape)[:vr, :vc]
    if data.size == vr * vc:
        return data.reshape(valid_shape)
    return None


def main():
    """Compare golden.bin with output.bin for every (optionally filtered) case.

    ``sys.argv[1]``, when present, restricts the run to the case of that name.
    A missing golden.bin is regenerated from key.bin/counter.bin when those and
    output.bin exist. Exits with status 2 if any case fails.
    """
    validate_cases(CASES)
    case_filter = sys.argv[1] if len(sys.argv) > 1 else None

    all_passed = True
    for case in CASES:
        if case_filter is not None and case["name"] != case_filter:
            continue

        case_dir = case["name"]
        shape = case["shape"]
        valid_shape = case["valid_shape"]
        vr, vc = valid_shape
        dtype = case["dtype"]
        eps = case["eps"]

        golden_file = os.path.join(case_dir, "golden.bin")
        output_file = os.path.join(case_dir, "output.bin")
        key_file = os.path.join(case_dir, "key.bin")
        counter_file = os.path.join(case_dir, "counter.bin")

        if not os.path.exists(golden_file):
            if os.path.exists(key_file) and os.path.exists(counter_file) and os.path.exists(output_file):
                key = np.fromfile(key_file, dtype=dtype)
                counter = np.fromfile(counter_file, dtype=dtype)
                rounds = case.get("rounds", 10)
                golden = trandom_generate(key.view(np.uint32), counter.view(np.uint32),
                                          vr, vc, dtype=dtype, rounds=rounds)
                golden.astype(dtype).tofile(golden_file)
                print(f"[INFO] {case['name']}: generated golden.bin")
            else:
                print(style_fail(f"[ERROR] {case['name']}: golden.bin not found and cannot generate"))
                all_passed = False
                continue

        if not os.path.exists(output_file):
            print(style_fail(f"[ERROR] {case['name']}: output.bin not found"))
            all_passed = False
            continue

        golden = _load_valid_region(golden_file, dtype, shape, valid_shape)
        output = _load_valid_region(output_file, dtype, shape, valid_shape)
        if golden is None or output is None:
            # Report a size mismatch as a case failure instead of letting
            # reshape() raise and abort the remaining cases.
            bad = "golden.bin" if golden is None else "output.bin"
            print(style_fail(f"[ERROR] {case['name']}: {bad} has unexpected size"))
            all_passed = False
            continue

        ok = result_cmp(golden, output, eps)
        if ok:
            # The unique count is informational only; it does not affect the verdict.
            unique_count = len(np.unique(output))
            total_count = vr * vc
            print(style_pass(f"[INFO] {case['name']}: compare passed "
                             f"(unique={unique_count}/{total_count})"))
        else:
            print(style_fail(f"[ERROR] {case['name']}: compare failed"))
            all_passed = False

    if not all_passed:
        sys.exit(2)
    print(style_pass("[INFO] all cases passed"))


if __name__ == "__main__":
    main()
"""Debug script to trace trandom computation step by step."""

import numpy as np

TRANDOM_CONST_0 = 0xD2511F53
TRANDOM_CONST_1 = 0xCD9E8D57
TRANDOM_CONST_KEY_ADD_0 = 0x9E3779B9
TRANDOM_CONST_KEY_ADD_1 = 0xBB67AE85


def add_with_128bits_debug(ctr0, ctr1, ctr2, ctr3, value):
    """Simulate 128-bit addition with carry propagation.

    The four uint32 arrays are the words of a per-lane 128-bit counter, ctr0
    being the least significant. ``value`` is added to ctr0 and the carry is
    rippled through ctr1..ctr3; a carry out of ctr3 is dropped.

    Returns:
        The four updated words as uint32 arrays.
    """
    # Widen to uint64 so a carry shows up as bit 32 instead of wrapping away.
    ctr0_new = ctr0.astype(np.uint64) + value.astype(np.uint64)
    carry0 = (ctr0_new > 0xFFFFFFFF).astype(np.uint32)
    ctr0_new = ctr0_new.astype(np.uint32)

    ctr1_new = ctr1.astype(np.uint64) + carry0.astype(np.uint64)
    carry1 = (ctr1_new > 0xFFFFFFFF).astype(np.uint32)
    ctr1_new = ctr1_new.astype(np.uint32)

    ctr2_new = ctr2.astype(np.uint64) + carry1.astype(np.uint64)
    carry2 = (ctr2_new > 0xFFFFFFFF).astype(np.uint32)
    ctr2_new = ctr2_new.astype(np.uint32)

    ctr3_new = ctr3.astype(np.uint64) + carry2.astype(np.uint64)
    ctr3_new = ctr3_new.astype(np.uint32)

    return ctr0_new, ctr1_new, ctr2_new, ctr3_new


def trandom_kernel_debug(ctr0, ctr1, ctr2, ctr3, key0_val, key1_val, rounds=10):
    """Philox kernel with detailed logging.

    Runs ``rounds`` rounds over the four uint32 counter arrays and prints
    lane 0 of every intermediate value. The input arrays are not modified.

    Returns:
        The four counter words after the last round.
    """
    lanes = len(ctr0)
    key0 = np.full(lanes, np.uint32(key0_val), dtype=np.uint32)
    key1 = np.full(lanes, np.uint32(key1_val), dtype=np.uint32)

    print(f"Initial counters: ctr0[0:5]={ctr0[0:5]}, ctr1[0:5]={ctr1[0:5]}")
    print(f"Initial keys: key0={key0[0]}, key1={key1[0]}")

    for round_idx in range(rounds):
        print(f"\n=== Round {round_idx} ===")
        print(f"Before: ctr0[0]={ctr0[0]}, ctr1[0]={ctr1[0]}, ctr2[0]={ctr2[0]}, ctr3[0]={ctr3[0]}")
        print(f"Before: key0={key0[0]}, key1={key1[0]}")

        # 32x32 -> 64-bit products, split into low (L) and high (H) words.
        prod0 = ctr0.astype(np.uint64) * np.uint64(TRANDOM_CONST_0)
        prod1 = ctr2.astype(np.uint64) * np.uint64(TRANDOM_CONST_1)

        L0 = prod0.astype(np.uint32)
        H0 = (prod0 >> 32).astype(np.uint32)
        L1 = prod1.astype(np.uint32)
        H1 = (prod1 >> 32).astype(np.uint32)

        print(f"prod0[0]={prod0[0]}, L0[0]={L0[0]}, H0[0]={H0[0]}")
        print(f"prod1[0]={prod1[0]}, L1[0]={L1[0]}, H1[0]={H1[0]}")

        # ctr1/ctr3 and the keys still hold this round's inputs here; they are
        # updated only after the traces below have printed them.
        ctr0 = (H1 ^ ctr1) ^ key0
        ctr2 = (H0 ^ ctr3) ^ key1

        print(f"ctr0[0] = (H1[0] ^ ctr1[0]) ^ key0[0] = ({H1[0]} ^ {ctr1[0]}) ^ {key0[0]} = {ctr0[0]}")
        print(f"ctr2[0] = (H0[0] ^ ctr3[0]) ^ key1[0] = ({H0[0]} ^ {ctr3[0]}) ^ {key1[0]} = {ctr2[0]}")

        key0 = (key0.astype(np.uint32) + np.uint32(TRANDOM_CONST_KEY_ADD_0)) & np.uint32(0xFFFFFFFF)
        key1 = (key1.astype(np.uint32) + np.uint32(TRANDOM_CONST_KEY_ADD_1)) & np.uint32(0xFFFFFFFF)

        print(f"key0={key0[0]}, key1={key1[0]} (after update)")

        ctr1 = L1
        ctr3 = L0

        print(f"After: ctr0[0]={ctr0[0]}, ctr1[0]={ctr1[0]}, ctr2[0]={ctr2[0]}, ctr3[0]={ctr3[0]}")

    return ctr0, ctr1, ctr2, ctr3


def main():
    """Trace one 64-lane, 10-round run for a fixed key/counter pair."""
    key = np.array([-792737938, 2139558336], dtype=np.int32)
    counter = np.array([-1759534764, -1881674653, 640338625, 1381573024], dtype=np.int32)

    # Reinterpret the signed values as raw 32-bit patterns.
    key_uint = key.view(np.uint32)
    counter_uint = counter.view(np.uint32)

    lanes = 64
    ctr0 = np.full(lanes, counter_uint[0], dtype=np.uint32)
    ctr1 = np.full(lanes, counter_uint[1], dtype=np.uint32)
    ctr2 = np.full(lanes, counter_uint[2], dtype=np.uint32)
    ctr3 = np.full(lanes, counter_uint[3], dtype=np.uint32)

    print("=== Initial counter values ===")
    print(f"ctr0[0]={ctr0[0]}, ctr1[0]={ctr1[0]}, ctr2[0]={ctr2[0]}, ctr3[0]={ctr3[0]}")

    # Lane i works on counter + i.
    inc_idx = np.arange(lanes, dtype=np.uint32)
    ctr0, ctr1, ctr2, ctr3 = add_with_128bits_debug(ctr0, ctr1, ctr2, ctr3, inc_idx)

    print("\n=== After adding index ===")
    print(f"ctr0[0:5]={ctr0[0:5]}")
    print(f"ctr1[0:5]={ctr1[0:5]}")
    print(f"ctr2[0:5]={ctr2[0:5]}")
    print(f"ctr3[0:5]={ctr3[0:5]}")

    result = trandom_kernel_debug(ctr0.copy(), ctr1.copy(), ctr2.copy(), ctr3.copy(),
                                  key_uint[0], key_uint[1], rounds=10)

    print("\n=== Final result ===")
    print(f"ctr0[0:5]={result[0][0:5]}")
    print(f"ctr1[0:5]={result[1][0:5]}")
    print(f"ctr2[0:5]={result[2][0:5]}")
    print(f"ctr3[0:5]={result[3][0:5]}")


# Guarded so that importing the helpers does not run (and print) the whole trace.
if __name__ == "__main__":
    main()
# coding=utf-8

"""Generate input data and golden output for trandom test cases.

Implements the Philox-based TRandom algorithm in pure Python/NumPy
to generate reference golden data for comparison with NPU output.

Flow:
  - First run (no output.bin): generate key/counter inputs only
  - Second run (with output.bin): read saved key/counter, compute golden
"""

import os
import numpy as np
from cases import CASES
from st_common import validate_cases, setup_case_rng, save_case_data

TRANDOM_ONCE_REPEAT = 4
TRANDOM_CONST_0 = 0xD2511F53
TRANDOM_CONST_1 = 0xCD9E8D57
TRANDOM_CONST_KEY_ADD_0 = 0x9E3779B9
TRANDOM_CONST_KEY_ADD_1 = 0xBB67AE85


def add_with_128bits(ctr0, ctr1, ctr2, ctr3, value):
    """Simulate 128-bit addition with carry propagation.

    ctr0 is the least-significant uint32 word of each lane's counter. ``value``
    is added to ctr0 and the carry ripples through ctr1..ctr3; a carry out of
    ctr3 is dropped.

    Returns:
        The four updated words as uint32 arrays.
    """
    # Widen to uint64 so a carry shows up as bit 32 instead of wrapping away.
    ctr0_new = ctr0.astype(np.uint64) + value.astype(np.uint64)
    carry0 = (ctr0_new > 0xFFFFFFFF).astype(np.uint32)
    ctr0_new = ctr0_new.astype(np.uint32)

    ctr1_new = ctr1.astype(np.uint64) + carry0.astype(np.uint64)
    carry1 = (ctr1_new > 0xFFFFFFFF).astype(np.uint32)
    ctr1_new = ctr1_new.astype(np.uint32)

    ctr2_new = ctr2.astype(np.uint64) + carry1.astype(np.uint64)
    carry2 = (ctr2_new > 0xFFFFFFFF).astype(np.uint32)
    ctr2_new = ctr2_new.astype(np.uint32)

    ctr3_new = ctr3.astype(np.uint64) + carry2.astype(np.uint64)
    ctr3_new = ctr3_new.astype(np.uint32)

    return ctr0_new, ctr1_new, ctr2_new, ctr3_new


def trandom_kernel(ctr0, ctr1, ctr2, ctr3, key0_val, key1_val, rounds=10):
    """Philox-based random number generation kernel.

    Uses unsigned multiply to match C++ TRandomKernel (RegTensor, vmull.v64u32).

    The input arrays are not modified; each round rebinds the local names.

    Returns:
        The four uint32 counter words after ``rounds`` rounds.
    """
    key0 = np.full(len(ctr0), key0_val, dtype=np.uint32)
    key1 = np.full(len(ctr0), key1_val, dtype=np.uint32)

    for _ in range(rounds):
        # 32x32 -> 64-bit products, split into low (L) and high (H) words.
        prod0 = ctr0.astype(np.uint64) * np.uint64(TRANDOM_CONST_0)
        prod1 = ctr2.astype(np.uint64) * np.uint64(TRANDOM_CONST_1)

        L0 = prod0.astype(np.uint32)
        H0 = (prod0 >> 32).astype(np.uint32)
        L1 = prod1.astype(np.uint32)
        H1 = (prod1 >> 32).astype(np.uint32)

        # ctr1/ctr3 must be read before they are overwritten below.
        ctr0 = (H1 ^ ctr1) ^ key0
        ctr2 = (H0 ^ ctr3) ^ key1
        ctr1 = L1
        ctr3 = L0

        key0 = (key0 + TRANDOM_CONST_KEY_ADD_0) & np.uint32(0xFFFFFFFF)
        key1 = (key1 + TRANDOM_CONST_KEY_ADD_1) & np.uint32(0xFFFFFFFF)

    return ctr0, ctr1, ctr2, ctr3


def _vintlv(src0, src1):
    """Interleave two equal-length arrays into (low, high) uint32 arrays.

    low[2*i] = src0[i], low[2*i+1] = src1[i] and high[2*i] = src0[i+half],
    high[2*i+1] = src1[i+half] for i in 0..half-1, with half = len(src0) // 2.
    """
    n = len(src0)
    half = n // 2
    paired = 2 * half  # equals n for even n; a trailing odd element stays unset
    low = np.empty(n, dtype=np.uint32)
    high = np.empty(n, dtype=np.uint32)
    low[0:paired:2] = src0[:half]
    low[1:paired:2] = src1[:half]
    high[0:paired:2] = src0[half:paired]
    high[1:paired:2] = src1[half:paired]
    return low, high


def interleave_values(ctr0, ctr1, ctr2, ctr3):
    """Simulate vintlv: interleave values to reorder random numbers.

    vintlv semantics (N=64, half=32):
      - low[2*i] = src0[i], low[2*i+1] = src1[i] for i in 0..31 (interleave first half)
      - high[2*i] = src0[i+32], high[2*i+1] = src1[i+32] for i in 0..31 (interleave second half)

    TRandom uses:
      1. vintlv(tmpL0, tmpH0, ctr0, ctr2)
      2. vintlv(tmpL1, tmpH1, ctr1, ctr3)
      3. vintlv(ctr0, ctr1, tmpL0, tmpL1)
      4. vintlv(ctr2, ctr3, tmpH0, tmpH1)

    Returns:
        Four uint32 arrays whose concatenation reads
        ctr0[0], ctr1[0], ctr2[0], ctr3[0], ctr0[1], ...
    """
    tmp_l0, tmp_h0 = _vintlv(ctr0, ctr2)
    tmp_l1, tmp_h1 = _vintlv(ctr1, ctr3)
    result0, result1 = _vintlv(tmp_l0, tmp_l1)
    result2, result3 = _vintlv(tmp_h0, tmp_h1)
    return result0, result1, result2, result3


def trandom_generate(key, counter, valid_rows, valid_cols, dtype=np.int32, rounds=10):
    """Generate random numbers using TRandom algorithm.

    Args:
        key: 2-element array (key0, key1) - scalar values, broadcast to all lanes
        counter: 4-element array (counter0-3) - 128-bit counter base value
        valid_rows: number of rows to generate
        valid_cols: number of columns to generate
        dtype: output dtype (int32 or uint32)
        rounds: number of Philox rounds (7 or 10)

    Returns:
        output: (valid_rows, valid_cols) array of random numbers
    """
    lanes = 64
    cols_per_pass = TRANDOM_ONCE_REPEAT * lanes
    n_loop = (valid_cols + cols_per_pass - 1) // cols_per_pass

    output = np.zeros((valid_rows, valid_cols), dtype=np.uint32)

    key0_val = np.uint32(key[0])
    key1_val = np.uint32(key[1])

    ctr0 = np.full(lanes, np.uint32(counter[0]), dtype=np.uint32)
    ctr1 = np.full(lanes, np.uint32(counter[1]), dtype=np.uint32)
    ctr2 = np.full(lanes, np.uint32(counter[2]), dtype=np.uint32)
    ctr3 = np.full(lanes, np.uint32(counter[3]), dtype=np.uint32)

    # Lane i starts from counter + i.
    inc_idx = np.arange(lanes, dtype=np.uint32)
    ctr0, ctr1, ctr2, ctr3 = add_with_128bits(ctr0, ctr1, ctr2, ctr3, inc_idx)

    # Counter advance after the final pass of a row. Presumably this mirrors the
    # device kernel's tail handling -- TODO confirm against trandom_template.py.
    last_stride = (valid_cols - 1) % lanes + 1

    # The counters are not reset between rows: each row continues where the
    # previous one stopped.
    for i in range(valid_rows):
        for j in range(n_loop):
            words = trandom_kernel(
                ctr0.copy(), ctr1.copy(), ctr2.copy(), ctr3.copy(),
                key0_val, key1_val, rounds=rounds
            )

            # Apply interleave to match vintlv semantics in trandom_template.py
            # This produces element-wise interleaved order: [ctr0[0], ctr1[0], ctr2[0], ctr3[0], ...]
            words = interleave_values(*words)

            for k in range(TRANDOM_ONCE_REPEAT):
                start_col = TRANDOM_ONCE_REPEAT * j * lanes + k * lanes
                end_col = min(start_col + lanes, valid_cols)
                num_valid = end_col - start_col

                # Chunks that fall entirely past valid_cols are skipped.
                if num_valid > 0:
                    output[i, start_col:end_col] = words[k][:num_valid]

            counter_add_val = lanes if j != n_loop - 1 else last_stride
            v_ele_stride = np.full(lanes, np.uint32(counter_add_val), dtype=np.uint32)
            ctr0, ctr1, ctr2, ctr3 = add_with_128bits(ctr0, ctr1, ctr2, ctr3, v_ele_stride)

    return output.view(dtype)


# NOTE(review): this driver runs at import time, so `from gen_data import ...`
# in compare.py also executes it. Left unguarded because the two-run flow in the
# module docstring may rely on that -- confirm before adding a __main__ guard.
validate_cases(CASES)

for case in CASES:
    case_dir = case["name"]
    key_file = os.path.join(case_dir, "key.bin")
    counter_file = os.path.join(case_dir, "counter.bin")
    output_file = os.path.join(case_dir, "output.bin")

    dtype = case["dtype"]
    valid_rows, valid_cols = case["valid_shape"]
    rounds = case.get("rounds", 10)

    if os.path.exists(key_file) and os.path.exists(counter_file):
        # Reuse the inputs the device already ran with, so golden matches them.
        key = np.fromfile(key_file, dtype=dtype)
        counter = np.fromfile(counter_file, dtype=dtype)
        print(f"[INFO] gen_data: {case['name']} loaded existing key/counter")
    else:
        setup_case_rng(case)
        value_max = np.iinfo(dtype).max
        value_min = np.iinfo(dtype).min
        key = np.random.randint(value_min, value_max + 1, size=2, dtype=dtype)
        counter = np.random.randint(value_min, value_max + 1, size=4, dtype=dtype)
        print(f"[INFO] gen_data: {case['name']} generated new key={key.tolist()} counter={counter.tolist()}")

    if os.path.exists(output_file):
        golden = trandom_generate(key.view(np.uint32), counter.view(np.uint32),
                                  valid_rows, valid_cols, dtype=dtype, rounds=rounds)
        save_case_data(case["name"], {"key": key, "counter": counter, "golden": golden})
        print(f"[INFO] gen_data: {case['name']} generated golden shape={case['shape']}")
    else:
        save_case_data(case["name"], {"key": key, "counter": counter})
        print(f"[INFO] gen_data: {case['name']} saved inputs (waiting for output)")
// NOTE(review): the header name after `#include` is not visible in this view;
// confirm it against the original file.
#include

// Use the [aicore] attribute unless the build has already defined AICORE.
#ifndef AICORE
#define AICORE [aicore]
#endif

// Case smoke: ui32 1x256

// Device kernel entry points, defined outside this file
// (presumably generated from trandom.pto -- TODO confirm).
extern "C" __global__ AICORE void TRANDOM_int32_1x256(__gm__ uint32_t *key, __gm__ uint32_t *counter, __gm__ uint32_t *output);
extern "C" __global__ AICORE void TRANDOM_int32_4x256(__gm__ uint32_t *key, __gm__ uint32_t *counter, __gm__ uint32_t *output);

// Host-side wrapper: launches the 1x256 kernel on `stream` with a launch
// dimension of 1, casting the three pointers to __gm__ pointers.
void LaunchTRANDOM_int32_1x256(uint32_t *key, uint32_t *counter, uint32_t *output, void *stream) {
    TRANDOM_int32_1x256<<<1, nullptr, stream>>>((__gm__ uint32_t *)key, (__gm__ uint32_t *)counter, (__gm__ uint32_t *)output);
}



// Host-side wrapper: same as above for the 4x256 kernel.
void LaunchTRANDOM_int32_4x256(uint32_t *key, uint32_t *counter, uint32_t *output, void *stream) {
    TRANDOM_int32_4x256<<<1, nullptr, stream>>>((__gm__ uint32_t *)key, (__gm__ uint32_t *)counter, (__gm__ uint32_t *)output);
}
+ +#include "acl/acl.h" +#include "test_common.h" +#include +#include +#include +#include +#include +#include + +using namespace PtoTestCommon; + +// Kernel launch wrappers (defined in launch.cpp) +void LaunchTRANDOM_int32_1x256(uint32_t *key, uint32_t *counter, uint32_t *output, void *stream); +void LaunchTRANDOM_int32_4x256(uint32_t *key, uint32_t *counter, uint32_t *output, void *stream); + +struct TestCase { + const char *name; + void (*launch)(uint32_t *, uint32_t *, uint32_t *, void *); + size_t rows; + size_t cols; +}; + +static const TestCase kCases[] = { +{"int32_1x256", LaunchTRANDOM_int32_1x256, 1, 256}, +{"int32_4x256", LaunchTRANDOM_int32_4x256, 4, 256}, +}; +static constexpr size_t kNumCases = sizeof(kCases) / sizeof(kCases[0]); + +static int RunCase(const TestCase &tc, int deviceId, aclrtStream stream) { + int rc = 0; + const size_t elemCount = tc.rows * tc.cols; + const size_t outputSize = elemCount * sizeof(uint32_t); + size_t keySize = 2 * sizeof(uint32_t); + size_t counterSize = 4 * sizeof(uint32_t); + + std::printf("[INFO] === case: %s (shape=%zux%zu) ===\n", + tc.name, tc.rows, tc.cols); + + std::string caseDir = std::string("./") + tc.name; + + void *keyHost = nullptr, *counterHost = nullptr, *outputHost = nullptr; + void *keyDevice = nullptr, *counterDevice = nullptr, *outputDevice = nullptr; + + aclrtMallocHost(&keyHost, keySize); + aclrtMallocHost(&counterHost, counterSize); + aclrtMallocHost(&outputHost, outputSize); + + aclrtMalloc(&keyDevice, keySize, ACL_MEM_MALLOC_HUGE_FIRST); + aclrtMalloc(&counterDevice, counterSize, ACL_MEM_MALLOC_HUGE_FIRST); + aclrtMalloc(&outputDevice, outputSize, ACL_MEM_MALLOC_HUGE_FIRST); + + if (!ReadFile((caseDir + "/key.bin").c_str(), keySize, keyHost, keySize)) { + std::fprintf(stderr, "[ERROR] failed to read %s/key.bin\n", caseDir.c_str()); + rc = 1; + } + + if (!ReadFile((caseDir + "/counter.bin").c_str(), counterSize, counterHost, counterSize)) { + std::fprintf(stderr, "[ERROR] failed to read 
%s/counter.bin\n", caseDir.c_str()); + rc = 1; + } + + if (rc == 0) { + aclrtMemcpy(keyDevice, keySize, keyHost, keySize, ACL_MEMCPY_HOST_TO_DEVICE); + aclrtMemcpy(counterDevice, counterSize, counterHost, counterSize, ACL_MEMCPY_HOST_TO_DEVICE); + + tc.launch((uint32_t *)keyDevice, (uint32_t *)counterDevice, (uint32_t *)outputDevice, stream); + + aclrtSynchronizeStream(stream); + aclrtMemcpy(outputHost, outputSize, outputDevice, outputSize, ACL_MEMCPY_DEVICE_TO_HOST); + } + + if (rc == 0 && !WriteFile((caseDir + "/output.bin").c_str(), outputHost, outputSize)) { + std::fprintf(stderr, "[ERROR] failed to write %s/output.bin\n", caseDir.c_str()); + rc = 1; + } + + if (keyDevice != nullptr) + aclrtFree(keyDevice); + if (counterDevice != nullptr) + aclrtFree(counterDevice); + if (outputDevice != nullptr) + aclrtFree(outputDevice); + if (keyHost != nullptr) + aclrtFreeHost(keyHost); + if (counterHost != nullptr) + aclrtFreeHost(counterHost); + if (outputHost != nullptr) + aclrtFreeHost(outputHost); + + if (rc == 0) + std::printf("[INFO] case %s done\n", tc.name); + return rc; +} + +int main(int argc, char *argv[]) { + const char *caseFilter = (argc > 1) ? 
argv[1] : nullptr; + + int rc = 0; + int deviceId = 0; + aclrtStream stream = nullptr; + + aclInit(nullptr); + if (const char *envDevice = std::getenv("ACL_DEVICE_ID")) { + deviceId = std::atoi(envDevice); + } + aclrtSetDevice(deviceId); + aclrtCreateStream(&stream); + + for (size_t i = 0; i < kNumCases; ++i) { + if (caseFilter != nullptr && std::strcmp(kCases[i].name, caseFilter) != 0) { + continue; + } + int ret = RunCase(kCases[i], deviceId, stream); + if (ret != 0) { + std::fprintf(stderr, "[ERROR] case %s failed\n", kCases[i].name); + rc = 1; + break; + } + } + + if (stream != nullptr) + aclrtDestroyStream(stream); + aclrtResetDevice(deviceId); + aclFinalize(); + + return rc; +} diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/trandom/trandom.pto b/test/tilelang_st/npu/a5/src/st/smoke/testcase/trandom/trandom.pto new file mode 100644 index 000000000..45cff0c6b --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/trandom/trandom.pto @@ -0,0 +1,106 @@ +// Copyright (c) 2026 Huawei Technologies Co., Ltd. +// This program is free software, you can redistribute it and/or modify it under the terms and conditions of +// CANN Open Software License Agreement Version 2.0 (the "License"). +// Please refer to the License for details. You may not use this file except in compliance with the License. +// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +// See LICENSE in the root of the software repository for the full text of the License. + +// TileLang ST kernels for pto.trandom: generate random numbers using key and counter. +// Compiled by ptoas --enable-insert-sync --enable-tile-op-expand --pto-backend=vpto +// to produce a fatobj object. 
// NOTE(review): the parameters of !pto.ptr, !pto.tile_buf and #pto.kernel_kind
// are not visible in this view; confirm them against the original file.
module attributes {pto.target_arch = "a5", pto.kernel_kind = #pto.kernel_kind} {
  // Smoke case: ui32 1x256 (256 elements, valid=1x256)
  // The kernel is named int32_* to match the case name; the output view is ui32.
  func.func @TRANDOM_int32_1x256(%key_ptr: !pto.ptr, %counter_ptr: !pto.ptr, %output_ptr: !pto.ptr) attributes {pto.kernel} {
    %c0 = arith.constant 0 : index
    %c1 = arith.constant 1 : index
    %c2 = arith.constant 2 : index
    %c3 = arith.constant 3 : index
    %c256 = arith.constant 256 : index

    // Load the 2 key words and 4 counter words as ui32
    %key0_ui32 = pto.load_scalar %key_ptr[%c0] : !pto.ptr -> ui32
    %key1_ui32 = pto.load_scalar %key_ptr[%c1] : !pto.ptr -> ui32
    %counter0_ui32 = pto.load_scalar %counter_ptr[%c0] : !pto.ptr -> ui32
    %counter1_ui32 = pto.load_scalar %counter_ptr[%c1] : !pto.ptr -> ui32
    %counter2_ui32 = pto.load_scalar %counter_ptr[%c2] : !pto.ptr -> ui32
    %counter3_ui32 = pto.load_scalar %counter_ptr[%c3] : !pto.ptr -> ui32

    // Convert ui32 to i32 (signless) before passing to pto.trandom
    %key0 = builtin.unrealized_conversion_cast %key0_ui32 : ui32 to i32
    %key1 = builtin.unrealized_conversion_cast %key1_ui32 : ui32 to i32
    %counter0 = builtin.unrealized_conversion_cast %counter0_ui32 : ui32 to i32
    %counter1 = builtin.unrealized_conversion_cast %counter1_ui32 : ui32 to i32
    %counter2 = builtin.unrealized_conversion_cast %counter2_ui32 : ui32 to i32
    %counter3 = builtin.unrealized_conversion_cast %counter3_ui32 : ui32 to i32

    // 5-D view over the output buffer; only the innermost dimension (256) is larger than 1
    %output_view = pto.make_tensor_view %output_ptr,
      shape = [%c1, %c1, %c1, %c1, %c256],
      strides = [%c256, %c256, %c256, %c256, %c1]
      : !pto.tensor_view<1x1x1x1x256xui32>

    // Partition covering the whole view (zero offsets, full sizes)
    %output_part = pto.partition_view %output_view,
      offsets = [%c0, %c0, %c0, %c0, %c0],
      sizes = [%c1, %c1, %c1, %c1, %c256]
      : !pto.tensor_view<1x1x1x1x256xui32> -> !pto.partition_tensor_view<1x1x1x1x256xui32>

    %dst = pto.alloc_tile
      : !pto.tile_buf

    // Input 6 scalars are i32 (signless); the result is written to the %dst tile
    pto.trandom ins(%key0, %key1, %counter0, %counter1, %counter2, %counter3 : i32, i32, i32, i32, i32, i32)
        outs(%dst : !pto.tile_buf)

    // Store the generated tile to the output partition
    pto.tstore ins(%dst : !pto.tile_buf)
        outs(%output_part : !pto.partition_tensor_view<1x1x1x1x256xui32>)
    return
  }

  // Case 0: ui32 4x256 (1024 elements, valid=4x256)
  // Key and counter passed as ui32 arrays, converted to i32 for pto.trandom (which requires signless)

  func.func @TRANDOM_int32_4x256(%key_ptr: !pto.ptr, %counter_ptr: !pto.ptr, %output_ptr: !pto.ptr) attributes {pto.kernel} {
    %c0 = arith.constant 0 : index
    %c1 = arith.constant 1 : index
    %c2 = arith.constant 2 : index
    %c3 = arith.constant 3 : index
    %c4 = arith.constant 4 : index
    %c256 = arith.constant 256 : index
    %c1024 = arith.constant 1024 : index

    // Load key and counter values as ui32
    %key0_ui32 = pto.load_scalar %key_ptr[%c0] : !pto.ptr -> ui32
    %key1_ui32 = pto.load_scalar %key_ptr[%c1] : !pto.ptr -> ui32
    %counter0_ui32 = pto.load_scalar %counter_ptr[%c0] : !pto.ptr -> ui32
    %counter1_ui32 = pto.load_scalar %counter_ptr[%c1] : !pto.ptr -> ui32
    %counter2_ui32 = pto.load_scalar %counter_ptr[%c2] : !pto.ptr -> ui32
    %counter3_ui32 = pto.load_scalar %counter_ptr[%c3] : !pto.ptr -> ui32

    // Convert ui32 to i32 (signless) before passing to pto.trandom
    %key0 = builtin.unrealized_conversion_cast %key0_ui32 : ui32 to i32
    %key1 = builtin.unrealized_conversion_cast %key1_ui32 : ui32 to i32
    %counter0 = builtin.unrealized_conversion_cast %counter0_ui32 : ui32 to i32
    %counter1 = builtin.unrealized_conversion_cast %counter1_ui32 : ui32 to i32
    %counter2 = builtin.unrealized_conversion_cast %counter2_ui32 : ui32 to i32
    %counter3 = builtin.unrealized_conversion_cast %counter3_ui32 : ui32 to i32

    // 5-D view over the output buffer: 4 rows of 256 elements, row stride 256
    %output_view = pto.make_tensor_view %output_ptr,
      shape = [%c1, %c1, %c1, %c4, %c256],
      strides = [%c1024, %c1024, %c1024, %c256, %c1]
      : !pto.tensor_view<1x1x1x4x256xui32>

    // Partition covering the whole view (zero offsets, full sizes)
    %output_part = pto.partition_view %output_view,
      offsets = [%c0, %c0, %c0, %c0, %c0],
      sizes = [%c1, %c1, %c1, %c4, %c256]
      : !pto.tensor_view<1x1x1x4x256xui32> -> !pto.partition_tensor_view<1x1x1x4x256xui32>

    %dst = pto.alloc_tile
      : !pto.tile_buf

    // Input 6 scalars are i32 (signless), output tile is ui32
    pto.trandom ins(%key0, %key1, %counter0, %counter1, %counter2, %counter3 : i32, i32, i32, i32, i32, i32)
        outs(%dst : !pto.tile_buf)

    // Store the generated tile to the output partition
    pto.tstore ins(%dst : !pto.tile_buf)
        outs(%output_part : !pto.partition_tensor_view<1x1x1x4x256xui32>)
    return
  }
}
# coding=utf-8

"""Single source of truth for trecip ST test cases.

Each case defines:
  - name: case identifier, used as subdirectory name and by main.cpp kCases[].
  - dtype: numpy dtype (e.g. np.float32).
  - shape: (rows, cols) — allocated tile dimensions.
  - valid_shape: (valid_rows, valid_cols) — effective computation region.
  - eps: tolerance for numpy.allclose (atol and rtol).

gen_data.py and compare.py both import this list to avoid redundant definitions.
"""

import numpy as np

CASES = [
    {
        "name": "f32_16x64",
        "dtype": np.float32,
        "shape": (16, 64),
        "valid_shape": (16, 64),
        "eps": 1e-6,
    },
    {
        "name": "f32_32x32",
        "dtype": np.float32,
        "shape": (32, 32),
        "valid_shape": (32, 32),
        "eps": 1e-6,
    },
    {
        "name": "f16_16x64",
        "dtype": np.float16,
        "shape": (16, 64),
        "valid_shape": (16, 64),
        "eps": 1e-3,
    },
    {
        "name": "f16_32x32",
        "dtype": np.float16,
        "shape": (32, 32),
        "valid_shape": (32, 32),
        "eps": 1e-3,
    },
    {
        "name": "f32_64x64_pad",
        "dtype": np.float32,
        "shape": (66, 72),
        "valid_shape": (64, 64),
        "eps": 1e-6,
    },
    {
        "name": "f32_58x70",
        "dtype": np.float32,
        "shape": (66, 72),
        "valid_shape": (58, 70),
        "eps": 1e-6,
    },
]

# Smoke subset: only the cases named here survive the filter below.
_SMOKE_CASE_NAMES = ['f32_16x64', 'f16_16x64']
_SMOKE_CASE_NAME_SET = set(_SMOKE_CASE_NAMES)
# Built once, so the membership test below does not rebuild the set per name.
_KNOWN_CASE_NAMES = {case["name"] for case in CASES}
_missing = [name for name in _SMOKE_CASE_NAMES if name not in _KNOWN_CASE_NAMES]
if _missing:
    # Fail at import time: a smoke name with no matching case is a config error.
    raise RuntimeError("unknown smoke case(s): " + ", ".join(_missing))
# Keep the declaration order of CASES, not the order of _SMOKE_CASE_NAMES.
CASES = [case for case in CASES if case["name"] in _SMOKE_CASE_NAME_SET]
# coding=utf-8

"""Compare golden reference with NPU output for trecip test cases."""

import os
import sys
import numpy as np

from cases import CASES
from st_common import result_cmp, style_fail, style_pass, validate_cases


def _load_tile(case_name, file_name, dtype, shape):
    """Load ``<case_name>/<file_name>`` as an array of ``shape``.

    Prints an error and returns None when the file is missing or its element
    count does not match ``shape``, so the caller can fail the case without
    aborting the remaining ones.
    """
    path = os.path.join(case_name, file_name)
    if not os.path.exists(path):
        print(style_fail(f"[ERROR] {case_name}: {file_name} not found"))
        return None
    data = np.fromfile(path, dtype=dtype)
    expected = int(np.prod(shape))
    if data.size != expected:
        print(style_fail(f"[ERROR] {case_name}: {file_name} has {data.size} elements, expected {expected}"))
        return None
    return data.reshape(shape)


def main():
    """Compare golden.bin with output.bin for every (optionally filtered) case.

    ``sys.argv[1]``, when present, restricts the run to the case of that name.
    Only the valid_shape region of each tile is compared. Exits with status 2
    if any case fails.
    """
    validate_cases(CASES)
    case_filter = sys.argv[1] if len(sys.argv) > 1 else None

    all_passed = True
    for case in CASES:
        if case_filter is not None and case["name"] != case_filter:
            continue

        case_dir = case["name"]
        shape = case["shape"]
        vr, vc = case["valid_shape"]

        golden = _load_tile(case_dir, "golden.bin", case["dtype"], shape)
        output = _load_tile(case_dir, "output.bin", case["dtype"], shape)
        if golden is None or output is None:
            # The reason was already printed by _load_tile.
            all_passed = False
            continue

        ok = result_cmp(golden[:vr, :vc], output[:vr, :vc], case["eps"])
        if ok:
            print(style_pass(f"[INFO] {case['name']}: compare passed"))
        else:
            print(style_fail(f"[ERROR] {case['name']}: compare failed"))
            all_passed = False

    if not all_passed:
        sys.exit(2)
    print(style_pass("[INFO] all cases passed"))


if __name__ == "__main__":
    main()
import numpy as np
from cases import CASES
from st_common import validate_cases, setup_case_rng, save_case_data

# Generate input.bin / golden.bin for every trecip case listed in cases.py.
validate_cases(CASES)

for case in CASES:
    # NOTE(review): presumably seeds NumPy's global RNG per case, which is why
    # np.random.* is used below rather than a local Generator — confirm in st_common.
    setup_case_rng(case)
    dtype = case["dtype"]
    shape = case["shape"]
    valid_shape = case["valid_shape"]

    # Avoid 0 for reciprocal, use range [0.1, 10.0].
    # Named `src` rather than `input` so the builtin is not shadowed.
    src = np.random.uniform(0.1, 10.0, size=shape).astype(dtype)

    # reciprocal = 1/x, computed on the already-cast input so golden matches the case dtype.
    golden = np.reciprocal(src).astype(dtype, copy=False)

    save_case_data(case["name"], {"input": src, "golden": golden})
    print(f"[INFO] gen_data: {case['name']} shape={shape} valid_shape={valid_shape} dtype={dtype.__name__}")
#ifndef AICORE
#define AICORE [aicore]
#endif

// Kernel entry points for the two smoke cases: f32 16x64 and f16 16x64.
// Functions with these names appear in trecip.pto in this directory; the f16
// kernel is declared here with uint16_t pointers.

extern "C" __global__ AICORE void TRECIP_f32_16x64(__gm__ float *a, __gm__ float *b);
extern "C" __global__ AICORE void TRECIP_f16_16x64(__gm__ uint16_t *a, __gm__ uint16_t *b);

// Host-side wrapper: launches TRECIP_f32_16x64 on `stream` with a = input, b = output.
// Takes untyped pointers so main.cpp can store it in its case table.
void LaunchTRECIP_f32_16x64(void *a, void *b, void *stream) {
    TRECIP_f32_16x64<<<1, nullptr, stream>>>((__gm__ float *)a, (__gm__ float *)b);
}



// Host-side wrapper: launches TRECIP_f16_16x64 on `stream` with a = input, b = output.
void LaunchTRECIP_f16_16x64(void *a, void *b, void *stream) {
    TRECIP_f16_16x64<<<1, nullptr, stream>>>((__gm__ uint16_t *)a, (__gm__ uint16_t *)b);
}
+// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +// See LICENSE in the root of the software repository for the full text of the License. + +// Host driver for TileLang trecip ST — case-table driven. +// Each case launches a different kernel variant, reads/writes from per-case subdirectory. +// Numerical comparison is done externally by compare.py. + +#include "acl/acl.h" +#include "test_common.h" +#include +#include +#include +#include +#include +#include + +using namespace PtoTestCommon; + +// Kernel launch wrappers (defined in launch.cpp) +void LaunchTRECIP_f32_16x64(void *a, void *b, void *stream); +void LaunchTRECIP_f32_32x32(void *a, void *b, void *stream); +void LaunchTRECIP_f16_16x64(void *a, void *b, void *stream); +void LaunchTRECIP_f16_32x32(void *a, void *b, void *stream); +void LaunchTRECIP_f32_58x70(void *a, void *b, void *stream); + +using LaunchFn = void (*)(void *, void *, void *); + +struct TestCase { + const char *name; + LaunchFn launch; + size_t rows; // allocated tile rows + size_t cols; // allocated tile cols + size_t validRows; // effective computation rows (<= rows) + size_t validCols; // effective computation cols (<= cols) + size_t elemSize; // bytes per element +}; + +static const TestCase kCases[] = { +{"f32_16x64", LaunchTRECIP_f32_16x64, 16, 64, 16, 64, sizeof(float)}, +{"f16_16x64", LaunchTRECIP_f16_16x64, 16, 64, 16, 64, sizeof(uint16_t)}, +}; +static constexpr size_t kNumCases = sizeof(kCases) / sizeof(kCases[0]); + +static int RunCase(const TestCase &tc, int deviceId, aclrtStream stream) { + int rc = 0; + const size_t elemCount = tc.rows * tc.cols; + const size_t fileSize = elemCount * tc.elemSize; + + std::printf("[INFO] === case: %s (shape=%zux%zu, valid=%zux%zu) ===\n", + tc.name, tc.rows, tc.cols, tc.validRows, tc.validCols); + + // Per-case data directory + 
std::string caseDir = std::string("./") + tc.name; + size_t srcFileSize = fileSize; + + void *srcHost = nullptr, *dstHost = nullptr; + void *srcDevice = nullptr, *dstDevice = nullptr; + + aclrtMallocHost(&srcHost, fileSize); + aclrtMallocHost(&dstHost, fileSize); + + aclrtMalloc(&srcDevice, fileSize, ACL_MEM_MALLOC_HUGE_FIRST); + aclrtMalloc(&dstDevice, fileSize, ACL_MEM_MALLOC_HUGE_FIRST); + + if (!ReadFile((caseDir + "/input.bin").c_str(), srcFileSize, srcHost, fileSize)) { + std::fprintf(stderr, "[ERROR] failed to read %s/input.bin\n", caseDir.c_str()); + rc = 1; + } + + if (rc == 0) { + aclrtMemcpy(srcDevice, fileSize, srcHost, fileSize, ACL_MEMCPY_HOST_TO_DEVICE); + + tc.launch(srcDevice, dstDevice, stream); + + aclrtSynchronizeStream(stream); + aclrtMemcpy(dstHost, fileSize, dstDevice, fileSize, ACL_MEMCPY_DEVICE_TO_HOST); + } + + if (rc == 0 && !WriteFile((caseDir + "/output.bin").c_str(), dstHost, fileSize)) { + std::fprintf(stderr, "[ERROR] failed to write %s/output.bin\n", caseDir.c_str()); + rc = 1; + } + + if (srcDevice != nullptr) + aclrtFree(srcDevice); + if (dstDevice != nullptr) + aclrtFree(dstDevice); + if (srcHost != nullptr) + aclrtFreeHost(srcHost); + if (dstHost != nullptr) + aclrtFreeHost(dstHost); + + if (rc == 0) + std::printf("[INFO] case %s done\n", tc.name); + return rc; +} + +int main(int argc, char *argv[]) { + // Optional case filter: ./trecip [case_name] + const char *caseFilter = (argc > 1) ? 
argv[1] : nullptr; + + int rc = 0; + int deviceId = 0; + aclrtStream stream = nullptr; + + aclInit(nullptr); + if (const char *envDevice = std::getenv("ACL_DEVICE_ID")) { + deviceId = std::atoi(envDevice); + } + aclrtSetDevice(deviceId); + aclrtCreateStream(&stream); + + for (size_t i = 0; i < kNumCases; ++i) { + if (caseFilter != nullptr && std::strcmp(kCases[i].name, caseFilter) != 0) { + continue; + } + int ret = RunCase(kCases[i], deviceId, stream); + if (ret != 0) { + std::fprintf(stderr, "[ERROR] case %s failed\n", kCases[i].name); + rc = 1; + break; + } + } + + if (stream != nullptr) + aclrtDestroyStream(stream); + aclrtResetDevice(deviceId); + aclFinalize(); + + return rc; +} diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/trecip/trecip.pto b/test/tilelang_st/npu/a5/src/st/smoke/testcase/trecip/trecip.pto new file mode 100644 index 000000000..f2b95b4d5 --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/trecip/trecip.pto @@ -0,0 +1,102 @@ +// Copyright (c) 2026 Huawei Technologies Co., Ltd. +// This program is free software, you can redistribute it and/or modify it under the terms and conditions of +// CANN Open Software License Agreement Version 2.0 (the "License"). +// Please refer to the License for details. You may not use this file except in compliance with the License. +// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +// See LICENSE in the root of the software repository for the full text of the License. + +// TileLang ST kernels for pto.trecip: 1/x (reciprocal) +// trecip = vdiv(1.0, x) +// Multiple cases with different shapes in a single module. +// Compiled by ptoas --enable-insert-sync --enable-tile-op-expand --pto-backend=vpto +// to produce a fatobj object. 
+ +module attributes {pto.target_arch = "a5", pto.kernel_kind = #pto.kernel_kind} { + // Case 0: f32 16x64 + func.func @TRECIP_f32_16x64(%a_ptr: !pto.ptr, %b_ptr: !pto.ptr) attributes {pto.kernel} { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c16 = arith.constant 16 : index + %c64 = arith.constant 64 : index + %c1024 = arith.constant 1024 : index + + %a_view = pto.make_tensor_view %a_ptr, + shape = [%c1, %c1, %c1, %c16, %c64], + strides = [%c1024, %c1024, %c1024, %c64, %c1] + : !pto.tensor_view<1x1x1x16x64xf32> + %b_view = pto.make_tensor_view %b_ptr, + shape = [%c1, %c1, %c1, %c16, %c64], + strides = [%c1024, %c1024, %c1024, %c64, %c1] + : !pto.tensor_view<1x1x1x16x64xf32> + + %a_part = pto.partition_view %a_view, + offsets = [%c0, %c0, %c0, %c0, %c0], + sizes = [%c1, %c1, %c1, %c16, %c64] + : !pto.tensor_view<1x1x1x16x64xf32> -> !pto.partition_tensor_view<1x1x1x16x64xf32> + %b_part = pto.partition_view %b_view, + offsets = [%c0, %c0, %c0, %c0, %c0], + sizes = [%c1, %c1, %c1, %c16, %c64] + : !pto.tensor_view<1x1x1x16x64xf32> -> !pto.partition_tensor_view<1x1x1x16x64xf32> + + %a = pto.alloc_tile + : !pto.tile_buf + %b = pto.alloc_tile + : !pto.tile_buf + + pto.tload ins(%a_part : !pto.partition_tensor_view<1x1x1x16x64xf32>) + outs(%a : !pto.tile_buf) + + pto.trecip ins(%a : !pto.tile_buf) + outs(%b : !pto.tile_buf) + + pto.tstore ins(%b : !pto.tile_buf) + outs(%b_part : !pto.partition_tensor_view<1x1x1x16x64xf32>) + return + } + + // Case 1: f32 32x32 + + func.func @TRECIP_f16_16x64(%a_ptr: !pto.ptr, %b_ptr: !pto.ptr) attributes {pto.kernel} { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c16 = arith.constant 16 : index + %c64 = arith.constant 64 : index + %c1024 = arith.constant 1024 : index + + %a_view = pto.make_tensor_view %a_ptr, + shape = [%c1, %c1, %c1, %c16, %c64], + strides = [%c1024, %c1024, %c1024, %c64, %c1] + : !pto.tensor_view<1x1x1x16x64xf16> + %b_view = pto.make_tensor_view %b_ptr, + shape = [%c1, %c1, 
%c1, %c16, %c64], + strides = [%c1024, %c1024, %c1024, %c64, %c1] + : !pto.tensor_view<1x1x1x16x64xf16> + + %a_part = pto.partition_view %a_view, + offsets = [%c0, %c0, %c0, %c0, %c0], + sizes = [%c1, %c1, %c1, %c16, %c64] + : !pto.tensor_view<1x1x1x16x64xf16> -> !pto.partition_tensor_view<1x1x1x16x64xf16> + %b_part = pto.partition_view %b_view, + offsets = [%c0, %c0, %c0, %c0, %c0], + sizes = [%c1, %c1, %c1, %c16, %c64] + : !pto.tensor_view<1x1x1x16x64xf16> -> !pto.partition_tensor_view<1x1x1x16x64xf16> + + %a = pto.alloc_tile + : !pto.tile_buf + %b = pto.alloc_tile + : !pto.tile_buf + + pto.tload ins(%a_part : !pto.partition_tensor_view<1x1x1x16x64xf16>) + outs(%a : !pto.tile_buf) + + pto.trecip ins(%a : !pto.tile_buf) + outs(%b : !pto.tile_buf) + + pto.tstore ins(%b : !pto.tile_buf) + outs(%b_part : !pto.partition_tensor_view<1x1x1x16x64xf16>) + return + } + + // Case 3: f16 32x32 +} diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/trelu/CMakeLists.txt b/test/tilelang_st/npu/a5/src/st/smoke/testcase/trelu/CMakeLists.txt new file mode 100644 index 000000000..3c012db8f --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/trelu/CMakeLists.txt @@ -0,0 +1,9 @@ +# Copyright (c) 2026 Huawei Technologies Co., Ltd. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. 
"""Single source of truth for trelu ST test cases.

Each case defines:
  - name: case identifier, used as subdirectory name and by main.cpp kCases[].
  - dtype: numpy dtype (e.g. np.float32).
  - shape: (rows, cols) — global data dimensions (input/output size).
  - tile_shape: (tile_rows, tile_cols) — allocated tile buffer dimensions.
  - valid_shape: (valid_rows, valid_cols) — effective computation region.
  - eps: comparison tolerance that compare.py hands to result_cmp.

gen_data.py and compare.py both import this list to avoid redundant definitions.

Only the cases named in _SMOKE_CASE_NAMES are exported: the full table is
declared first and then filtered at the bottom of this module, so the exported
CASES keeps the table's order, not the order of _SMOKE_CASE_NAMES.
"""

import numpy as np

CASES = [
    {
        "name": "f16_64x64_valid_60x60",
        "dtype": np.float16,
        "shape": (60, 60),
        "tile_shape": (64, 64),
        "valid_shape": (60, 60),
        "eps": 1e-3,
    },
    {
        "name": "int32_64x64",
        "dtype": np.int32,
        "shape": (64, 64),
        "tile_shape": (64, 64),
        "valid_shape": (64, 64),
        "eps": 1e-6,
    },
    {
        "name": "f32_64x64_valid_60x60",
        "dtype": np.float32,
        "shape": (60, 60),
        "tile_shape": (64, 64),
        "valid_shape": (60, 60),
        "eps": 1e-6,
    },
]

# Cases that make up the smoke subset.
_SMOKE_CASE_NAMES = ["f16_64x64_valid_60x60", "f32_64x64_valid_60x60"]
_SMOKE_CASE_NAME_SET = set(_SMOKE_CASE_NAMES)

# Build the set of declared names once rather than once per smoke name, then
# fail at import time if the smoke list mentions a case the table does not define.
_KNOWN_CASE_NAMES = {case["name"] for case in CASES}
_missing = [name for name in _SMOKE_CASE_NAMES if name not in _KNOWN_CASE_NAMES]
if _missing:
    raise RuntimeError("unknown smoke case(s): " + ", ".join(_missing))

CASES = [case for case in CASES if case["name"] in _SMOKE_CASE_NAME_SET]
import os
import sys
import numpy as np

from cases import CASES
from st_common import result_cmp, style_fail, style_pass, validate_cases


def _load_array(case, filename):
    """Read ``<case name>/<filename>`` and reshape it to the case's ``shape``.

    Raises:
        OSError: the file is missing or unreadable.
        ValueError: the file does not hold exactly ``rows * cols`` elements.
    """
    path = os.path.join(case["name"], filename)
    return np.fromfile(path, dtype=case["dtype"]).reshape(case["shape"])


def main():
    """Compare golden.bin against output.bin for every (optionally filtered) case.

    An optional first command-line argument restricts the run to the case with
    that exact name. The whole ``shape``-sized array is compared; no
    ``valid_shape`` slicing is applied in this test. Exits with status 2 if any
    case fails to load or compare.
    """
    validate_cases(CASES)
    case_filter = sys.argv[1] if len(sys.argv) > 1 else None

    all_passed = True
    for case in CASES:
        name = case["name"]
        if case_filter is not None and name != case_filter:
            continue

        # A missing or truncated result file is a failure of this case, not a
        # reason to abort the whole run with a traceback.
        try:
            golden = _load_array(case, "golden.bin")
            output = _load_array(case, "output.bin")
        except (OSError, ValueError) as err:
            print(style_fail(f"[ERROR] {name}: cannot load golden/output data: {err}"))
            all_passed = False
            continue

        if result_cmp(golden, output, case["eps"]):
            print(style_pass(f"[INFO] {name}: compare passed"))
        else:
            print(style_fail(f"[ERROR] {name}: compare failed"))
            all_passed = False

    if not all_passed:
        sys.exit(2)
    print(style_pass("[INFO] all cases passed"))


if __name__ == "__main__":
    main()
import numpy as np
from cases import CASES
from st_common import validate_cases, setup_case_rng, save_case_data

# Generate input.bin / golden.bin for every trelu case listed in cases.py.
validate_cases(CASES)

for case in CASES:
    setup_case_rng(case)

    dtype, shape = case["dtype"], case["shape"]

    # Exactly one RNG draw per case: integers for int32, uniform floats otherwise.
    if dtype == np.int32:
        raw = np.random.randint(-3_000_000, 3_000_000, size=shape)
    else:
        raw = np.random.uniform(-10, 10, size=shape)
    source = raw.astype(dtype)

    # ReLU reference: negative values are clamped to zero.
    expected = np.maximum(source, 0)

    save_case_data(case["name"], {"input": source, "golden": expected})
    print(f"[INFO] gen_data: {case['name']} shape={shape} valid_shape={case['valid_shape']} dtype={dtype.__name__}")
#ifndef AICORE
#define AICORE [aicore]
#endif

// Kernel entry points for the two smoke cases: f16 and f32, 64x64 tile with a
// 60x60 valid region. No int32 kernel is declared in this file. Functions with
// these names appear in trelu.pto in this directory; the f16 kernel is declared
// here with uint16_t pointers.

extern "C" __global__ AICORE void TRELU_f16_64x64_v60x60(__gm__ uint16_t *input, __gm__ uint16_t *output);
extern "C" __global__ AICORE void TRELU_f32_64x64_v60x60(__gm__ float *input, __gm__ float *output);

// Host-side wrapper: launches the f32 kernel on `stream`.
void LaunchTRELU_f32_64x64_v60x60(float *input, float *output, void *stream) {
    TRELU_f32_64x64_v60x60<<<1, nullptr, stream>>>((__gm__ float *)input, (__gm__ float *)output);
}

// Host-side wrapper: launches the f16 kernel on `stream`.
void LaunchTRELU_f16_64x64_v60x60(uint16_t *input, uint16_t *output, void *stream) {
    TRELU_f16_64x64_v60x60<<<1, nullptr, stream>>>((__gm__ uint16_t *)input, (__gm__ uint16_t *)output);
}
+ +#include "acl/acl.h" +#include "test_common.h" +#include +#include +#include +#include +#include +#include + +using namespace PtoTestCommon; + +// Kernel launch wrappers (defined in launch.cpp) +void LaunchTRELU_f16_64x64_v60x60(uint16_t *input, uint16_t *output, void *stream); +void LaunchTRELU_f32_64x64_v60x60(float *input, float *output, void *stream); + +struct TestCase { + const char *name; + void (*launch)(void *, void *, void *); + size_t rows; + size_t cols; + size_t elemSize; +}; + +static const TestCase kCases[] = { +{"f16_64x64_valid_60x60", (void (*)(void*, void*, void*))LaunchTRELU_f16_64x64_v60x60, 60, 60, sizeof(uint16_t)}, +{"f32_64x64_valid_60x60", (void (*)(void*, void*, void*))LaunchTRELU_f32_64x64_v60x60, 60, 60, sizeof(float)}, +}; +static constexpr size_t kNumCases = sizeof(kCases) / sizeof(kCases[0]); + +static int RunCase(const TestCase &tc, int deviceId, aclrtStream stream) { + int rc = 0; + const size_t elemCount = tc.rows * tc.cols; + size_t fileSize = elemCount * tc.elemSize; + + std::printf("[INFO] === case: %s (shape=%zux%zu) ===\n", + tc.name, tc.rows, tc.cols); + + std::string caseDir = std::string("./") + tc.name; + + void *inputHost = nullptr, *outputHost = nullptr; + void *inputDevice = nullptr, *outputDevice = nullptr; + + aclrtMallocHost(&inputHost, fileSize); + aclrtMallocHost(&outputHost, fileSize); + + aclrtMalloc(&inputDevice, fileSize, ACL_MEM_MALLOC_HUGE_FIRST); + aclrtMalloc(&outputDevice, fileSize, ACL_MEM_MALLOC_HUGE_FIRST); + + if (!ReadFile((caseDir + "/input.bin").c_str(), fileSize, inputHost, fileSize)) { + std::fprintf(stderr, "[ERROR] failed to read %s/input.bin\n", caseDir.c_str()); + rc = 1; + } + + if (rc == 0) { + aclrtMemcpy(inputDevice, fileSize, inputHost, fileSize, ACL_MEMCPY_HOST_TO_DEVICE); + + tc.launch(inputDevice, outputDevice, stream); + + aclrtSynchronizeStream(stream); + aclrtMemcpy(outputHost, fileSize, outputDevice, fileSize, ACL_MEMCPY_DEVICE_TO_HOST); + } + + if (rc == 0 && 
!WriteFile((caseDir + "/output.bin").c_str(), outputHost, fileSize)) { + std::fprintf(stderr, "[ERROR] failed to write %s/output.bin\n", caseDir.c_str()); + rc = 1; + } + + if (inputDevice != nullptr) + aclrtFree(inputDevice); + if (outputDevice != nullptr) + aclrtFree(outputDevice); + if (inputHost != nullptr) + aclrtFreeHost(inputHost); + if (outputHost != nullptr) + aclrtFreeHost(outputHost); + + if (rc == 0) + std::printf("[INFO] case %s done\n", tc.name); + return rc; +} + +int main(int argc, char *argv[]) { + const char *caseFilter = (argc > 1) ? argv[1] : nullptr; + + int rc = 0; + int deviceId = 0; + aclrtStream stream = nullptr; + + aclInit(nullptr); + if (const char *envDevice = std::getenv("ACL_DEVICE_ID")) { + deviceId = std::atoi(envDevice); + } + aclrtSetDevice(deviceId); + aclrtCreateStream(&stream); + + for (size_t i = 0; i < kNumCases; ++i) { + if (caseFilter != nullptr && std::strcmp(kCases[i].name, caseFilter) != 0) { + continue; + } + int ret = RunCase(kCases[i], deviceId, stream); + if (ret != 0) { + std::fprintf(stderr, "[ERROR] case %s failed\n", kCases[i].name); + rc = 1; + break; + } + } + + if (stream != nullptr) + aclrtDestroyStream(stream); + aclrtResetDevice(deviceId); + aclFinalize(); + + return rc; +} diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/trelu/trelu.pto b/test/tilelang_st/npu/a5/src/st/smoke/testcase/trelu/trelu.pto new file mode 100644 index 000000000..b1dd97010 --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/trelu/trelu.pto @@ -0,0 +1,99 @@ +// Copyright (c) 2026 Huawei Technologies Co., Ltd. +// This program is free software, you can redistribute it and/or modify it under the terms and conditions of +// CANN Open Software License Agreement Version 2.0 (the "License"). +// Please refer to the License for details. You may not use this file except in compliance with the License. 
+// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +// See LICENSE in the root of the software repository for the full text of the License. + +// TileLang ST kernels for pto.trelu: tload(input) + trelu(input)->output + tstore(output). +// Multiple cases with different shapes and dtypes in a single module. +// Compiled by ptoas --enable-insert-sync --enable-tile-op-expand --pto-backend=vpto +// to produce a fatobj object. + +module attributes {pto.target_arch = "a5", pto.kernel_kind = #pto.kernel_kind} { + // Case 0: int32 64x64 (4096 elements, valid=64x64) + func.func @TRELU_f16_64x64_v60x60(%input_ptr: !pto.ptr, %output_ptr: !pto.ptr) attributes {pto.kernel} { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c60 = arith.constant 60 : index + %c64 = arith.constant 64 : index + %c3600 = arith.constant 3600 : index + + %input_view = pto.make_tensor_view %input_ptr, + shape = [%c1, %c1, %c1, %c60, %c60], + strides = [%c3600, %c3600, %c3600, %c60, %c1] + : !pto.tensor_view<1x1x1x60x60xf16> + %output_view = pto.make_tensor_view %output_ptr, + shape = [%c1, %c1, %c1, %c60, %c60], + strides = [%c3600, %c3600, %c3600, %c60, %c1] + : !pto.tensor_view<1x1x1x60x60xf16> + + %input_part = pto.partition_view %input_view, + offsets = [%c0, %c0, %c0, %c0, %c0], + sizes = [%c1, %c1, %c1, %c60, %c60] + : !pto.tensor_view<1x1x1x60x60xf16> -> !pto.partition_tensor_view<1x1x1x60x60xf16> + %output_part = pto.partition_view %output_view, + offsets = [%c0, %c0, %c0, %c0, %c0], + sizes = [%c1, %c1, %c1, %c60, %c60] + : !pto.tensor_view<1x1x1x60x60xf16> -> !pto.partition_tensor_view<1x1x1x60x60xf16> + + %src = pto.alloc_tile + : !pto.tile_buf + %dst = pto.alloc_tile + : !pto.tile_buf + + pto.tload ins(%input_part : !pto.partition_tensor_view<1x1x1x60x60xf16>) + outs(%src : !pto.tile_buf) + + pto.trelu ins(%src 
: !pto.tile_buf) + outs(%dst : !pto.tile_buf) + + pto.tstore ins(%dst : !pto.tile_buf) + outs(%output_part : !pto.partition_tensor_view<1x1x1x60x60xf16>) + return + } + + // Case 2: f32 64x64 (4096 elements, valid=60x60) + + func.func @TRELU_f32_64x64_v60x60(%input_ptr: !pto.ptr, %output_ptr: !pto.ptr) attributes {pto.kernel} { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c60 = arith.constant 60 : index + %c64 = arith.constant 64 : index + %c3600 = arith.constant 3600 : index + + %input_view = pto.make_tensor_view %input_ptr, + shape = [%c1, %c1, %c1, %c60, %c60], + strides = [%c3600, %c3600, %c3600, %c60, %c1] + : !pto.tensor_view<1x1x1x60x60xf32> + %output_view = pto.make_tensor_view %output_ptr, + shape = [%c1, %c1, %c1, %c60, %c60], + strides = [%c3600, %c3600, %c3600, %c60, %c1] + : !pto.tensor_view<1x1x1x60x60xf32> + + %input_part = pto.partition_view %input_view, + offsets = [%c0, %c0, %c0, %c0, %c0], + sizes = [%c1, %c1, %c1, %c60, %c60] + : !pto.tensor_view<1x1x1x60x60xf32> -> !pto.partition_tensor_view<1x1x1x60x60xf32> + %output_part = pto.partition_view %output_view, + offsets = [%c0, %c0, %c0, %c0, %c0], + sizes = [%c1, %c1, %c1, %c60, %c60] + : !pto.tensor_view<1x1x1x60x60xf32> -> !pto.partition_tensor_view<1x1x1x60x60xf32> + + %src = pto.alloc_tile + : !pto.tile_buf + %dst = pto.alloc_tile + : !pto.tile_buf + + pto.tload ins(%input_part : !pto.partition_tensor_view<1x1x1x60x60xf32>) + outs(%src : !pto.tile_buf) + + pto.trelu ins(%src : !pto.tile_buf) + outs(%dst : !pto.tile_buf) + + pto.tstore ins(%dst : !pto.tile_buf) + outs(%output_part : !pto.partition_tensor_view<1x1x1x60x60xf32>) + return + } +} diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/trem/CMakeLists.txt b/test/tilelang_st/npu/a5/src/st/smoke/testcase/trem/CMakeLists.txt new file mode 100644 index 000000000..f6adeae87 --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/trem/CMakeLists.txt @@ -0,0 +1,9 @@ +# Copyright (c) 2026 Huawei 
"""Single source of truth for trem ST test cases.

Each case defines:
  - name: case identifier, used as subdirectory name and by main.cpp kCases[].
  - dtype: numpy dtype (e.g. np.float32).
  - shape: (rows, cols) — allocated tile dimensions.
  - valid_shape: (valid_rows, valid_cols) — effective computation region.
  - eps: comparison tolerance that compare.py hands to result_cmp.

gen_data.py and compare.py both import this list to avoid redundant definitions.

Only the cases named in _SMOKE_CASE_NAMES are exported; here that is every
case in the table, so the filter at the bottom currently keeps all of them.
"""

import numpy as np

CASES = [
    {
        "name": "f32_16x64",
        "dtype": np.float32,
        "shape": (16, 64),
        "valid_shape": (16, 64),
        "eps": 1e-6,
    },
    {
        "name": "f32_32x32",
        "dtype": np.float32,
        "shape": (32, 32),
        "valid_shape": (32, 32),
        "eps": 1e-6,
    },
]

# Cases that make up the smoke subset.
_SMOKE_CASE_NAMES = ["f32_16x64", "f32_32x32"]
_SMOKE_CASE_NAME_SET = set(_SMOKE_CASE_NAMES)

# Build the set of declared names once rather than once per smoke name, then
# fail at import time if the smoke list mentions a case the table does not define.
_KNOWN_CASE_NAMES = {case["name"] for case in CASES}
_missing = [name for name in _SMOKE_CASE_NAMES if name not in _KNOWN_CASE_NAMES]
if _missing:
    raise RuntimeError("unknown smoke case(s): " + ", ".join(_missing))

CASES = [case for case in CASES if case["name"] in _SMOKE_CASE_NAME_SET]
+ +# coding=utf-8 + +import os +import sys +import numpy as np + +from cases import CASES +from st_common import result_cmp, style_fail, style_pass, validate_cases + + +def main(): + validate_cases(CASES) + case_filter = sys.argv[1] if len(sys.argv) > 1 else None + + all_passed = True + for case in CASES: + if case_filter is not None and case["name"] != case_filter: + continue + + case_dir = case["name"] + shape = case["shape"] + vr, vc = case["valid_shape"] + + golden = np.fromfile(os.path.join(case_dir, "golden.bin"), dtype=case["dtype"]).reshape(shape) + output = np.fromfile(os.path.join(case_dir, "output.bin"), dtype=case["dtype"]).reshape(shape) + + ok = result_cmp(golden[:vr, :vc], output[:vr, :vc], case["eps"]) + if ok: + print(style_pass(f"[INFO] {case['name']}: compare passed")) + else: + print(style_fail(f"[ERROR] {case['name']}: compare failed")) + all_passed = False + + if not all_passed: + sys.exit(2) + print(style_pass("[INFO] all cases passed")) + + +if __name__ == "__main__": + main() diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/trem/gen_data.py b/test/tilelang_st/npu/a5/src/st/smoke/testcase/trem/gen_data.py new file mode 100644 index 000000000..243cbe408 --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/trem/gen_data.py @@ -0,0 +1,32 @@ +# Copyright (c) 2026 Huawei Technologies Co., Ltd. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. 
+ +# coding=utf-8 + +import numpy as np +from cases import CASES +from st_common import validate_cases, setup_case_rng, save_case_data + +validate_cases(CASES) + +for case in CASES: + setup_case_rng(case) + + dtype = case["dtype"] + shape = case["shape"] + valid_shape = case["valid_shape"] + + input1 = np.random.randint(1, 10, size=shape).astype(dtype) + input2 = np.random.randint(1, 10, size=shape).astype(dtype) + + golden = np.zeros(shape, dtype=dtype) + vr, vc = valid_shape + golden[:vr, :vc] = np.remainder(input1[:vr, :vc], input2[:vr, :vc]) + + save_case_data(case["name"], {"input1": input1, "input2": input2, "golden": golden}) + print(f"[INFO] gen_data: {case['name']} shape={shape} valid_shape={valid_shape} dtype={dtype.__name__}") diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/trem/launch.cpp b/test/tilelang_st/npu/a5/src/st/smoke/testcase/trem/launch.cpp new file mode 100644 index 000000000..4d524d131 --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/trem/launch.cpp @@ -0,0 +1,28 @@ +// Copyright (c) 2026 Huawei Technologies Co., Ltd. +// This program is free software, you can redistribute it and/or modify it under the terms and conditions of +// CANN Open Software License Agreement Version 2.0 (the "License"). +// Please refer to the License for details. You may not use this file except in compliance with the License. +// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +// See LICENSE in the root of the software repository for the full text of the License. 
+ +#include + +#ifndef AICORE +#define AICORE [aicore] +#endif + +// Case 0: f32 16x64 + +extern "C" __global__ AICORE void TREM_f32_16x64(__gm__ float *a, __gm__ float *b, __gm__ float *c); +extern "C" __global__ AICORE void TREM_f32_32x32(__gm__ float *a, __gm__ float *b, __gm__ float *c); + +void LaunchTREM_f32_16x64(float *a, float *b, float *c, void *stream) { + TREM_f32_16x64<<<1, nullptr, stream>>>((__gm__ float *)a, (__gm__ float *)b, (__gm__ float *)c); +} + + + +void LaunchTREM_f32_32x32(float *a, float *b, float *c, void *stream) { + TREM_f32_32x32<<<1, nullptr, stream>>>((__gm__ float *)a, (__gm__ float *)b, (__gm__ float *)c); +} diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/trem/main.cpp b/test/tilelang_st/npu/a5/src/st/smoke/testcase/trem/main.cpp new file mode 100644 index 000000000..f689651f4 --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/trem/main.cpp @@ -0,0 +1,145 @@ +// Copyright (c) 2026 Huawei Technologies Co., Ltd. +// This program is free software, you can redistribute it and/or modify it under the terms and conditions of +// CANN Open Software License Agreement Version 2.0 (the "License"). +// Please refer to the License for details. You may not use this file except in compliance with the License. +// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +// See LICENSE in the root of the software repository for the full text of the License. + +// Host driver for TileLang trem ST — case-table driven. +// Each case launches a different kernel variant, reads/writes from per-case subdirectory. +// Numerical comparison is done externally by compare.py. 
+ +#include "acl/acl.h" +#include "test_common.h" +#include +#include +#include +#include +#include +#include + +using namespace PtoTestCommon; + +// Kernel launch wrappers (defined in launch.cpp) +void LaunchTREM_f32_16x64(float *a, float *b, float *c, void *stream); +void LaunchTREM_f32_32x32(float *a, float *b, float *c, void *stream); + +using LaunchFn = void (*)(float *, float *, float *, void *); + +struct TestCase { + const char *name; + LaunchFn launch; + size_t rows; // allocated tile rows + size_t cols; // allocated tile cols + size_t validRows; // effective computation rows (<= rows) + size_t validCols; // effective computation cols (<= cols) + size_t elemSize; // bytes per element +}; + +static const TestCase kCases[] = { +{"f32_16x64", LaunchTREM_f32_16x64, 16, 64, 16, 64, sizeof(float)}, +{"f32_32x32", LaunchTREM_f32_32x32, 32, 32, 32, 32, sizeof(float)}, +}; +static constexpr size_t kNumCases = sizeof(kCases) / sizeof(kCases[0]); + +static int RunCase(const TestCase &tc, int deviceId, aclrtStream stream) { + int rc = 0; + const size_t elemCount = tc.rows * tc.cols; + const size_t fileSize = elemCount * tc.elemSize; + + std::printf("[INFO] === case: %s (shape=%zux%zu, valid=%zux%zu) ===\n", + tc.name, tc.rows, tc.cols, tc.validRows, tc.validCols); + + // Per-case data directory + std::string caseDir = std::string("./") + tc.name; + size_t src0FileSize = fileSize; + size_t src1FileSize = fileSize; + + float *src0Host = nullptr, *src1Host = nullptr, *dstHost = nullptr; + float *src0Device = nullptr, *src1Device = nullptr, *dstDevice = nullptr; + + aclrtMallocHost((void **)(&src0Host), fileSize); + aclrtMallocHost((void **)(&src1Host), fileSize); + aclrtMallocHost((void **)(&dstHost), fileSize); + + aclrtMalloc((void **)&src0Device, fileSize, ACL_MEM_MALLOC_HUGE_FIRST); + aclrtMalloc((void **)&src1Device, fileSize, ACL_MEM_MALLOC_HUGE_FIRST); + aclrtMalloc((void **)&dstDevice, fileSize, ACL_MEM_MALLOC_HUGE_FIRST); + + if (!ReadFile((caseDir + 
"/input1.bin").c_str(), src0FileSize, src0Host, fileSize)) { + std::fprintf(stderr, "[ERROR] failed to read %s/input1.bin\n", caseDir.c_str()); + rc = 1; + } + if (rc == 0 && !ReadFile((caseDir + "/input2.bin").c_str(), src1FileSize, src1Host, fileSize)) { + std::fprintf(stderr, "[ERROR] failed to read %s/input2.bin\n", caseDir.c_str()); + rc = 1; + } + + if (rc == 0) { + aclrtMemcpy(src0Device, fileSize, src0Host, fileSize, ACL_MEMCPY_HOST_TO_DEVICE); + aclrtMemcpy(src1Device, fileSize, src1Host, fileSize, ACL_MEMCPY_HOST_TO_DEVICE); + + tc.launch(src0Device, src1Device, dstDevice, stream); + + aclrtSynchronizeStream(stream); + aclrtMemcpy(dstHost, fileSize, dstDevice, fileSize, ACL_MEMCPY_DEVICE_TO_HOST); + } + + if (rc == 0 && !WriteFile((caseDir + "/output.bin").c_str(), dstHost, fileSize)) { + std::fprintf(stderr, "[ERROR] failed to write %s/output.bin\n", caseDir.c_str()); + rc = 1; + } + + if (src0Device != nullptr) + aclrtFree(src0Device); + if (src1Device != nullptr) + aclrtFree(src1Device); + if (dstDevice != nullptr) + aclrtFree(dstDevice); + if (src0Host != nullptr) + aclrtFreeHost(src0Host); + if (src1Host != nullptr) + aclrtFreeHost(src1Host); + if (dstHost != nullptr) + aclrtFreeHost(dstHost); + + if (rc == 0) + std::printf("[INFO] case %s done\n", tc.name); + return rc; +} + +int main(int argc, char *argv[]) { + // Optional case filter: ./trem [case_name] + const char *caseFilter = (argc > 1) ? 
argv[1] : nullptr; + + int rc = 0; + int deviceId = 0; + aclrtStream stream = nullptr; + + aclInit(nullptr); + if (const char *envDevice = std::getenv("ACL_DEVICE_ID")) { + deviceId = std::atoi(envDevice); + } + aclrtSetDevice(deviceId); + aclrtCreateStream(&stream); + + for (size_t i = 0; i < kNumCases; ++i) { + if (caseFilter != nullptr && std::strcmp(kCases[i].name, caseFilter) != 0) { + continue; + } + int ret = RunCase(kCases[i], deviceId, stream); + if (ret != 0) { + std::fprintf(stderr, "[ERROR] case %s failed\n", kCases[i].name); + rc = 1; + break; + } + } + + if (stream != nullptr) + aclrtDestroyStream(stream); + aclrtResetDevice(deviceId); + aclFinalize(); + + return rc; +} diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/trem/trem.pto b/test/tilelang_st/npu/a5/src/st/smoke/testcase/trem/trem.pto new file mode 100644 index 000000000..7c0846e39 --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/trem/trem.pto @@ -0,0 +1,152 @@ +// Copyright (c) 2026 Huawei Technologies Co., Ltd. +// This program is free software, you can redistribute it and/or modify it under the terms and conditions of +// CANN Open Software License Agreement Version 2.0 (the "License"). +// Please refer to the License for details. You may not use this file except in compliance with the License. +// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +// See LICENSE in the root of the software repository for the full text of the License. + +// TileLang ST kernels for pto.trem: tload(a) + tload(b) + trem(a, b, tmp)->c + tstore(c). +// Multiple cases with different shapes in a single module. +// Compiled by ptoas --enable-insert-sync --enable-tile-op-expand --pto-backend=vpto +// to produce a fatobj object. 
+ +module attributes {pto.target_arch = "a5", pto.kernel_kind = #pto.kernel_kind} { + // Case 0: f32 16x64 (1024 elements) + func.func @TREM_f32_16x64(%a_ptr: !pto.ptr, %b_ptr: !pto.ptr, %c_ptr: !pto.ptr) attributes {pto.kernel} { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c16 = arith.constant 16 : index + %c64 = arith.constant 64 : index + %c1024 = arith.constant 1024 : index + + %a_view = pto.make_tensor_view %a_ptr, + shape = [%c1, %c1, %c1, %c16, %c64], + strides = [%c1024, %c1024, %c1024, %c64, %c1] + : !pto.tensor_view<1x1x1x16x64xf32> + %b_view = pto.make_tensor_view %b_ptr, + shape = [%c1, %c1, %c1, %c16, %c64], + strides = [%c1024, %c1024, %c1024, %c64, %c1] + : !pto.tensor_view<1x1x1x16x64xf32> + %c_view = pto.make_tensor_view %c_ptr, + shape = [%c1, %c1, %c1, %c16, %c64], + strides = [%c1024, %c1024, %c1024, %c64, %c1] + : !pto.tensor_view<1x1x1x16x64xf32> + + %a_part = pto.partition_view %a_view, + offsets = [%c0, %c0, %c0, %c0, %c0], + sizes = [%c1, %c1, %c1, %c16, %c64] + : !pto.tensor_view<1x1x1x16x64xf32> -> !pto.partition_tensor_view<1x1x1x16x64xf32> + %b_part = pto.partition_view %b_view, + offsets = [%c0, %c0, %c0, %c0, %c0], + sizes = [%c1, %c1, %c1, %c16, %c64] + : !pto.tensor_view<1x1x1x16x64xf32> -> !pto.partition_tensor_view<1x1x1x16x64xf32> + %c_part = pto.partition_view %c_view, + offsets = [%c0, %c0, %c0, %c0, %c0], + sizes = [%c1, %c1, %c1, %c16, %c64] + : !pto.tensor_view<1x1x1x16x64xf32> -> !pto.partition_tensor_view<1x1x1x16x64xf32> + + %a = pto.alloc_tile + : !pto.tile_buf + %b = pto.alloc_tile + : !pto.tile_buf + %tmp = pto.alloc_tile + : !pto.tile_buf + %c = pto.alloc_tile + : !pto.tile_buf + + pto.tload ins(%a_part : !pto.partition_tensor_view<1x1x1x16x64xf32>) + outs(%a : !pto.tile_buf) + pto.tload ins(%b_part : !pto.partition_tensor_view<1x1x1x16x64xf32>) + outs(%b : !pto.tile_buf) + + pto.trem ins(%a, %b, %tmp : !pto.tile_buf, + !pto.tile_buf, + !pto.tile_buf) + outs(%c : !pto.tile_buf) + + pto.tstore 
ins(%c : !pto.tile_buf) + outs(%c_part : !pto.partition_tensor_view<1x1x1x16x64xf32>) + return + } + + // Case 1: f32 32x32 (1024 elements) + + func.func @TREM_f32_32x32(%a_ptr: !pto.ptr, %b_ptr: !pto.ptr, %c_ptr: !pto.ptr) attributes {pto.kernel} { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c32 = arith.constant 32 : index + %c1024 = arith.constant 1024 : index + + %a_view = pto.make_tensor_view %a_ptr, + shape = [%c1, %c1, %c1, %c32, %c32], + strides = [%c1024, %c1024, %c1024, %c32, %c1] + : !pto.tensor_view<1x1x1x32x32xf32> + %b_view = pto.make_tensor_view %b_ptr, + shape = [%c1, %c1, %c1, %c32, %c32], + strides = [%c1024, %c1024, %c1024, %c32, %c1] + : !pto.tensor_view<1x1x1x32x32xf32> + %c_view = pto.make_tensor_view %c_ptr, + shape = [%c1, %c1, %c1, %c32, %c32], + strides = [%c1024, %c1024, %c1024, %c32, %c1] + : !pto.tensor_view<1x1x1x32x32xf32> + + %a_part = pto.partition_view %a_view, + offsets = [%c0, %c0, %c0, %c0, %c0], + sizes = [%c1, %c1, %c1, %c32, %c32] + : !pto.tensor_view<1x1x1x32x32xf32> -> !pto.partition_tensor_view<1x1x1x32x32xf32> + %b_part = pto.partition_view %b_view, + offsets = [%c0, %c0, %c0, %c0, %c0], + sizes = [%c1, %c1, %c1, %c32, %c32] + : !pto.tensor_view<1x1x1x32x32xf32> -> !pto.partition_tensor_view<1x1x1x32x32xf32> + %c_part = pto.partition_view %c_view, + offsets = [%c0, %c0, %c0, %c0, %c0], + sizes = [%c1, %c1, %c1, %c32, %c32] + : !pto.tensor_view<1x1x1x32x32xf32> -> !pto.partition_tensor_view<1x1x1x32x32xf32> + + %a = pto.alloc_tile + : !pto.tile_buf + %b = pto.alloc_tile + : !pto.tile_buf + %tmp = pto.alloc_tile + : !pto.tile_buf + %c = pto.alloc_tile + : !pto.tile_buf + + pto.tload ins(%a_part : !pto.partition_tensor_view<1x1x1x32x32xf32>) + outs(%a : !pto.tile_buf) + pto.tload ins(%b_part : !pto.partition_tensor_view<1x1x1x32x32xf32>) + outs(%b : !pto.tile_buf) + + pto.trem ins(%a, %b, %tmp : !pto.tile_buf, + !pto.tile_buf, + !pto.tile_buf) + outs(%c : !pto.tile_buf) + + pto.tstore ins(%c : 
!pto.tile_buf) + outs(%c_part : !pto.partition_tensor_view<1x1x1x32x32xf32>) + return + } +} diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/trems/CMakeLists.txt b/test/tilelang_st/npu/a5/src/st/smoke/testcase/trems/CMakeLists.txt new file mode 100644 index 000000000..3a21d2c4c --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/trems/CMakeLists.txt @@ -0,0 +1,9 @@ +# Copyright (c) 2026 Huawei Technologies Co., Ltd. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. + +pto_tilelang_vec_st(trems) diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/trems/cases.py b/test/tilelang_st/npu/a5/src/st/smoke/testcase/trems/cases.py new file mode 100644 index 000000000..1ad2f5007 --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/trems/cases.py @@ -0,0 +1,63 @@ +#!/usr/bin/python3 +# Copyright (c) 2026 Huawei Technologies Co., Ltd. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. 
+# See LICENSE in the root of the software repository for the full text of the License. +# Copyright (c) 2026 Huawei Technologies Co., Ltd. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. + +# coding=utf-8 + +"""Single source of truth for trems ST test cases. + +trems: integer remainder via vdiv, dst = src - trunc(src/scalar) * scalar. +The cases listed below cover f32 and f16 only. +""" + +import numpy as np + +CASES = [ + { + "name": "f32_32x64", + "dtype": np.float32, + "shape": (32, 64), + "valid_shape": (32, 64), + "eps": 1e-6, + }, + { + "name": "f16_63x64", + "dtype": np.float16, + "shape": (63, 64), + "valid_shape": (63, 64), + "eps": 1e-3, + }, + { + "name": "f32_7x448", + "dtype": np.float32, + "shape": (7, 448), + "valid_shape": (7, 448), + "eps": 1e-6, + }, + { + "name": "f32_256x16", + "dtype": np.float32, + "shape": (256, 16), + "valid_shape": (256, 16), + "eps": 1e-6, + }, +] + +_SMOKE_CASE_NAMES = ['f32_32x64', 'f16_63x64'] +_SMOKE_CASE_NAME_SET = set(_SMOKE_CASE_NAMES) +_missing = [name for name in _SMOKE_CASE_NAMES if name not in {case["name"] for case in CASES}] +if _missing: + raise RuntimeError("unknown smoke case(s): " + ", ".join(_missing)) +CASES = [case for case in CASES if case["name"] in _SMOKE_CASE_NAME_SET] diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/trems/compare.py b/test/tilelang_st/npu/a5/src/st/smoke/testcase/trems/compare.py new file mode 100644 index 000000000..18835ae9f --- /dev/null +++ 
b/test/tilelang_st/npu/a5/src/st/smoke/testcase/trems/compare.py @@ -0,0 +1,56 @@ +#!/usr/bin/python3 +# Copyright (c) 2026 Huawei Technologies Co., Ltd. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. +# Copyright (c) 2026 Huawei Technologies Co., Ltd. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. 
+ +# coding=utf-8 + +import os +import sys +import numpy as np + +from cases import CASES +from st_common import result_cmp, style_fail, style_pass, validate_cases + + +def main(): + validate_cases(CASES) + case_filter = sys.argv[1] if len(sys.argv) > 1 else None + + all_passed = True + for case in CASES: + if case_filter is not None and case["name"] != case_filter: + continue + + case_dir = case["name"] + shape = case["shape"] + vr, vc = case["valid_shape"] + + golden = np.fromfile(os.path.join(case_dir, "golden.bin"), dtype=case["dtype"]).reshape(shape) + output = np.fromfile(os.path.join(case_dir, "output.bin"), dtype=case["dtype"]).reshape(shape) + + ok = result_cmp(golden[:vr, :vc], output[:vr, :vc], case["eps"]) + if ok: + print(style_pass(f"[INFO] {case['name']}: compare passed")) + else: + print(style_fail(f"[ERROR] {case['name']}: compare failed")) + all_passed = False + + if not all_passed: + sys.exit(2) + print(style_pass("[INFO] all cases passed")) + + +if __name__ == "__main__": + main() diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/trems/gen_data.py b/test/tilelang_st/npu/a5/src/st/smoke/testcase/trems/gen_data.py new file mode 100644 index 000000000..02fcf6165 --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/trems/gen_data.py @@ -0,0 +1,46 @@ +#!/usr/bin/python3 +# Copyright (c) 2026 Huawei Technologies Co., Ltd. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. 
+# Copyright (c) 2026 Huawei Technologies Co., Ltd. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. + +# coding=utf-8 + +import numpy as np +from cases import CASES +from st_common import validate_cases, setup_case_rng, save_case_data + +# Scalar value for remainder (matches the scalar passed in launch.cpp) +SCALAR = 3.0 + +validate_cases(CASES) + +for case in CASES: + setup_case_rng(case) + + dtype = case["dtype"] + shape = case["shape"] + valid_shape = case["valid_shape"] + + input1 = np.random.randint(1, 10, size=shape).astype(dtype) + + golden = np.zeros(shape, dtype=dtype) + vr, vc = valid_shape + scalar_val = dtype(SCALAR) + if np.issubdtype(dtype, np.floating): + golden[:vr, :vc] = (input1[:vr, :vc] - np.trunc(input1[:vr, :vc] / scalar_val) * scalar_val).astype(dtype, copy=False) + else: + golden[:vr, :vc] = (input1[:vr, :vc] % scalar_val).astype(dtype, copy=False) + + save_case_data(case["name"], {"input1": input1, "golden": golden}) + print(f"[INFO] gen_data: {case['name']} shape={shape} valid_shape={valid_shape} dtype={dtype.__name__} scalar={SCALAR}") diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/trems/launch.cpp b/test/tilelang_st/npu/a5/src/st/smoke/testcase/trems/launch.cpp new file mode 100644 index 000000000..ea1571091 --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/trems/launch.cpp @@ -0,0 +1,31 @@ +// Copyright (c) 2026 Huawei Technologies Co., Ltd. 
+// This program is free software, you can redistribute it and/or modify it under the terms and conditions of +// CANN Open Software License Agreement Version 2.0 (the "License"). +// Please refer to the License for details. You may not use this file except in compliance with the License. +// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +// See LICENSE in the root of the software repository for the full text of the License. + +#include + +#ifndef AICORE +#define AICORE [aicore] +#endif + +// Scalar value for remainder (must match gen_data.py SCALAR) +static constexpr float TREMS_SCALAR_F32 = 3.0f; + +// Case 0: f32 32x64 + +extern "C" __global__ AICORE void TREMS_f32_32x64(__gm__ float *src, __gm__ float *dst, float scalar); +extern "C" __global__ AICORE void TREMS_f16_63x64(__gm__ unsigned short *src, __gm__ unsigned short *dst, unsigned short scalar); + +void LaunchTREMS_f32_32x64(float *src, float *dst, void *stream) { + TREMS_f32_32x64<<<1, nullptr, stream>>>((__gm__ float *)src, (__gm__ float *)dst, TREMS_SCALAR_F32); +} + + + +void LaunchTREMS_f16_63x64(unsigned short *src, unsigned short *dst, void *stream) { + TREMS_f16_63x64<<<1, nullptr, stream>>>((__gm__ unsigned short *)src, (__gm__ unsigned short *)dst, (unsigned short)0x4200); +} diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/trems/main.cpp b/test/tilelang_st/npu/a5/src/st/smoke/testcase/trems/main.cpp new file mode 100644 index 000000000..3a61972ad --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/trems/main.cpp @@ -0,0 +1,132 @@ +// Copyright (c) 2026 Huawei Technologies Co., Ltd. +// This program is free software, you can redistribute it and/or modify it under the terms and conditions of +// CANN Open Software License Agreement Version 2.0 (the "License"). +// Please refer to the License for details. 
You may not use this file except in compliance with the License. +// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +// See LICENSE in the root of the software repository for the full text of the License. + +// Host driver for TileLang trems ST — case-table driven. +// trems: dst = src - trunc(src/scalar) * scalar (integer remainder via vdiv). +// Numerical comparison is done externally by compare.py. + +#include "acl/acl.h" +#include "test_common.h" +#include +#include +#include +#include +#include +#include + +using namespace PtoTestCommon; + +// Kernel launch wrappers (defined in launch.cpp) +void LaunchTREMS_f32_32x64(float *src, float *dst, void *stream); +void LaunchTREMS_f16_63x64(uint16_t *src, uint16_t *dst, void *stream); +void LaunchTREMS_f32_256x16(float *src, float *dst, void *stream); + +struct TestCase { + const char *name; + void (*launch)(void *, void *, void *); // src, dst, stream + size_t rows; // allocated tile rows + size_t cols; // allocated tile cols + size_t validRows; // effective computation rows (<= rows) + size_t validCols; // effective computation cols (<= cols) + size_t elemSize; // bytes per element +}; + +static const TestCase kCases[] = { +{"f32_32x64", (void (*)(void*,void*,void*))LaunchTREMS_f32_32x64, 32, 64, 32, 64, sizeof(float)}, +{"f16_63x64", (void (*)(void*,void*,void*))LaunchTREMS_f16_63x64, 63, 64, 63, 64, sizeof(uint16_t)}, +}; +static constexpr size_t kNumCases = sizeof(kCases) / sizeof(kCases[0]); + +static int RunCase(const TestCase &tc, int deviceId, aclrtStream stream) { + int rc = 0; + const size_t elemCount = tc.rows * tc.cols; + const size_t fileSize = elemCount * tc.elemSize; + + std::printf("[INFO] === case: %s (shape=%zux%zu, valid=%zux%zu) ===\n", + tc.name, tc.rows, tc.cols, tc.validRows, tc.validCols); + + // Per-case data directory + 
std::string caseDir = std::string("./") + tc.name; + size_t srcFileSize = fileSize; + + void *srcHost = nullptr, *dstHost = nullptr; + void *srcDevice = nullptr, *dstDevice = nullptr; + + aclrtMallocHost(&srcHost, fileSize); + aclrtMallocHost(&dstHost, fileSize); + + aclrtMalloc(&srcDevice, fileSize, ACL_MEM_MALLOC_HUGE_FIRST); + aclrtMalloc(&dstDevice, fileSize, ACL_MEM_MALLOC_HUGE_FIRST); + + if (!ReadFile((caseDir + "/input1.bin").c_str(), srcFileSize, srcHost, fileSize)) { + std::fprintf(stderr, "[ERROR] failed to read %s/input1.bin\n", caseDir.c_str()); + rc = 1; + } + + if (rc == 0) { + aclrtMemcpy(srcDevice, fileSize, srcHost, fileSize, ACL_MEMCPY_HOST_TO_DEVICE); + + tc.launch(srcDevice, dstDevice, stream); + + aclrtSynchronizeStream(stream); + aclrtMemcpy(dstHost, fileSize, dstDevice, fileSize, ACL_MEMCPY_DEVICE_TO_HOST); + } + + if (rc == 0 && !WriteFile((caseDir + "/output.bin").c_str(), dstHost, fileSize)) { + std::fprintf(stderr, "[ERROR] failed to write %s/output.bin\n", caseDir.c_str()); + rc = 1; + } + + if (srcDevice != nullptr) + aclrtFree(srcDevice); + if (dstDevice != nullptr) + aclrtFree(dstDevice); + if (srcHost != nullptr) + aclrtFreeHost(srcHost); + if (dstHost != nullptr) + aclrtFreeHost(dstHost); + + if (rc == 0) + std::printf("[INFO] case %s done\n", tc.name); + return rc; +} + +int main(int argc, char *argv[]) { + // Optional case filter: ./trems [case_name] + const char *caseFilter = (argc > 1) ? 
argv[1] : nullptr; + + int rc = 0; + int deviceId = 0; + aclrtStream stream = nullptr; + + aclInit(nullptr); + if (const char *envDevice = std::getenv("ACL_DEVICE_ID")) { + deviceId = std::atoi(envDevice); + } + aclrtSetDevice(deviceId); + aclrtCreateStream(&stream); + + for (size_t i = 0; i < kNumCases; ++i) { + if (caseFilter != nullptr && std::strcmp(kCases[i].name, caseFilter) != 0) { + continue; + } + int ret = RunCase(kCases[i], deviceId, stream); + if (ret != 0) { + std::fprintf(stderr, "[ERROR] case %s failed\n", kCases[i].name); + rc = 1; + break; + } + } + + if (stream != nullptr) + aclrtDestroyStream(stream); + aclrtResetDevice(deviceId); + aclFinalize(); + + return rc; +} diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/trems/trems.pto b/test/tilelang_st/npu/a5/src/st/smoke/testcase/trems/trems.pto new file mode 100644 index 000000000..3ed820cf7 --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/trems/trems.pto @@ -0,0 +1,108 @@ +// Copyright (c) 2026 Huawei Technologies Co., Ltd. +// This program is free software, you can redistribute it and/or modify it under the terms and conditions of +// CANN Open Software License Agreement Version 2.0 (the "License"). +// Please refer to the License for details. You may not use this file except in compliance with the License. +// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +// See LICENSE in the root of the software repository for the full text of the License. + +// TileLang ST kernels for pto.trems: tload(src) + trems(src, scalar, tmp)->dst + tstore(dst). +// Integer remainder via vdiv: dst = src - trunc(src/scalar) * scalar. +// Types covered: f32, f16 (i32/i16 cases are skipped: vdiv does not support integer types on A5 hardware). +// Compiled by ptoas --enable-insert-sync --enable-tile-op-expand --pto-backend=vpto +// to produce a fatobj object. 
+ +module attributes {pto.target_arch = "a5", pto.kernel_kind = #pto.kernel_kind} { + + // Case 0: f32 32x64 (2048 elements) + func.func @TREMS_f32_32x64(%src_ptr: !pto.ptr, %dst_ptr: !pto.ptr, %scalar: f32) attributes {pto.kernel} { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c32 = arith.constant 32 : index + %c64 = arith.constant 64 : index + %c2048 = arith.constant 2048 : index + + %src_view = pto.make_tensor_view %src_ptr, + shape = [%c1, %c1, %c1, %c32, %c64], + strides = [%c2048, %c2048, %c2048, %c64, %c1] + : !pto.tensor_view<1x1x1x32x64xf32> + %dst_view = pto.make_tensor_view %dst_ptr, + shape = [%c1, %c1, %c1, %c32, %c64], + strides = [%c2048, %c2048, %c2048, %c64, %c1] + : !pto.tensor_view<1x1x1x32x64xf32> + + %src_part = pto.partition_view %src_view, + offsets = [%c0, %c0, %c0, %c0, %c0], + sizes = [%c1, %c1, %c1, %c32, %c64] + : !pto.tensor_view<1x1x1x32x64xf32> -> !pto.partition_tensor_view<1x1x1x32x64xf32> + %dst_part = pto.partition_view %dst_view, + offsets = [%c0, %c0, %c0, %c0, %c0], + sizes = [%c1, %c1, %c1, %c32, %c64] + : !pto.tensor_view<1x1x1x32x64xf32> -> !pto.partition_tensor_view<1x1x1x32x64xf32> + + %src = pto.alloc_tile + : !pto.tile_buf + %dst = pto.alloc_tile + : !pto.tile_buf + %tmp = pto.alloc_tile + : !pto.tile_buf + + pto.tload ins(%src_part : !pto.partition_tensor_view<1x1x1x32x64xf32>) + outs(%src : !pto.tile_buf) + pto.trems ins(%src, %scalar, %tmp : !pto.tile_buf, f32, + !pto.tile_buf) + outs(%dst : !pto.tile_buf) + pto.tstore ins(%dst : !pto.tile_buf) + outs(%dst_part : !pto.partition_tensor_view<1x1x1x32x64xf32>) + return + } + + // Case 1: f16 63x64 (4032 elements) + + func.func @TREMS_f16_63x64(%src_ptr: !pto.ptr, %dst_ptr: !pto.ptr, %scalar: f16) attributes {pto.kernel} { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c63 = arith.constant 63 : index + %c64 = arith.constant 64 : index + %c4032 = arith.constant 4032 : index + + %src_view = pto.make_tensor_view %src_ptr, + shape 
= [%c1, %c1, %c1, %c63, %c64], + strides = [%c4032, %c4032, %c4032, %c64, %c1] + : !pto.tensor_view<1x1x1x63x64xf16> + %dst_view = pto.make_tensor_view %dst_ptr, + shape = [%c1, %c1, %c1, %c63, %c64], + strides = [%c4032, %c4032, %c4032, %c64, %c1] + : !pto.tensor_view<1x1x1x63x64xf16> + + %src_part = pto.partition_view %src_view, + offsets = [%c0, %c0, %c0, %c0, %c0], + sizes = [%c1, %c1, %c1, %c63, %c64] + : !pto.tensor_view<1x1x1x63x64xf16> -> !pto.partition_tensor_view<1x1x1x63x64xf16> + %dst_part = pto.partition_view %dst_view, + offsets = [%c0, %c0, %c0, %c0, %c0], + sizes = [%c1, %c1, %c1, %c63, %c64] + : !pto.tensor_view<1x1x1x63x64xf16> -> !pto.partition_tensor_view<1x1x1x63x64xf16> + + %src = pto.alloc_tile + : !pto.tile_buf + %dst = pto.alloc_tile + : !pto.tile_buf + %tmp = pto.alloc_tile + : !pto.tile_buf + + pto.tload ins(%src_part : !pto.partition_tensor_view<1x1x1x63x64xf16>) + outs(%src : !pto.tile_buf) + pto.trems ins(%src, %scalar, %tmp : !pto.tile_buf, f16, + !pto.tile_buf) + outs(%dst : !pto.tile_buf) + pto.tstore ins(%dst : !pto.tile_buf) + outs(%dst_part : !pto.partition_tensor_view<1x1x1x63x64xf16>) + return + } + + // Case 2: i32 31x128 (3968 elements) - SKIPPED: vdiv does not support integer types on A5 hardware + // Case 3: i16 15x192 (2880 elements) - SKIPPED: vdiv does not support integer types on A5 hardware + + // Case 4: f32 7x448 (3136 elements) +} diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/trowargmax/CMakeLists.txt b/test/tilelang_st/npu/a5/src/st/smoke/testcase/trowargmax/CMakeLists.txt new file mode 100644 index 000000000..42aec9129 --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/trowargmax/CMakeLists.txt @@ -0,0 +1,9 @@ +# Copyright (c) 2026 Huawei Technologies Co., Ltd. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. 
You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. + +pto_tilelang_vec_st(trowargmax) diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/trowargmax/cases.py b/test/tilelang_st/npu/a5/src/st/smoke/testcase/trowargmax/cases.py new file mode 100644 index 000000000..be6b44ed5 --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/trowargmax/cases.py @@ -0,0 +1,222 @@ +#!/usr/bin/python3 +# Copyright (c) 2026 Huawei Technologies Co., Ltd. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. 
# coding=utf-8

"""Single source of truth for trowargmax ST test cases — aligned with pto-isa."""

import numpy as np


def _case(name, dtype, dst_dtype, shape, valid_shape):
    """Build one case record.

    Args:
        name: Case identifier; also used as the per-case data directory name.
        dtype: NumPy type of the source tile elements.
        dst_dtype: NumPy type of the produced indices.
        shape: ``(rows, cols)`` of the source data.
        valid_shape: ``(rows, cols)`` of the region the reduction runs over.

    Returns:
        A dict with the keys ``name``, ``dtype``, ``dst_dtype``, ``shape``,
        ``valid_shape`` and ``eps``. ``eps`` is 0 for every case in this file.
    """
    return {
        "name": name,
        "dtype": dtype,
        "dst_dtype": dst_dtype,
        "shape": shape,
        "valid_shape": valid_shape,
        "eps": 0,
    }


CASES = [
    # uint32_dst + float32_src
    _case("uint32_float_8x1_8x8_8x8", np.float32, np.uint32, (8, 8), (8, 8)),
    _case("uint32_float_1024x1_1024x8_1024x8", np.float32, np.uint32, (1024, 8), (1024, 8)),
    _case("uint32_float_16x1_13x16_13x13", np.float32, np.uint32, (13, 16), (13, 13)),
    _case("uint32_float_1024x1_1023x24_1023x17", np.float32, np.uint32, (1023, 24), (1023, 17)),
    _case("uint32_float_8x1_8x64_8x64", np.float32, np.uint32, (8, 64), (8, 64)),
    _case("uint32_float_264x1_260x64_260x64", np.float32, np.uint32, (260, 64), (260, 64)),
    _case("uint32_float_8x1_1x128_1x128", np.float32, np.uint32, (1, 128), (1, 128)),
    _case("uint32_float_64x1_32x128_32x128", np.float32, np.uint32, (32, 128), (32, 128)),
    _case("uint32_float_8x1_3x4096_3x4095", np.float32, np.uint32, (3, 4096), (3, 4095)),
    _case("uint32_float_8x1_2x16384_2x16381", np.float32, np.uint32, (2, 16384), (2, 16381)),
    # uint32_dst + float16_src
    _case("uint32_half_16x1_2x16_2x16", np.float16, np.uint32, (2, 16), (2, 16)),
    _case("uint32_half_16x1_13x16_13x13", np.float16, np.uint32, (13, 16), (13, 13)),
    _case("uint32_half_272x1_260x64_260x64", np.float16, np.uint32, (260, 64), (260, 64)),
    _case("uint32_half_16x1_3x8192_3x8191", np.float16, np.uint32, (3, 8192), (3, 8191)),
    _case("uint32_half_16x1_1x16384_1x16381", np.float16, np.uint32, (1, 16384), (1, 16381)),
    _case("uint32_half_16x1_1x32768_1x32761", np.float16, np.uint32, (1, 32768), (1, 32761)),
    # int32_dst + float32_src
    _case("int32_float_16x1_13x16_13x13", np.float32, np.int32, (13, 16), (13, 13)),
    # int32_dst + float16_src
    _case("int32_half_16x1_13x16_13x13", np.float16, np.int32, (13, 16), (13, 13)),
    # uint32_dst + float32_src (dst col > 1)
    _case("uint32_float_3x8_3x3480_3x3473", np.float32, np.uint32, (3, 3480), (3, 3473)),
    _case("uint32_float_260x8_260x64_260x64", np.float32, np.uint32, (260, 64), (260, 64)),
    _case("uint32_float_1023x8_1023x24_1023x17", np.float32, np.uint32, (1023, 24), (1023, 17)),
    # uint32_dst + float16_src (dst col > 1)
    _case("uint32_half_3x16_3x3488_3x3473", np.float16, np.uint32, (3, 3488), (3, 3473)),
    _case("uint32_half_260x16_260x64_260x64", np.float16, np.uint32, (260, 64), (260, 64)),
    _case("uint32_half_1023x16_1023x32_1023x17", np.float16, np.uint32, (1023, 32), (1023, 17)),
]

# Smoke subset: only these cases stay in CASES after the filter below.
_SMOKE_CASE_NAMES = ['uint32_float_8x1_8x8_8x8', 'int32_half_16x1_13x16_13x13']
_SMOKE_CASE_NAME_SET = set(_SMOKE_CASE_NAMES)
# Build the set of known names once instead of once per smoke name.
_KNOWN_CASE_NAMES = {case["name"] for case in CASES}
_missing = [name for name in _SMOKE_CASE_NAMES if name not in _KNOWN_CASE_NAMES]
if _missing:
    # Fail at import time so a renamed or removed case cannot silently shrink the smoke run.
    raise RuntimeError("unknown smoke case(s): " + ", ".join(_missing))
CASES = [case for case in CASES if case["name"] in _SMOKE_CASE_NAME_SET]
# coding=utf-8

import os
import sys
import numpy as np

from cases import CASES
from st_common import result_cmp, style_fail, style_pass, validate_cases


def _load_first_column(path, dtype, rows, count=-1):
    """Read a flat binary file as ``rows`` equal-length rows and return column 0.

    Args:
        path: File to read.
        dtype: NumPy element type stored in the file.
        rows: Number of rows the data is split into.
        count: Maximum number of elements to read; ``-1`` reads the whole file.

    Returns:
        An array of shape ``(rows, 1)``, or ``None`` when the file cannot be
        read, is empty, or does not hold a whole number of rows.
    """
    try:
        flat = np.fromfile(path, dtype=dtype, count=count)
    except OSError:
        return None
    if rows <= 0 or flat.size == 0 or flat.size % rows != 0:
        return None
    return flat.reshape(rows, flat.size // rows)[:, 0:1]


def main():
    """Compare ``output.bin`` with ``golden.bin`` for every selected case.

    An optional first command-line argument restricts the run to the case with
    that exact name. The process exits with status 2 when any compared case
    fails, including cases whose data files are missing or malformed.
    """
    validate_cases(CASES)
    case_filter = sys.argv[1] if len(sys.argv) > 1 else None

    all_passed = True
    for case in CASES:
        if case_filter is not None and case["name"] != case_filter:
            continue

        case_dir = case["name"]
        valid_rows = case["valid_shape"][0]

        # golden.bin is read as exactly one value per valid row.
        golden = _load_first_column(
            os.path.join(case_dir, "golden.bin"), case["dst_dtype"], valid_rows, count=valid_rows
        )
        # output.bin may hold several columns per row; only column 0 is compared.
        output = _load_first_column(os.path.join(case_dir, "output.bin"), case["dst_dtype"], valid_rows)

        if golden is None or output is None:
            # Report bad data as a failed case instead of aborting the run with a traceback.
            print(style_fail(f"[ERROR] {case['name']}: golden.bin/output.bin missing or malformed"))
            all_passed = False
            continue

        ok = result_cmp(golden, output, case["eps"])
        if ok:
            print(style_pass(f"[INFO] {case['name']}: compare passed"))
        else:
            print(style_fail(f"[ERROR] {case['name']}: compare failed"))
            all_passed = False

    if not all_passed:
        sys.exit(2)
    print(style_pass("[INFO] all cases passed"))


if __name__ == "__main__":
    main()
# See LICENSE in the root of the software repository for the full text of the License.

# coding=utf-8

import numpy as np
from cases import CASES
from st_common import validate_cases, setup_case_rng, save_case_data

validate_cases(CASES)

# For each case: generate a random input tile and the per-row argmax golden, then save both.
for case in CASES:
    setup_case_rng(case)

    dtype = case["dtype"]
    dst_dtype = case["dst_dtype"]
    shape = case["shape"]
    valid_shape = case["valid_shape"]
    valid_rows, valid_cols = valid_shape

    if dtype in (np.int8, np.uint8, np.int16, np.uint16, np.int32, np.uint32):
        dtype_info = np.iinfo(dtype)
        input1 = np.random.randint(dtype_info.min, dtype_info.max, size=shape).astype(dtype)
    else:
        dtype_info = np.finfo(dtype)
        input1 = np.random.uniform(low=dtype_info.min, high=dtype_info.max, size=shape).astype(dtype)

    # Reduce over the valid region only, on both axes. Slicing the rows keeps the
    # golden at (valid_rows, 1) even when the input has more rows than are valid.
    golden = np.argmax(input1[:valid_rows, :valid_cols], axis=1, keepdims=True).astype(dst_dtype)

    save_case_data(case["name"], {"input1": input1, "golden": golden})
    print(f"[INFO] gen_data: {case['name']} shape={shape} valid_shape={valid_shape} dtype={dtype.__name__}")
+// See LICENSE in the root of the software repository for the full text of the License. + +#include + +#ifndef AICORE +#define AICORE [aicore] +#endif + +extern "C" __global__ AICORE void TROWARGMAX_uint32_float_8x1_8x8_8x8(__gm__ float *src, __gm__ uint32_t *dst); +extern "C" __global__ AICORE void TROWARGMAX_int32_half_16x1_13x16_13x13(__gm__ uint16_t *src, __gm__ int32_t *dst); + +void LaunchTROWARGMAX_int32_half_16x1_13x16_13x13(uint16_t *src, int32_t *dst, void *stream) { + TROWARGMAX_int32_half_16x1_13x16_13x13<<<1, nullptr, stream>>>((__gm__ uint16_t *)src, (__gm__ int32_t *)dst); +} + + + +void LaunchTROWARGMAX_uint32_float_8x1_8x8_8x8(float *src, uint32_t *dst, void *stream) { + TROWARGMAX_uint32_float_8x1_8x8_8x8<<<1, nullptr, stream>>>((__gm__ float *)src, (__gm__ uint32_t *)dst); +} diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/trowargmax/main.cpp b/test/tilelang_st/npu/a5/src/st/smoke/testcase/trowargmax/main.cpp new file mode 100644 index 000000000..157fd961e --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/trowargmax/main.cpp @@ -0,0 +1,177 @@ +// Copyright (c) 2026 Huawei Technologies Co., Ltd. +// This program is free software, you can redistribute it and/or modify it under the terms and conditions of +// CANN Open Software License Agreement Version 2.0 (the "License"). +// Please refer to the License for details. You may not use this file except in compliance with the License. +// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +// See LICENSE in the root of the software repository for the full text of the License. + +// Host driver for TileLang trowargmax ST — case-table driven. 
+ +#include "acl/acl.h" +#include "test_common.h" +#include +#include +#include +#include +#include +#include + +using namespace PtoTestCommon; + +// Kernel launch wrappers (defined in launch.cpp) +void LaunchTROWARGMAX_uint32_float_8x1_8x8_8x8(float *src, uint32_t *dst, void *stream); +void LaunchTROWARGMAX_uint32_float_1024x1_1024x8_1024x8(float *src, uint32_t *dst, void *stream); +void LaunchTROWARGMAX_uint32_float_1024x1_1023x24_1023x17(float *src, uint32_t *dst, void *stream); +void LaunchTROWARGMAX_uint32_float_264x1_260x64_260x64(float *src, uint32_t *dst, void *stream); +void LaunchTROWARGMAX_uint32_float_64x1_32x128_32x128(float *src, uint32_t *dst, void *stream); +void LaunchTROWARGMAX_uint32_float_8x1_2x16384_2x16381(float *src, uint32_t *dst, void *stream); +void LaunchTROWARGMAX_uint32_half_16x1_13x16_13x13(uint16_t *src, uint32_t *dst, void *stream); +void LaunchTROWARGMAX_uint32_half_16x1_3x8192_3x8191(uint16_t *src, uint32_t *dst, void *stream); +void LaunchTROWARGMAX_uint32_half_16x1_1x32768_1x32761(uint16_t *src, uint32_t *dst, void *stream); +void LaunchTROWARGMAX_int32_half_16x1_13x16_13x13(uint16_t *src, int32_t *dst, void *stream); +void LaunchTROWARGMAX_uint32_float_260x8_260x64_260x64(float *src, uint32_t *dst, void *stream); +void LaunchTROWARGMAX_uint32_half_3x16_3x3488_3x3473(uint16_t *src, uint32_t *dst, void *stream); +void LaunchTROWARGMAX_uint32_half_1023x16_1023x32_1023x17(uint16_t *src, uint32_t *dst, void *stream); + +using LaunchFnF32U32 = void (*)(float *, uint32_t *, void *); +using LaunchFnF16U32 = void (*)(uint16_t *, uint32_t *, void *); +using LaunchFnF32S32 = void (*)(float *, int32_t *, void *); +using LaunchFnF16S32 = void (*)(uint16_t *, int32_t *, void *); + +enum class DType { F32U32, F16U32, F32S32, F16S32 }; + +struct TestCase { + const char *name; + DType dtype; + union { + LaunchFnF32U32 launchF32U32; + LaunchFnF16U32 launchF16U32; + LaunchFnF32S32 launchF32S32; + LaunchFnF16S32 launchF16S32; + }; + size_t rows; // 
allocated tile rows + size_t cols; // allocated tile cols + size_t validRows; // effective computation rows (<= rows) + size_t validCols; // effective computation cols (<= cols) + size_t srcElemSize; // bytes per src element + size_t dstElemSize; // bytes per dst element + size_t dstCols; // dst tile cols +}; + +static const TestCase kCases[] = { +{"uint32_float_8x1_8x8_8x8", DType::F32U32, .launchF32U32 = LaunchTROWARGMAX_uint32_float_8x1_8x8_8x8, 8, 8, 8, 8, 4, 4, 1}, +{"int32_half_16x1_13x16_13x13", DType::F16S32, .launchF16S32 = LaunchTROWARGMAX_int32_half_16x1_13x16_13x13, 13, 16, 13, 13, 2, 4, 1}, +}; +static constexpr size_t kNumCases = sizeof(kCases) / sizeof(kCases[0]); + +static int RunCase(const TestCase &tc, int deviceId, aclrtStream stream) { + int rc = 0; + const size_t srcElemCount = tc.rows * tc.cols; + const size_t srcFileSize = srcElemCount * tc.srcElemSize; + const size_t dstElemCount = tc.validRows * tc.dstCols; + const size_t dstFileSize = dstElemCount * tc.dstElemSize; + + std::printf("[INFO] === case: %s (shape=%zux%zu, valid=%zux%zu) ===\n", + tc.name, tc.rows, tc.cols, tc.validRows, tc.validCols); + + // Per-case data directory + std::string caseDir = std::string("./") + tc.name; + size_t src0FileSize = srcFileSize; + + void *src0Host = nullptr, *dstHost = nullptr; + void *src0Device = nullptr, *dstDevice = nullptr; + + aclrtMallocHost(&src0Host, srcFileSize); + aclrtMallocHost(&dstHost, dstFileSize); + + aclrtMalloc(&src0Device, srcFileSize, ACL_MEM_MALLOC_HUGE_FIRST); + aclrtMalloc(&dstDevice, dstFileSize, ACL_MEM_MALLOC_HUGE_FIRST); + + if (rc == 0) { + aclrtMemset(dstDevice, dstFileSize, 0, dstFileSize); + } + + if (!ReadFile((caseDir + "/input1.bin").c_str(), src0FileSize, src0Host, srcFileSize)) { + std::fprintf(stderr, "[ERROR] failed to read %s/input1.bin\n", caseDir.c_str()); + rc = 1; + } + + if (rc == 0) { + aclrtMemcpy(src0Device, srcFileSize, src0Host, srcFileSize, ACL_MEMCPY_HOST_TO_DEVICE); + + switch (tc.dtype) { + case 
DType::F32U32: + tc.launchF32U32((float *)src0Device, (uint32_t *)dstDevice, stream); + break; + case DType::F16U32: + tc.launchF16U32((uint16_t *)src0Device, (uint32_t *)dstDevice, stream); + break; + case DType::F32S32: + tc.launchF32S32((float *)src0Device, (int32_t *)dstDevice, stream); + break; + case DType::F16S32: + tc.launchF16S32((uint16_t *)src0Device, (int32_t *)dstDevice, stream); + break; + } + + aclrtSynchronizeStream(stream); + aclrtMemcpy(dstHost, dstFileSize, dstDevice, dstFileSize, ACL_MEMCPY_DEVICE_TO_HOST); + } + + if (rc == 0) { + mkdir(caseDir.c_str(), 0755); + if (!WriteFile((caseDir + "/output.bin").c_str(), dstHost, dstFileSize)) { + std::fprintf(stderr, "[ERROR] failed to write %s/output.bin\n", caseDir.c_str()); + rc = 1; + } + } + + if (src0Device != nullptr) + aclrtFree(src0Device); + if (dstDevice != nullptr) + aclrtFree(dstDevice); + if (src0Host != nullptr) + aclrtFreeHost(src0Host); + if (dstHost != nullptr) + aclrtFreeHost(dstHost); + + if (rc == 0) + std::printf("[INFO] case %s done\n", tc.name); + return rc; +} + +int main(int argc, char *argv[]) { + // Optional case filter: ./trowargmax [case_name] + const char *caseFilter = (argc > 1) ? 
argv[1] : nullptr; + + int rc = 0; + int deviceId = 0; + aclrtStream stream = nullptr; + + aclInit(nullptr); + if (const char *envDevice = std::getenv("ACL_DEVICE_ID")) { + deviceId = std::atoi(envDevice); + } + aclrtSetDevice(deviceId); + aclrtCreateStream(&stream); + + for (size_t i = 0; i < kNumCases; ++i) { + if (caseFilter != nullptr && std::strcmp(kCases[i].name, caseFilter) != 0) { + continue; + } + int ret = RunCase(kCases[i], deviceId, stream); + if (ret != 0) { + std::fprintf(stderr, "[ERROR] case %s failed\n", kCases[i].name); + rc = 1; + break; + } + } + + if (stream != nullptr) + aclrtDestroyStream(stream); + aclrtResetDevice(deviceId); + aclFinalize(); + + return rc; +} diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/trowargmax/trowargmax.pto b/test/tilelang_st/npu/a5/src/st/smoke/testcase/trowargmax/trowargmax.pto new file mode 100644 index 000000000..972a1a45d --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/trowargmax/trowargmax.pto @@ -0,0 +1,89 @@ +// Auto-generated trowargmax ST testcases + +module attributes {pto.target_arch = "a5", pto.kernel_kind = #pto.kernel_kind} { + + func.func @TROWARGMAX_uint32_float_8x1_8x8_8x8(%src_ptr: !pto.ptr, %dst_ptr: !pto.ptr) attributes {pto.kernel} { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c8_r = arith.constant 8 : index + %c8_c = arith.constant 8 : index + %c64_se = arith.constant 64 : index + %c8_de = arith.constant 8 : index + %c1_dc = arith.constant 1 : index + %c8_vr = arith.constant 8 : index + %c8_vc = arith.constant 8 : index + %src_view = pto.make_tensor_view %src_ptr, + shape = [%c1, %c1, %c1, %c8_r, %c8_c], + strides = [%c64_se, %c64_se, %c64_se, %c8_c, %c1] + : !pto.tensor_view<1x1x1x8x8xf32> + %dst_view = pto.make_tensor_view %dst_ptr, + shape = [%c1, %c1, %c1, %c8_r, %c1_dc], + strides = [%c8_de, %c8_de, %c8_de, %c1_dc, %c1] + : !pto.tensor_view<1x1x1x8x1xui32> + %src_part = pto.partition_view %src_view, + offsets = [%c0, %c0, %c0, %c0, 
%c0], + sizes = [%c1, %c1, %c1, %c8_vr, %c8_vc] + : !pto.tensor_view<1x1x1x8x8xf32> -> !pto.partition_tensor_view<1x1x1x8x8xf32> + %dst_part = pto.partition_view %dst_view, + offsets = [%c0, %c0, %c0, %c0, %c0], + sizes = [%c1, %c1, %c1, %c8_vr, %c1] + : !pto.tensor_view<1x1x1x8x1xui32> -> !pto.partition_tensor_view<1x1x1x8x1xui32> + %src = pto.alloc_tile + : !pto.tile_buf + %tmp = pto.alloc_tile + : !pto.tile_buf + %dst = pto.alloc_tile + : !pto.tile_buf + pto.tload ins(%src_part : !pto.partition_tensor_view<1x1x1x8x8xf32>) + outs(%src : !pto.tile_buf) + pto.trowargmax ins(%src, %tmp : !pto.tile_buf, + !pto.tile_buf) + outs(%dst : !pto.tile_buf) + pto.tstore ins(%dst : !pto.tile_buf) + outs(%dst_part : !pto.partition_tensor_view<1x1x1x8x1xui32>) + return + } + + + func.func @TROWARGMAX_int32_half_16x1_13x16_13x13(%src_ptr: !pto.ptr, %dst_ptr: !pto.ptr) attributes {pto.kernel} { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c13_r = arith.constant 13 : index + %c16_c = arith.constant 16 : index + %c208_se = arith.constant 208 : index + %c13_de = arith.constant 13 : index + %c1_dc = arith.constant 1 : index + %c13_vr = arith.constant 13 : index + %c13_vc = arith.constant 13 : index + %src_view = pto.make_tensor_view %src_ptr, + shape = [%c1, %c1, %c1, %c13_r, %c16_c], + strides = [%c208_se, %c208_se, %c208_se, %c16_c, %c1] + : !pto.tensor_view<1x1x1x13x16xf16> + %dst_view = pto.make_tensor_view %dst_ptr, + shape = [%c1, %c1, %c1, %c13_r, %c1_dc], + strides = [%c13_de, %c13_de, %c13_de, %c1_dc, %c1] + : !pto.tensor_view<1x1x1x13x1xi32> + %src_part = pto.partition_view %src_view, + offsets = [%c0, %c0, %c0, %c0, %c0], + sizes = [%c1, %c1, %c1, %c13_vr, %c13_vc] + : !pto.tensor_view<1x1x1x13x16xf16> -> !pto.partition_tensor_view<1x1x1x13x13xf16> + %dst_part = pto.partition_view %dst_view, + offsets = [%c0, %c0, %c0, %c0, %c0], + sizes = [%c1, %c1, %c1, %c13_vr, %c1] + : !pto.tensor_view<1x1x1x13x1xi32> -> !pto.partition_tensor_view<1x1x1x13x1xi32> 
+ %src = pto.alloc_tile + : !pto.tile_buf + %tmp = pto.alloc_tile + : !pto.tile_buf + %dst = pto.alloc_tile + : !pto.tile_buf + pto.tload ins(%src_part : !pto.partition_tensor_view<1x1x1x13x13xf16>) + outs(%src : !pto.tile_buf) + pto.trowargmax ins(%src, %tmp : !pto.tile_buf, + !pto.tile_buf) + outs(%dst : !pto.tile_buf) + pto.tstore ins(%dst : !pto.tile_buf) + outs(%dst_part : !pto.partition_tensor_view<1x1x1x13x1xi32>) + return + } +} diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/trowargmin/CMakeLists.txt b/test/tilelang_st/npu/a5/src/st/smoke/testcase/trowargmin/CMakeLists.txt new file mode 100644 index 000000000..a6a8925b5 --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/trowargmin/CMakeLists.txt @@ -0,0 +1,9 @@ +# Copyright (c) 2026 Huawei Technologies Co., Ltd. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. + +pto_tilelang_vec_st(trowargmin) diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/trowargmin/cases.py b/test/tilelang_st/npu/a5/src/st/smoke/testcase/trowargmin/cases.py new file mode 100644 index 000000000..003913c26 --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/trowargmin/cases.py @@ -0,0 +1,222 @@ +#!/usr/bin/python3 +# Copyright (c) 2026 Huawei Technologies Co., Ltd. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). 
# Please refer to the License for details. You may not use this file except in compliance with the License.
# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
# See LICENSE in the root of the software repository for the full text of the License.

# coding=utf-8

"""Single source of truth for trowargmin ST test cases — aligned with pto-isa."""

import numpy as np


def _case(name, dtype, dst_dtype, shape, valid_shape):
    """Build one case record.

    Args:
        name: Case identifier; also used as the per-case data directory name.
        dtype: NumPy type of the source tile elements.
        dst_dtype: NumPy type of the produced indices.
        shape: ``(rows, cols)`` of the source data.
        valid_shape: ``(rows, cols)`` of the region the reduction runs over.

    Returns:
        A dict with the keys ``name``, ``dtype``, ``dst_dtype``, ``shape``,
        ``valid_shape`` and ``eps``. ``eps`` is 0 for every case in this file.
    """
    return {
        "name": name,
        "dtype": dtype,
        "dst_dtype": dst_dtype,
        "shape": shape,
        "valid_shape": valid_shape,
        "eps": 0,
    }


CASES = [
    # uint32_dst + float32_src
    _case("uint32_float_8x1_8x8_8x8", np.float32, np.uint32, (8, 8), (8, 8)),
    _case("uint32_float_1024x1_1024x8_1024x8", np.float32, np.uint32, (1024, 8), (1024, 8)),
    _case("uint32_float_16x1_13x16_13x13", np.float32, np.uint32, (13, 16), (13, 13)),
    _case("uint32_float_1024x1_1023x24_1023x17", np.float32, np.uint32, (1023, 24), (1023, 17)),
    _case("uint32_float_8x1_8x64_8x64", np.float32, np.uint32, (8, 64), (8, 64)),
    _case("uint32_float_264x1_260x64_260x64", np.float32, np.uint32, (260, 64), (260, 64)),
    _case("uint32_float_8x1_1x128_1x128", np.float32, np.uint32, (1, 128), (1, 128)),
    _case("uint32_float_64x1_32x128_32x128", np.float32, np.uint32, (32, 128), (32, 128)),
    _case("uint32_float_8x1_3x4096_3x4095", np.float32, np.uint32, (3, 4096), (3, 4095)),
    _case("uint32_float_8x1_2x16384_2x16381", np.float32, np.uint32, (2, 16384), (2, 16381)),
    # uint32_dst + float16_src
    _case("uint32_half_16x1_2x16_2x16", np.float16, np.uint32, (2, 16), (2, 16)),
    _case("uint32_half_16x1_13x16_13x13", np.float16, np.uint32, (13, 16), (13, 13)),
    _case("uint32_half_272x1_260x64_260x64", np.float16, np.uint32, (260, 64), (260, 64)),
    _case("uint32_half_16x1_3x8192_3x8191", np.float16, np.uint32, (3, 8192), (3, 8191)),
    _case("uint32_half_16x1_1x16384_1x16381", np.float16, np.uint32, (1, 16384), (1, 16381)),
    _case("uint32_half_16x1_1x32768_1x32761", np.float16, np.uint32, (1, 32768), (1, 32761)),
    # int32_dst + float32_src
    _case("int32_float_16x1_13x16_13x13", np.float32, np.int32, (13, 16), (13, 13)),
    # int32_dst + float16_src
    _case("int32_half_16x1_13x16_13x13", np.float16, np.int32, (13, 16), (13, 13)),
    # uint32_dst + float32_src (dst col > 1)
    _case("uint32_float_3x8_3x3480_3x3473", np.float32, np.uint32, (3, 3480), (3, 3473)),
    _case("uint32_float_260x8_260x64_260x64", np.float32, np.uint32, (260, 64), (260, 64)),
    _case("uint32_float_1023x8_1023x24_1023x17", np.float32, np.uint32, (1023, 24), (1023, 17)),
    # uint32_dst + float16_src (dst col > 1)
    _case("uint32_half_3x16_3x3488_3x3473", np.float16, np.uint32, (3, 3488), (3, 3473)),
    _case("uint32_half_260x16_260x64_260x64", np.float16, np.uint32, (260, 64), (260, 64)),
    _case("uint32_half_1023x16_1023x32_1023x17", np.float16, np.uint32, (1023, 32), (1023, 17)),
]

# Smoke subset: only these cases stay in CASES after the filter below.
_SMOKE_CASE_NAMES = ['uint32_float_8x1_8x8_8x8', 'int32_half_16x1_13x16_13x13']
_SMOKE_CASE_NAME_SET = set(_SMOKE_CASE_NAMES)
# Build the set of known names once instead of once per smoke name.
_KNOWN_CASE_NAMES = {case["name"] for case in CASES}
_missing = [name for name in _SMOKE_CASE_NAMES if name not in _KNOWN_CASE_NAMES]
if _missing:
    # Fail at import time so a renamed or removed case cannot silently shrink the smoke run.
    raise RuntimeError("unknown smoke case(s): " + ", ".join(_missing))
CASES = [case for case in CASES if case["name"] in _SMOKE_CASE_NAME_SET]
# See LICENSE in the root of the software repository for the full text of the License.

# coding=utf-8

import os
import sys
import numpy as np

from cases import CASES
from st_common import result_cmp, style_fail, style_pass, validate_cases


def _load_first_column(path, dtype, rows, count=-1):
    """Read a flat binary file as ``rows`` equal-length rows and return column 0.

    Args:
        path: File to read.
        dtype: NumPy element type stored in the file.
        rows: Number of rows the data is split into.
        count: Maximum number of elements to read; ``-1`` reads the whole file.

    Returns:
        An array of shape ``(rows, 1)``, or ``None`` when the file cannot be
        read, is empty, or does not hold a whole number of rows.
    """
    try:
        flat = np.fromfile(path, dtype=dtype, count=count)
    except OSError:
        return None
    if rows <= 0 or flat.size == 0 or flat.size % rows != 0:
        return None
    return flat.reshape(rows, flat.size // rows)[:, 0:1]


def main():
    """Compare ``output.bin`` with ``golden.bin`` for every selected case.

    An optional first command-line argument restricts the run to the case with
    that exact name. The process exits with status 2 when any compared case
    fails, including cases whose data files are missing or malformed.
    """
    validate_cases(CASES)
    case_filter = sys.argv[1] if len(sys.argv) > 1 else None

    all_passed = True
    for case in CASES:
        if case_filter is not None and case["name"] != case_filter:
            continue

        case_dir = case["name"]
        valid_rows = case["valid_shape"][0]

        # golden.bin is read as exactly one value per valid row.
        golden = _load_first_column(
            os.path.join(case_dir, "golden.bin"), case["dst_dtype"], valid_rows, count=valid_rows
        )
        # output.bin may hold several columns per row; only column 0 is compared.
        output = _load_first_column(os.path.join(case_dir, "output.bin"), case["dst_dtype"], valid_rows)

        if golden is None or output is None:
            # Report bad data as a failed case instead of aborting the run with a traceback.
            print(style_fail(f"[ERROR] {case['name']}: golden.bin/output.bin missing or malformed"))
            all_passed = False
            continue

        ok = result_cmp(golden, output, case["eps"])
        if ok:
            print(style_pass(f"[INFO] {case['name']}: compare passed"))
        else:
            print(style_fail(f"[ERROR] {case['name']}: compare failed"))
            all_passed = False

    if not all_passed:
        sys.exit(2)
    print(style_pass("[INFO] all cases passed"))


if __name__ == "__main__":
    main()
# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
# See LICENSE in the root of the software repository for the full text of the License.

# coding=utf-8

# Generates, for every trowargmin case, a random source tile ("input1") and the
# expected per-row argmin over the valid region ("golden"), then hands both to
# save_case_data under the case's name.

import numpy as np
from cases import CASES
from st_common import validate_cases, setup_case_rng, save_case_data

validate_cases(CASES)

for case in CASES:
    # NOTE(review): presumably seeds NumPy's global RNG per case so the data is
    # reproducible — confirm in st_common.
    setup_case_rng(case)

    dtype = case["dtype"]
    dst_dtype = case["dst_dtype"]
    shape = case["shape"]
    valid_shape = case["valid_shape"]
    valid_rows = valid_shape[0]
    valid_cols = valid_shape[1]

    if dtype in (np.int8, np.uint8, np.int16, np.uint16, np.int32, np.uint32):
        # Integer sources: draw across the dtype's range (randint's upper bound is exclusive).
        dtype_info = np.iinfo(dtype)
        input1 = np.random.randint(dtype_info.min, dtype_info.max, size=shape).astype(dtype)
    else:
        # Everything else is treated as a floating-point dtype.
        dtype_info = np.finfo(dtype)
        input1 = np.random.uniform(low=dtype_info.min, high=dtype_info.max, size=shape).astype(dtype)

    # Golden holds one index per *valid* row, so the source must be restricted
    # to the valid rows as well as the valid columns. Slicing only the columns
    # made the assignment fail with a shape mismatch whenever the tile has more
    # physical rows than valid rows.
    out_shape = (valid_rows, 1)
    golden = np.zeros(out_shape, dtype=dst_dtype)
    golden[:, 0:1] = np.argmin(input1[:valid_rows, :valid_cols], axis=1, keepdims=True).astype(dst_dtype)

    save_case_data(case["name"], {"input1": input1, "golden": golden})
    print(f"[INFO] gen_data: {case['name']} shape={shape} valid_shape={valid_shape} dtype={dtype.__name__}")
+// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +// See LICENSE in the root of the software repository for the full text of the License. + +#include + +#ifndef AICORE +#define AICORE [aicore] +#endif + +extern "C" __global__ AICORE void TROWARGMIN_uint32_float_8x1_8x8_8x8(__gm__ float *src, __gm__ uint32_t *dst); +extern "C" __global__ AICORE void TROWARGMIN_int32_half_16x1_13x16_13x13(__gm__ uint16_t *src, __gm__ int32_t *dst); + +void LaunchTROWARGMIN_uint32_float_8x1_8x8_8x8(float *src, uint32_t *dst, void *stream) { + TROWARGMIN_uint32_float_8x1_8x8_8x8<<<1, nullptr, stream>>>((__gm__ float *)src, (__gm__ uint32_t *)dst); +} + + + +void LaunchTROWARGMIN_int32_half_16x1_13x16_13x13(uint16_t *src, int32_t *dst, void *stream) { + TROWARGMIN_int32_half_16x1_13x16_13x13<<<1, nullptr, stream>>>((__gm__ uint16_t *)src, (__gm__ int32_t *)dst); +} diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/trowargmin/main.cpp b/test/tilelang_st/npu/a5/src/st/smoke/testcase/trowargmin/main.cpp new file mode 100644 index 000000000..ceeb57295 --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/trowargmin/main.cpp @@ -0,0 +1,173 @@ +// Copyright (c) 2026 Huawei Technologies Co., Ltd. +// This program is free software, you can redistribute it and/or modify it under the terms and conditions of +// CANN Open Software License Agreement Version 2.0 (the "License"). +// Please refer to the License for details. You may not use this file except in compliance with the License. +// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +// See LICENSE in the root of the software repository for the full text of the License. 
+ +// Host driver for TileLang trowargmin ST — case-table driven. + +#include "acl/acl.h" +#include "test_common.h" +#include +#include +#include +#include +#include +#include + +using namespace PtoTestCommon; + +// Kernel launch wrappers (defined in launch.cpp) +void LaunchTROWARGMIN_uint32_float_8x1_8x8_8x8(float *src, uint32_t *dst, void *stream); +void LaunchTROWARGMIN_uint32_float_1024x1_1024x8_1024x8(float *src, uint32_t *dst, void *stream); +void LaunchTROWARGMIN_uint32_float_1024x1_1023x24_1023x17(float *src, uint32_t *dst, void *stream); +void LaunchTROWARGMIN_uint32_float_264x1_260x64_260x64(float *src, uint32_t *dst, void *stream); +void LaunchTROWARGMIN_uint32_float_64x1_32x128_32x128(float *src, uint32_t *dst, void *stream); +void LaunchTROWARGMIN_uint32_float_8x1_2x16384_2x16381(float *src, uint32_t *dst, void *stream); +void LaunchTROWARGMIN_uint32_half_16x1_13x16_13x13(uint16_t *src, uint32_t *dst, void *stream); +void LaunchTROWARGMIN_uint32_half_16x1_3x8192_3x8191(uint16_t *src, uint32_t *dst, void *stream); +void LaunchTROWARGMIN_uint32_half_16x1_1x32768_1x32761(uint16_t *src, uint32_t *dst, void *stream); +void LaunchTROWARGMIN_int32_half_16x1_13x16_13x13(uint16_t *src, int32_t *dst, void *stream); +void LaunchTROWARGMIN_uint32_float_260x8_260x64_260x64(float *src, uint32_t *dst, void *stream); +void LaunchTROWARGMIN_uint32_half_3x16_3x3488_3x3473(uint16_t *src, uint32_t *dst, void *stream); +void LaunchTROWARGMIN_uint32_half_1023x16_1023x32_1023x17(uint16_t *src, uint32_t *dst, void *stream); + +using LaunchFnF32U32 = void (*)(float *, uint32_t *, void *); +using LaunchFnF16U32 = void (*)(uint16_t *, uint32_t *, void *); +using LaunchFnF32S32 = void (*)(float *, int32_t *, void *); +using LaunchFnF16S32 = void (*)(uint16_t *, int32_t *, void *); + +enum class DType { F32U32, F16U32, F32S32, F16S32 }; + +struct TestCase { + const char *name; + DType dtype; + union { + LaunchFnF32U32 launchF32U32; + LaunchFnF16U32 launchF16U32; + LaunchFnF32S32 
launchF32S32; + LaunchFnF16S32 launchF16S32; + }; + size_t rows; // allocated tile rows + size_t cols; // allocated tile cols + size_t validRows; // effective computation rows (<= rows) + size_t validCols; // effective computation cols (<= cols) + size_t srcElemSize; // bytes per src element + size_t dstElemSize; // bytes per dst element + size_t dstCols; // dst tile cols +}; + +static const TestCase kCases[] = { +{"uint32_float_8x1_8x8_8x8", DType::F32U32, .launchF32U32 = LaunchTROWARGMIN_uint32_float_8x1_8x8_8x8, 8, 8, 8, 8, 4, 4, 1}, +{"int32_half_16x1_13x16_13x13", DType::F16S32, .launchF16S32 = LaunchTROWARGMIN_int32_half_16x1_13x16_13x13, 13, 16, 13, 13, 2, 4, 1}, +}; +static constexpr size_t kNumCases = sizeof(kCases) / sizeof(kCases[0]); + +static int RunCase(const TestCase &tc, int deviceId, aclrtStream stream) { + int rc = 0; + const size_t srcElemCount = tc.rows * tc.cols; + const size_t srcFileSize = srcElemCount * tc.srcElemSize; + const size_t dstElemCount = tc.validRows * tc.dstCols; + const size_t dstFileSize = dstElemCount * tc.dstElemSize; + + std::printf("[INFO] === case: %s (shape=%zux%zu, valid=%zux%zu) ===\n", + tc.name, tc.rows, tc.cols, tc.validRows, tc.validCols); + + // Per-case data directory + std::string caseDir = std::string("./") + tc.name; + size_t src0FileSize = srcFileSize; + + void *src0Host = nullptr, *dstHost = nullptr; + void *src0Device = nullptr, *dstDevice = nullptr; + + aclrtMallocHost(&src0Host, srcFileSize); + aclrtMallocHost(&dstHost, dstFileSize); + + aclrtMalloc(&src0Device, srcFileSize, ACL_MEM_MALLOC_HUGE_FIRST); + aclrtMalloc(&dstDevice, dstFileSize, ACL_MEM_MALLOC_HUGE_FIRST); + + if (!ReadFile((caseDir + "/input1.bin").c_str(), src0FileSize, src0Host, srcFileSize)) { + std::fprintf(stderr, "[ERROR] failed to read %s/input1.bin\n", caseDir.c_str()); + rc = 1; + } + + if (rc == 0) { + aclrtMemcpy(src0Device, srcFileSize, src0Host, srcFileSize, ACL_MEMCPY_HOST_TO_DEVICE); + + switch (tc.dtype) { + case DType::F32U32: 
+ tc.launchF32U32((float *)src0Device, (uint32_t *)dstDevice, stream); + break; + case DType::F16U32: + tc.launchF16U32((uint16_t *)src0Device, (uint32_t *)dstDevice, stream); + break; + case DType::F32S32: + tc.launchF32S32((float *)src0Device, (int32_t *)dstDevice, stream); + break; + case DType::F16S32: + tc.launchF16S32((uint16_t *)src0Device, (int32_t *)dstDevice, stream); + break; + } + + aclrtSynchronizeStream(stream); + aclrtMemcpy(dstHost, dstFileSize, dstDevice, dstFileSize, ACL_MEMCPY_DEVICE_TO_HOST); + } + + if (rc == 0) { + mkdir(caseDir.c_str(), 0755); + if (!WriteFile((caseDir + "/output.bin").c_str(), dstHost, dstFileSize)) { + std::fprintf(stderr, "[ERROR] failed to write %s/output.bin\n", caseDir.c_str()); + rc = 1; + } + } + + if (src0Device != nullptr) + aclrtFree(src0Device); + if (dstDevice != nullptr) + aclrtFree(dstDevice); + if (src0Host != nullptr) + aclrtFreeHost(src0Host); + if (dstHost != nullptr) + aclrtFreeHost(dstHost); + + if (rc == 0) + std::printf("[INFO] case %s done\n", tc.name); + return rc; +} + +int main(int argc, char *argv[]) { + // Optional case filter: ./trowargmin [case_name] + const char *caseFilter = (argc > 1) ? 
argv[1] : nullptr; + + int rc = 0; + int deviceId = 0; + aclrtStream stream = nullptr; + + aclInit(nullptr); + if (const char *envDevice = std::getenv("ACL_DEVICE_ID")) { + deviceId = std::atoi(envDevice); + } + aclrtSetDevice(deviceId); + aclrtCreateStream(&stream); + + for (size_t i = 0; i < kNumCases; ++i) { + if (caseFilter != nullptr && std::strcmp(kCases[i].name, caseFilter) != 0) { + continue; + } + int ret = RunCase(kCases[i], deviceId, stream); + if (ret != 0) { + std::fprintf(stderr, "[ERROR] case %s failed\n", kCases[i].name); + rc = 1; + break; + } + } + + if (stream != nullptr) + aclrtDestroyStream(stream); + aclrtResetDevice(deviceId); + aclFinalize(); + + return rc; +} diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/trowargmin/trowargmin.pto b/test/tilelang_st/npu/a5/src/st/smoke/testcase/trowargmin/trowargmin.pto new file mode 100644 index 000000000..0d54d7262 --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/trowargmin/trowargmin.pto @@ -0,0 +1,89 @@ +// Auto-generated trowargmin ST testcases + +module attributes {pto.target_arch = "a5", pto.kernel_kind = #pto.kernel_kind} { + + func.func @TROWARGMIN_uint32_float_8x1_8x8_8x8(%src_ptr: !pto.ptr, %dst_ptr: !pto.ptr) attributes {pto.kernel} { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c8_r = arith.constant 8 : index + %c8_c = arith.constant 8 : index + %c64_se = arith.constant 64 : index + %c8_de = arith.constant 8 : index + %c1_dc = arith.constant 1 : index + %c8_vr = arith.constant 8 : index + %c8_vc = arith.constant 8 : index + %src_view = pto.make_tensor_view %src_ptr, + shape = [%c1, %c1, %c1, %c8_r, %c8_c], + strides = [%c64_se, %c64_se, %c64_se, %c8_c, %c1] + : !pto.tensor_view<1x1x1x8x8xf32> + %dst_view = pto.make_tensor_view %dst_ptr, + shape = [%c1, %c1, %c1, %c8_r, %c1_dc], + strides = [%c8_de, %c8_de, %c8_de, %c1_dc, %c1] + : !pto.tensor_view<1x1x1x8x1xui32> + %src_part = pto.partition_view %src_view, + offsets = [%c0, %c0, %c0, %c0, 
%c0], + sizes = [%c1, %c1, %c1, %c8_vr, %c8_vc] + : !pto.tensor_view<1x1x1x8x8xf32> -> !pto.partition_tensor_view<1x1x1x8x8xf32> + %dst_part = pto.partition_view %dst_view, + offsets = [%c0, %c0, %c0, %c0, %c0], + sizes = [%c1, %c1, %c1, %c8_vr, %c1] + : !pto.tensor_view<1x1x1x8x1xui32> -> !pto.partition_tensor_view<1x1x1x8x1xui32> + %src = pto.alloc_tile + : !pto.tile_buf + %tmp = pto.alloc_tile + : !pto.tile_buf + %dst = pto.alloc_tile + : !pto.tile_buf + pto.tload ins(%src_part : !pto.partition_tensor_view<1x1x1x8x8xf32>) + outs(%src : !pto.tile_buf) + pto.trowargmin ins(%src, %tmp : !pto.tile_buf, + !pto.tile_buf) + outs(%dst : !pto.tile_buf) + pto.tstore ins(%dst : !pto.tile_buf) + outs(%dst_part : !pto.partition_tensor_view<1x1x1x8x1xui32>) + return + } + + + func.func @TROWARGMIN_int32_half_16x1_13x16_13x13(%src_ptr: !pto.ptr, %dst_ptr: !pto.ptr) attributes {pto.kernel} { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c13_r = arith.constant 13 : index + %c16_c = arith.constant 16 : index + %c208_se = arith.constant 208 : index + %c13_de = arith.constant 13 : index + %c1_dc = arith.constant 1 : index + %c13_vr = arith.constant 13 : index + %c13_vc = arith.constant 13 : index + %src_view = pto.make_tensor_view %src_ptr, + shape = [%c1, %c1, %c1, %c13_r, %c16_c], + strides = [%c208_se, %c208_se, %c208_se, %c16_c, %c1] + : !pto.tensor_view<1x1x1x13x16xf16> + %dst_view = pto.make_tensor_view %dst_ptr, + shape = [%c1, %c1, %c1, %c13_r, %c1_dc], + strides = [%c13_de, %c13_de, %c13_de, %c1_dc, %c1] + : !pto.tensor_view<1x1x1x13x1xi32> + %src_part = pto.partition_view %src_view, + offsets = [%c0, %c0, %c0, %c0, %c0], + sizes = [%c1, %c1, %c1, %c13_vr, %c13_vc] + : !pto.tensor_view<1x1x1x13x16xf16> -> !pto.partition_tensor_view<1x1x1x13x13xf16> + %dst_part = pto.partition_view %dst_view, + offsets = [%c0, %c0, %c0, %c0, %c0], + sizes = [%c1, %c1, %c1, %c13_vr, %c1] + : !pto.tensor_view<1x1x1x13x1xi32> -> !pto.partition_tensor_view<1x1x1x13x1xi32> 
+ %src = pto.alloc_tile + : !pto.tile_buf + %tmp = pto.alloc_tile + : !pto.tile_buf + %dst = pto.alloc_tile + : !pto.tile_buf + pto.tload ins(%src_part : !pto.partition_tensor_view<1x1x1x13x13xf16>) + outs(%src : !pto.tile_buf) + pto.trowargmin ins(%src, %tmp : !pto.tile_buf, + !pto.tile_buf) + outs(%dst : !pto.tile_buf) + pto.tstore ins(%dst : !pto.tile_buf) + outs(%dst_part : !pto.partition_tensor_view<1x1x1x13x1xi32>) + return + } +} diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/trowexpand/CMakeLists.txt b/test/tilelang_st/npu/a5/src/st/smoke/testcase/trowexpand/CMakeLists.txt new file mode 100644 index 000000000..efd5ec465 --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/trowexpand/CMakeLists.txt @@ -0,0 +1,9 @@ +# Copyright (c) 2026 Huawei Technologies Co., Ltd. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. + +pto_tilelang_vec_st(trowexpand) diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/trowexpand/cases.py b/test/tilelang_st/npu/a5/src/st/smoke/testcase/trowexpand/cases.py new file mode 100644 index 000000000..4960b993a --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/trowexpand/cases.py @@ -0,0 +1,97 @@ +#!/usr/bin/python3 +# Copyright (c) 2026 Huawei Technologies Co., Ltd. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). 
# Please refer to the License for details. You may not use this file except in compliance with the License.
# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
# See LICENSE in the root of the software repository for the full text of the License.

# coding=utf-8

"""Single source of truth for trowexpand ST test cases.

trowexpand is a row broadcast operation: expands a scalar per row to the entire row.
- Input shape: (rows, srcCols) - physical layout for NPU alignment
- srcCols = 32/sizeof(dtype) for 32-byte alignment
- Output shape: (rows, dstCols) - broadcast each scalar across the row
- dstValidCols may be less than dstCols for partial valid region

Each case defines:
  - name: case identifier, used as subdirectory name and by main.cpp kCases[].
  - dtype: numpy dtype (e.g. np.float32, np.float16, np.int8).
  - src0_shape: (rows, srcCols) — physical input tile dimensions.
  - src0_valid_shape: (valid_rows, 1) — effective input region.
  - dst_shape: (rows, dstCols) — output tile dimensions.
  - dst_valid_shape: (valid_rows, valid_cols) — effective output region.
  - eps: tolerance for numpy.allclose (atol and rtol).

After the full table is defined, CASES is narrowed to the names listed in
_SMOKE_CASE_NAMES; importing the module raises RuntimeError if any of those
names is not present in the table.
"""

import numpy as np

CASES = [
    # f32 cases (srcCols=8 for 32-byte alignment)
    {
        "name": "f32_16x128",
        "dtype": np.float32,
        "src0_shape": (16, 8),
        "src0_valid_shape": (16, 1),
        "dst_shape": (16, 128),
        "dst_valid_shape": (16, 128),
        "eps": 1e-6,
    },
    {
        "name": "f32_16x127",
        "dtype": np.float32,
        "src0_shape": (16, 8),
        "src0_valid_shape": (16, 1),
        "dst_shape": (16, 128),
        "dst_valid_shape": (16, 127),  # partial valid region
        "eps": 1e-6,
    },
    # f16 cases (srcCols=16 for 32-byte alignment)
    {
        "name": "f16_16x512",
        "dtype": np.float16,
        "src0_shape": (16, 16),
        "src0_valid_shape": (16, 1),
        "dst_shape": (16, 512),
        "dst_valid_shape": (16, 512),
        "eps": 1e-3,
    },
    {
        "name": "f16_16x511",
        "dtype": np.float16,
        "src0_shape": (16, 16),
        "src0_valid_shape": (16, 1),
        "dst_shape": (16, 512),
        "dst_valid_shape": (16, 511),  # partial valid region
        "eps": 1e-3,
    },
    # i8 cases (srcCols=32 for 32-byte alignment)
    {
        "name": "i8_16x256",
        "dtype": np.int8,
        "src0_shape": (16, 32),
        "src0_valid_shape": (16, 1),
        "dst_shape": (16, 256),
        "dst_valid_shape": (16, 256),
        "eps": 0,  # exact match for integers
    },
    {
        "name": "i8_16x255",
        "dtype": np.int8,
        "src0_shape": (16, 32),
        "src0_valid_shape": (16, 1),
        "dst_shape": (16, 256),
        "dst_valid_shape": (16, 255),  # partial valid region
        "eps": 0,
    },
]

# Smoke subset: only these cases remain in CASES after import.
_SMOKE_CASE_NAMES = ['f32_16x128', 'f32_16x127']
_SMOKE_CASE_NAME_SET = set(_SMOKE_CASE_NAMES)
# Collect the defined names once rather than rebuilding the set for every
# smoke name being checked.
_DEFINED_CASE_NAMES = {case["name"] for case in CASES}
_missing = [name for name in _SMOKE_CASE_NAMES if name not in _DEFINED_CASE_NAMES]
if _missing:
    raise RuntimeError("unknown smoke case(s): " + ", ".join(_missing))
CASES = [case for case in CASES if case["name"] in _SMOKE_CASE_NAME_SET]
#!/usr/bin/python3
# Copyright (c) 2026 Huawei Technologies Co., Ltd.
# This program is free software, you can redistribute it and/or modify it under the terms and conditions of
# CANN Open Software License Agreement Version 2.0 (the "License").
# Please refer to the License for details. You may not use this file except in compliance with the License.
# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
# See LICENSE in the root of the software repository for the full text of the License.

# coding=utf-8

"""Compare golden and output for trowexpand ST test cases.

trowexpand: row broadcast operation.
Compare output (rows, cols) against golden (rows, cols).
"""

import os
import sys
import numpy as np

from cases import CASES
from st_common import result_cmp, style_fail, style_pass

# Inline validation for multi-input format (trowexpand uses src0/dst only).
# "eps" is listed because main() reads case["eps"] for every compared case;
# checking it here reports a malformed case by index and name instead of
# failing later with a bare KeyError.
REQUIRED_KEYS = {"name", "dtype", "src0_shape", "src0_valid_shape", "dst_shape", "dst_valid_shape", "eps"}
for i, case in enumerate(CASES):
    missing = REQUIRED_KEYS - case.keys()
    if missing:
        raise ValueError(f"cases[{i}] ({case.get('name', '?')}) missing keys: {missing}")


def main():
    """Compare output.bin with golden.bin for each selected trowexpand case.

    An optional first command-line argument restricts the run to the case
    whose name matches it exactly. Both files are read from the directory
    named after the case and reshaped to the case's ``dst_shape``; only the
    ``dst_valid_shape`` region is passed to ``result_cmp`` together with the
    case's ``eps``.

    Exits with status 2 when at least one compared case fails.
    """
    case_filter = sys.argv[1] if len(sys.argv) > 1 else None

    all_passed = True
    for case in CASES:
        if case_filter is not None and case["name"] != case_filter:
            continue

        case_dir = case["name"]
        dst_shape = case["dst_shape"]
        dst_valid_shape = case["dst_valid_shape"]
        dtype = case["dtype"]

        vr, vc = dst_valid_shape

        golden = np.fromfile(os.path.join(case_dir, "golden.bin"), dtype=dtype).reshape(dst_shape)
        output = np.fromfile(os.path.join(case_dir, "output.bin"), dtype=dtype).reshape(dst_shape)

        # Columns/rows outside the valid region are excluded from the comparison.
        ok = result_cmp(golden[:vr, :vc], output[:vr, :vc], case["eps"])
        if ok:
            print(style_pass(f"[INFO] {case['name']}: compare passed"))
        else:
            print(style_fail(f"[ERROR] {case['name']}: compare failed"))
            all_passed = False

    if not all_passed:
        sys.exit(2)
    print(style_pass("[INFO] all cases passed"))


if __name__ == "__main__":
    main()
+- Input: (rows, 1) - one scalar per row +- Output: (rows, cols) - broadcast each scalar across the entire row +""" + +import numpy as np +from cases import CASES +from st_common import setup_case_rng, save_case_data + +# Inline validation for multi-input format (trowexpand uses src0/dst only) +REQUIRED_KEYS = {"name", "dtype", "src0_shape", "src0_valid_shape", "dst_shape", "dst_valid_shape"} +for i, case in enumerate(CASES): + missing = REQUIRED_KEYS - case.keys() + if missing: + raise ValueError(f"cases[{i}] ({case.get('name', '?')}) missing keys: {missing}") + +for case in CASES: + setup_case_rng(case) + + dtype = case["dtype"] + src0_shape = case["src0_shape"] # Physical shape (rows, 8) + src0_valid_shape = case["src0_valid_shape"] # Valid shape (rows, 1) + dst_shape = case["dst_shape"] + dst_valid_shape = case["dst_valid_shape"] + + # Generate input: random values for each row's scalar, padded to 8 columns + # Physical layout: (rows, 8), but only column 0 is valid data + input_data = np.zeros(src0_shape, dtype=dtype) + src_vr = src0_valid_shape[0] + input_data[:src_vr, 0] = np.random.randint(1, 10, size=src_vr).astype(dtype) + + # Generate golden: broadcast each row's scalar across columns + # dst[i, :] = src[i, 0] for all columns + golden = np.zeros(dst_shape, dtype=dtype) + dst_vr, dst_vc = dst_valid_shape + golden[:dst_vr, :dst_vc] = np.broadcast_to(input_data[:src_vr, 0:1], (dst_vr, dst_vc)).astype(dtype, copy=False) + + save_case_data(case["name"], {"input": input_data, "golden": golden}) + print(f"[INFO] gen_data: {case['name']} src0_shape={src0_shape} dst_shape={dst_shape} dtype={dtype.__name__}") diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/trowexpand/launch.cpp b/test/tilelang_st/npu/a5/src/st/smoke/testcase/trowexpand/launch.cpp new file mode 100644 index 000000000..a1d48869d --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/trowexpand/launch.cpp @@ -0,0 +1,27 @@ +// Copyright (c) 2026 Huawei Technologies Co., Ltd. 
+// This program is free software, you can redistribute it and/or modify it under the terms and conditions of +// CANN Open Software License Agreement Version 2.0 (the "License"). +// Please refer to the License for details. You may not use this file except in compliance with the License. +// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +// See LICENSE in the root of the software repository for the full text of the License. + +#include + +#ifndef AICORE +#define AICORE [aicore] +#endif + +// f32 kernels + +extern "C" __global__ AICORE void TROWEXPAND_f32_16x128(__gm__ float *src, __gm__ float *dst); +extern "C" __global__ AICORE void TROWEXPAND_f32_16x127(__gm__ float *src, __gm__ float *dst); + +void LaunchTROWEXPAND_f32_16x128(float *src, float *dst, void *stream) { + TROWEXPAND_f32_16x128<<<1, nullptr, stream>>>((__gm__ float *)src, (__gm__ float *)dst); +} + + +void LaunchTROWEXPAND_f32_16x127(float *src, float *dst, void *stream) { + TROWEXPAND_f32_16x127<<<1, nullptr, stream>>>((__gm__ float *)src, (__gm__ float *)dst); +} diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/trowexpand/main.cpp b/test/tilelang_st/npu/a5/src/st/smoke/testcase/trowexpand/main.cpp new file mode 100644 index 000000000..f53d8bdfa --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/trowexpand/main.cpp @@ -0,0 +1,137 @@ +// Copyright (c) 2026 Huawei Technologies Co., Ltd. +// This program is free software, you can redistribute it and/or modify it under the terms and conditions of +// CANN Open Software License Agreement Version 2.0 (the "License"). +// Please refer to the License for details. You may not use this file except in compliance with the License. 
+// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +// See LICENSE in the root of the software repository for the full text of the License. + +// Host driver for TileLang trowexpand ST — row broadcast operation. +// Supports multiple data types: f32, f16, i8 + +#include "acl/acl.h" +#include "test_common.h" +#include +#include +#include +#include +#include +#include + +using namespace PtoTestCommon; + +// Kernel launch wrappers (defined in launch.cpp) +// f32 +void LaunchTROWEXPAND_f32_16x128(float *src, float *dst, void *stream); +void LaunchTROWEXPAND_f32_16x127(float *src, float *dst, void *stream); +// f16 +void LaunchTROWEXPAND_f16_16x511(void *src, void *dst, void *stream); +// i8 +void LaunchTROWEXPAND_i8_16x255(void *src, void *dst, void *stream); + +// Generic launch function type +using LaunchFn = void (*)(void *, void *, void *); + +struct TestCase { + const char *name; + LaunchFn launch; + size_t srcRows; + size_t srcCols; // srcCols = 32/sizeof(dtype) for alignment + size_t dstRows; + size_t dstCols; + size_t dstValidCols; // effective output columns + size_t elemSize; // bytes per element +}; + +static const TestCase kCases[] = { + // f32: srcCols=8 (32/4), dstCols=128, dstValidCols=128 or 127 +{"f32_16x128", (LaunchFn)LaunchTROWEXPAND_f32_16x128, 16, 8, 16, 128, 128, sizeof(float)}, +{"f32_16x127", (LaunchFn)LaunchTROWEXPAND_f32_16x127, 16, 8, 16, 128, 127, sizeof(float)}, +}; +static constexpr size_t kNumCases = sizeof(kCases) / sizeof(kCases[0]); + +static int RunCase(const TestCase &tc, int deviceId, aclrtStream stream) { + int rc = 0; + size_t srcFileSize = tc.srcRows * tc.srcCols * tc.elemSize; + const size_t dstFileSize = tc.dstRows * tc.dstCols * tc.elemSize; + + std::printf("[INFO] === case: %s (src=%zux%zu, dst=%zux%zu, valid_cols=%zu) ===\n", + tc.name, tc.srcRows, tc.srcCols, 
tc.dstRows, tc.dstCols, tc.dstValidCols); + + std::string caseDir = std::string("./") + tc.name; + + void *srcHost = nullptr, *dstHost = nullptr; + void *srcDevice = nullptr, *dstDevice = nullptr; + + aclrtMallocHost((void **)(&srcHost), srcFileSize); + aclrtMallocHost((void **)(&dstHost), dstFileSize); + + aclrtMalloc((void **)&srcDevice, srcFileSize, ACL_MEM_MALLOC_HUGE_FIRST); + aclrtMalloc((void **)&dstDevice, dstFileSize, ACL_MEM_MALLOC_HUGE_FIRST); + + if (!ReadFile((caseDir + "/input.bin").c_str(), srcFileSize, srcHost, srcFileSize)) { + std::fprintf(stderr, "[ERROR] failed to read %s/input.bin\n", caseDir.c_str()); + rc = 1; + } + + if (rc == 0) { + aclrtMemcpy(srcDevice, srcFileSize, srcHost, srcFileSize, ACL_MEMCPY_HOST_TO_DEVICE); + + tc.launch(srcDevice, dstDevice, stream); + + aclrtSynchronizeStream(stream); + aclrtMemcpy(dstHost, dstFileSize, dstDevice, dstFileSize, ACL_MEMCPY_DEVICE_TO_HOST); + } + + if (rc == 0 && !WriteFile((caseDir + "/output.bin").c_str(), dstHost, dstFileSize)) { + std::fprintf(stderr, "[ERROR] failed to write %s/output.bin\n", caseDir.c_str()); + rc = 1; + } + + if (srcDevice != nullptr) + aclrtFree(srcDevice); + if (dstDevice != nullptr) + aclrtFree(dstDevice); + if (srcHost != nullptr) + aclrtFreeHost(srcHost); + if (dstHost != nullptr) + aclrtFreeHost(dstHost); + + if (rc == 0) + std::printf("[INFO] case %s done\n", tc.name); + return rc; +} + +int main(int argc, char *argv[]) { + const char *caseFilter = (argc > 1) ? 
argv[1] : nullptr; + + int rc = 0; + int deviceId = 0; + aclrtStream stream = nullptr; + + aclInit(nullptr); + if (const char *envDevice = std::getenv("ACL_DEVICE_ID")) { + deviceId = std::atoi(envDevice); + } + aclrtSetDevice(deviceId); + aclrtCreateStream(&stream); + + for (size_t i = 0; i < kNumCases; ++i) { + if (caseFilter != nullptr && std::strcmp(kCases[i].name, caseFilter) != 0) { + continue; + } + int ret = RunCase(kCases[i], deviceId, stream); + if (ret != 0) { + std::fprintf(stderr, "[ERROR] case %s failed\n", kCases[i].name); + rc = 1; + break; + } + } + + if (stream != nullptr) + aclrtDestroyStream(stream); + aclrtResetDevice(deviceId); + aclFinalize(); + + return rc; +} diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/trowexpand/trowexpand.pto b/test/tilelang_st/npu/a5/src/st/smoke/testcase/trowexpand/trowexpand.pto new file mode 100644 index 000000000..06ad61630 --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/trowexpand/trowexpand.pto @@ -0,0 +1,103 @@ +// Copyright (c) 2026 Huawei Technologies Co., Ltd. +// This program is free software, you can redistribute it and/or modify it under the terms and conditions of +// CANN Open Software License Agreement Version 2.0 (the "License"). +// Please refer to the License for details. You can not use this file except in compliance with the License. +// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +// See LICENSE in the root of the software repository for the full text of the License. + +// TileLang ST kernels for pto.trowexpand: row broadcast operation. 
+// dst[row, col] = src[row, 0] (broadcast scalar per row) +// srcCols = 32/sizeof(dtype) for NPU 32-byte alignment + +module attributes {pto.target_arch = "a5", pto.kernel_kind = #pto.kernel_kind} { + // f32_16x128: rows=16, srcCols=8, dstValidCols=128, dstCols=128 + func.func @TROWEXPAND_f32_16x128(%src_ptr: !pto.ptr, %dst_ptr: !pto.ptr) attributes {pto.kernel} { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c8 = arith.constant 8 : index + %c16 = arith.constant 16 : index + %c128 = arith.constant 128 : index + %c2048 = arith.constant 2048 : index + + %src_view = pto.make_tensor_view %src_ptr, + shape = [%c1, %c1, %c1, %c16, %c8], + strides = [%c128, %c128, %c128, %c8, %c1] + : !pto.tensor_view<1x1x1x16x8xf32> + %dst_view = pto.make_tensor_view %dst_ptr, + shape = [%c1, %c1, %c1, %c16, %c128], + strides = [%c2048, %c2048, %c2048, %c128, %c1] + : !pto.tensor_view<1x1x1x16x128xf32> + + %src_part = pto.partition_view %src_view, + offsets = [%c0, %c0, %c0, %c0, %c0], + sizes = [%c1, %c1, %c1, %c16, %c8] + : !pto.tensor_view<1x1x1x16x8xf32> -> !pto.partition_tensor_view<1x1x1x16x8xf32> + %dst_part = pto.partition_view %dst_view, + offsets = [%c0, %c0, %c0, %c0, %c0], + sizes = [%c1, %c1, %c1, %c16, %c128] + : !pto.tensor_view<1x1x1x16x128xf32> -> !pto.partition_tensor_view<1x1x1x16x128xf32> + + %src = pto.alloc_tile + : !pto.tile_buf + %dst = pto.alloc_tile + : !pto.tile_buf + + pto.tload ins(%src_part : !pto.partition_tensor_view<1x1x1x16x8xf32>) + outs(%src : !pto.tile_buf) + + pto.trowexpand ins(%src : !pto.tile_buf) + outs(%dst : !pto.tile_buf) + + pto.tstore ins(%dst : !pto.tile_buf) + outs(%dst_part : !pto.partition_tensor_view<1x1x1x16x128xf32>) + return + } + + // f32_16x127: rows=16, srcCols=8, dstValidCols=127, dstCols=128 + + func.func @TROWEXPAND_f32_16x127(%src_ptr: !pto.ptr, %dst_ptr: !pto.ptr) attributes {pto.kernel} { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c8 = arith.constant 8 : index + %c16 = 
arith.constant 16 : index + %c127 = arith.constant 127 : index + %c128 = arith.constant 128 : index + %c2048 = arith.constant 2048 : index + + %src_view = pto.make_tensor_view %src_ptr, + shape = [%c1, %c1, %c1, %c16, %c8], + strides = [%c128, %c128, %c128, %c8, %c1] + : !pto.tensor_view<1x1x1x16x8xf32> + %dst_view = pto.make_tensor_view %dst_ptr, + shape = [%c1, %c1, %c1, %c16, %c128], + strides = [%c2048, %c2048, %c2048, %c128, %c1] + : !pto.tensor_view<1x1x1x16x128xf32> + + %src_part = pto.partition_view %src_view, + offsets = [%c0, %c0, %c0, %c0, %c0], + sizes = [%c1, %c1, %c1, %c16, %c8] + : !pto.tensor_view<1x1x1x16x8xf32> -> !pto.partition_tensor_view<1x1x1x16x8xf32> + %dst_part = pto.partition_view %dst_view, + offsets = [%c0, %c0, %c0, %c0, %c0], + sizes = [%c1, %c1, %c1, %c16, %c127] + : !pto.tensor_view<1x1x1x16x128xf32> -> !pto.partition_tensor_view<1x1x1x16x127xf32> + + %src = pto.alloc_tile + : !pto.tile_buf + %dst = pto.alloc_tile + : !pto.tile_buf + + pto.tload ins(%src_part : !pto.partition_tensor_view<1x1x1x16x8xf32>) + outs(%src : !pto.tile_buf) + + pto.trowexpand ins(%src : !pto.tile_buf) + outs(%dst : !pto.tile_buf) + + pto.tstore ins(%dst : !pto.tile_buf) + outs(%dst_part : !pto.partition_tensor_view<1x1x1x16x127xf32>) + return + } + + // f16_16x512: rows=16, srcCols=16, dstValidCols=512, dstCols=512 +} diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/trowexpandadd/CMakeLists.txt b/test/tilelang_st/npu/a5/src/st/smoke/testcase/trowexpandadd/CMakeLists.txt new file mode 100644 index 000000000..5e45f3de4 --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/trowexpandadd/CMakeLists.txt @@ -0,0 +1,9 @@ +# Copyright (c) 2026 Huawei Technologies Co., Ltd. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. 
You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. + +pto_tilelang_vec_st(trowexpandadd) diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/trowexpandadd/cases.py b/test/tilelang_st/npu/a5/src/st/smoke/testcase/trowexpandadd/cases.py new file mode 100644 index 000000000..8535b2eb0 --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/trowexpandadd/cases.py @@ -0,0 +1,123 @@ +#!/usr/bin/python3 +# Copyright (c) 2026 Huawei Technologies Co., Ltd. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. + +# coding=utf-8 + +"""Single source of truth for trowexpandadd ST test cases. + +trowexpandadd: dst = src0 + broadcast(src1) across columns. 
+- src1Col determines how src1 is broadcast: + - src1Col=1: only first column is valid, broadcast to dstCols + - src1Col=8 (for f32): 8 columns are valid, no broadcast needed +- src1Cols (physical) = 32/sizeof(dtype) for NPU alignment + +Template parameters: + - dstRow, dstCol: dst shape + - src1Row, src1Col: src1 shape (src1Col is valid columns, not physical) + - src0eqdst: true means src0 shape equals dst, false means different +""" + +import numpy as np + +CASES = [ + # launchTRowExpandAdd + { + "name": "f32_16x32", + "dtype": np.float32, + "src0_shape": (16, 32), # src0eqdst=true, same as dst + "src0_valid_shape": (16, 32), + "src1_shape": (16, 8), # physical: 32/sizeof(f32)=8 + "src1_valid_shape": (16, 1), # src1Col=1, only first column valid + "dst_shape": (16, 32), + "dst_valid_shape": (16, 32), + "eps": 1e-6, + }, + # launchTRowExpandAdd + { + "name": "f32_56x128", + "dtype": np.float32, + "src0_shape": (56, 128), # src0eqdst=true + "src0_valid_shape": (56, 128), + "src1_shape": (56, 8), # physical: 8 + "src1_valid_shape": (56, 1), # src1Col=1 + "dst_shape": (56, 128), + "dst_valid_shape": (56, 128), + "eps": 1e-6, + }, + # launchTRowExpandAdd + { + "name": "f16_48x64", + "dtype": np.float16, + "src0_shape": (48, 64), # src0eqdst=true + "src0_valid_shape": (48, 64), + "src1_shape": (48, 16), # physical: 32/sizeof(f16)=16 + "src1_valid_shape": (48, 1), # src1Col=1 + "dst_shape": (48, 64), + "dst_valid_shape": (48, 64), + "eps": 1e-3, + }, + # launchTRowExpandAdd + { + "name": "f16_16x128", + "dtype": np.float16, + "src0_shape": (16, 128), # src0eqdst=true + "src0_valid_shape": (16, 128), + "src1_shape": (16, 16), # physical: 16 + "src1_valid_shape": (16, 1), # src1Col=1 + "dst_shape": (16, 128), + "dst_valid_shape": (16, 128), + "eps": 1e-3, + }, + # Note: launchTRowExpandAdd2 with src1Col=8 has different semantics - TBD + # launchTRowExpandAdd2 - needs investigation + # launchTRowExpandAdd2 - needs investigation + # launchTRowExpandAdd + { + "name": 
"f16_32x64_noeq", + "dtype": np.float16, + "src0_shape": (32, 64), # src0eqdst=false, but src0 shape still matches dst + "src0_valid_shape": (32, 64), + "src1_shape": (32, 16), # physical: 16 + "src1_valid_shape": (32, 1), # src1Col=1 + "dst_shape": (32, 64), + "dst_valid_shape": (32, 64), + "eps": 1e-3, + }, + # launchTRowExpandAdd + { + "name": "i32_16x32", + "dtype": np.int32, + "src0_shape": (16, 32), # src0eqdst=true + "src0_valid_shape": (16, 32), + "src1_shape": (16, 8), # physical: 32/sizeof(i32)=8 + "src1_valid_shape": (16, 1), # src1Col=1 + "dst_shape": (16, 32), + "dst_valid_shape": (16, 32), + "eps": 0, + }, + # launchTRowExpandAdd + { + "name": "i16_16x64", + "dtype": np.int16, + "src0_shape": (16, 64), # src0eqdst=true + "src0_valid_shape": (16, 64), + "src1_shape": (16, 16), # physical: 32/sizeof(i16)=16 + "src1_valid_shape": (16, 1), # src1Col=1 + "dst_shape": (16, 64), + "dst_valid_shape": (16, 64), + "eps": 0, + }, +] + +_SMOKE_CASE_NAMES = ['f32_16x32', 'i32_16x32'] +_SMOKE_CASE_NAME_SET = set(_SMOKE_CASE_NAMES) +_missing = [name for name in _SMOKE_CASE_NAMES if name not in {case["name"] for case in CASES}] +if _missing: + raise RuntimeError("unknown smoke case(s): " + ", ".join(_missing)) +CASES = [case for case in CASES if case["name"] in _SMOKE_CASE_NAME_SET] diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/trowexpandadd/compare.py b/test/tilelang_st/npu/a5/src/st/smoke/testcase/trowexpandadd/compare.py new file mode 100644 index 000000000..3536ecad5 --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/trowexpandadd/compare.py @@ -0,0 +1,61 @@ +#!/usr/bin/python3 +# Copyright (c) 2026 Huawei Technologies Co., Ltd. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. 
+# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. + +# coding=utf-8 + +"""Compare golden and output for trowexpandadd ST test cases.""" + +import os +import sys +import numpy as np + +from cases import CASES +from st_common import result_cmp, style_fail, style_pass + +# Inline validation for multi-input format (trowexpandadd uses src0/src1/dst) +REQUIRED_KEYS = {"name", "dtype", "src0_shape", "src0_valid_shape", "src1_shape", + "src1_valid_shape", "dst_shape", "dst_valid_shape"} +for i, case in enumerate(CASES): + missing = REQUIRED_KEYS - case.keys() + if missing: + raise ValueError(f"cases[{i}] ({case.get('name', '?')}) missing keys: {missing}") + + +def main(): + case_filter = sys.argv[1] if len(sys.argv) > 1 else None + + all_passed = True + for case in CASES: + if case_filter is not None and case["name"] != case_filter: + continue + + case_dir = case["name"] + dst_shape = case["dst_shape"] + dst_valid_shape = case["dst_valid_shape"] + dtype = case["dtype"] + + vr, vc = dst_valid_shape + + golden = np.fromfile(os.path.join(case_dir, "golden.bin"), dtype=dtype).reshape(dst_shape) + output = np.fromfile(os.path.join(case_dir, "output.bin"), dtype=dtype).reshape(dst_shape) + + ok = result_cmp(golden[:vr, :vc], output[:vr, :vc], case["eps"]) + if ok: + print(style_pass(f"[INFO] {case['name']}: compare passed")) + else: + print(style_fail(f"[ERROR] {case['name']}: compare failed")) + all_passed = False + + if not all_passed: + sys.exit(2) + print(style_pass("[INFO] all cases passed")) + + +if __name__ == "__main__": + main() diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/trowexpandadd/gen_data.py b/test/tilelang_st/npu/a5/src/st/smoke/testcase/trowexpandadd/gen_data.py new file mode 100644 index 
000000000..65f746382 --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/trowexpandadd/gen_data.py @@ -0,0 +1,69 @@ +#!/usr/bin/python3 +# Copyright (c) 2026 Huawei Technologies Co., Ltd. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. + +# coding=utf-8 + +"""Generate input and golden data for trowexpandadd ST test cases. + +trowexpandadd: dst = src0 + broadcast(src1) across columns. +- src1Col=1: only first column valid, broadcast to all dst columns +- src1Col>1: each src1 column maps to a block of dst columns (dstCol/src1Col) +""" + +import numpy as np +from cases import CASES +from st_common import setup_case_rng, save_case_data + +# Inline validation for multi-input format (trowexpandadd uses src0/src1/dst) +REQUIRED_KEYS = {"name", "dtype", "src0_shape", "src0_valid_shape", "src1_shape", + "src1_valid_shape", "dst_shape", "dst_valid_shape"} +for i, case in enumerate(CASES): + missing = REQUIRED_KEYS - case.keys() + if missing: + raise ValueError(f"cases[{i}] ({case.get('name', '?')}) missing keys: {missing}") + +for case in CASES: + setup_case_rng(case) + + dtype = case["dtype"] + src0_shape = case["src0_shape"] + src0_valid_shape = case["src0_valid_shape"] + src1_shape = case["src1_shape"] + src1_valid_shape = case["src1_valid_shape"] + dst_shape = case["dst_shape"] + dst_valid_shape = case["dst_valid_shape"] + + # Generate inputs + input1 = np.random.randint(1, 10, size=src0_shape).astype(dtype) # src0 matrix + 
input2 = np.random.randint(1, 10, size=src1_shape).astype(dtype) # src1 row vectors + + # Generate golden: dst = src0 + broadcast(src1) + golden = np.zeros(dst_shape, dtype=dtype) + dst_vr, dst_vc = dst_valid_shape + src0_vr, src0_vc = src0_valid_shape + src1_vr, src1_vc = src1_valid_shape + + if src1_vc == 1: + # src1Col=1: broadcast first column to all dst columns + golden[:dst_vr, :dst_vc] = ( + input1[:src0_vr, :src0_vc] + input2[:src1_vr, 0:1] + ).astype(dtype, copy=False) + else: + # src1Col>1: each src1 column maps to dstCol/src1_vc columns + # dst[:, block*repeat:(block+1)*repeat] = src0 + src1[:, block:block+1] + repeat = dst_vc // src1_vc + for block in range(src1_vc): + start_col = block * repeat + end_col = min((block + 1) * repeat, dst_vc) + golden[:dst_vr, start_col:end_col] = ( + input1[:src0_vr, start_col:end_col] + input2[:src1_vr, block:block+1] + ).astype(dtype, copy=False) + + save_case_data(case["name"], {"input1": input1, "input2": input2, "golden": golden}) + print(f"[INFO] gen_data: {case['name']} src0={src0_shape} src1={src1_shape} dst={dst_shape} dtype={dtype.__name__}") diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/trowexpandadd/launch.cpp b/test/tilelang_st/npu/a5/src/st/smoke/testcase/trowexpandadd/launch.cpp new file mode 100644 index 000000000..741e216ee --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/trowexpandadd/launch.cpp @@ -0,0 +1,27 @@ +// Copyright (c) 2026 Huawei Technologies Co., Ltd. +// This program is free software, you can redistribute it and/or modify it under the terms and conditions of +// CANN Open Software License Agreement Version 2.0 (the "License"). +// Please refer to the License for details. You may not use this file except in compliance with the License. +// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. 
+// See LICENSE in the root of the software repository for the full text of the License. + +#include + +#ifndef AICORE +#define AICORE [aicore] +#endif + +// f32 kernels + +extern "C" __global__ AICORE void TROWEXPANDADD_f32_16x32(__gm__ float *src0, __gm__ float *src1, __gm__ float *dst); +extern "C" __global__ AICORE void TROWEXPANDADD_i32_16x32(__gm__ int32_t *src0, __gm__ int32_t *src1, __gm__ int32_t *dst); + +void LaunchTROWEXPANDADD_f32_16x32(float *src0, float *src1, float *dst, void *stream) { + TROWEXPANDADD_f32_16x32<<<1, nullptr, stream>>>((__gm__ float *)src0, (__gm__ float *)src1, (__gm__ float *)dst); +} + + +void LaunchTROWEXPANDADD_i32_16x32(void *src0, void *src1, void *dst, void *stream) { + TROWEXPANDADD_i32_16x32<<<1, nullptr, stream>>>((__gm__ int32_t *)src0, (__gm__ int32_t *)src1, (__gm__ int32_t *)dst); +} diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/trowexpandadd/main.cpp b/test/tilelang_st/npu/a5/src/st/smoke/testcase/trowexpandadd/main.cpp new file mode 100644 index 000000000..1515c5373 --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/trowexpandadd/main.cpp @@ -0,0 +1,154 @@ +// Copyright (c) 2026 Huawei Technologies Co., Ltd. +// This program is free software, you can redistribute it and/or modify it under the terms and conditions of +// CANN Open Software License Agreement Version 2.0 (the "License"). +// Please refer to the License for details. You may not use this file except in compliance with the License. +// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +// See LICENSE in the root of the software repository for the full text of the License. + +// Host driver for TileLang trowexpandadd ST — row-wise broadcast addition. 
+// Supports multiple data types: f32, f16, i32, i16 + +#include "acl/acl.h" +#include "test_common.h" +#include +#include +#include +#include +#include +#include + +using namespace PtoTestCommon; + +// Kernel launch wrappers (defined in launch.cpp) +// f32 +void LaunchTROWEXPANDADD_f32_16x32(float *src0, float *src1, float *dst, void *stream); +void LaunchTROWEXPANDADD_f32_56x128(float *src0, float *src1, float *dst, void *stream); +// f16 (use void* for aclFloat16) +void LaunchTROWEXPANDADD_f16_16x128(void *src0, void *src1, void *dst, void *stream); +// i32 +void LaunchTROWEXPANDADD_i32_16x32(void *src0, void *src1, void *dst, void *stream); +// i16 + +// Note: launchTRowExpandAdd2 with src1Col=8 has different semantics - TBD + +// Generic launch function type +using LaunchFn = void (*)(void *, void *, void *, void *); + +struct TestCase { + const char *name; + LaunchFn launch; + size_t src0Rows; + size_t src0Cols; + size_t src1Rows; + size_t src1Cols; // physical src1 cols = 32/sizeof(dtype) + size_t dstRows; + size_t dstCols; + size_t dstValidCols; // effective dst cols + size_t elemSize; +}; + +static const TestCase kCases[] = { + // f32 cases +{"f32_16x32", (LaunchFn)LaunchTROWEXPANDADD_f32_16x32, 16, 32, 16, 8, 16, 32, 32, sizeof(float)}, +{"i32_16x32", LaunchTROWEXPANDADD_i32_16x32, 16, 32, 16, 8, 16, 32, 32, sizeof(int32_t)}, +}; +static constexpr size_t kNumCases = sizeof(kCases) / sizeof(kCases[0]); + +static int RunCase(const TestCase &tc, int deviceId, aclrtStream stream) { + int rc = 0; + size_t src0FileSize = tc.src0Rows * tc.src0Cols * tc.elemSize; + size_t src1FileSize = tc.src1Rows * tc.src1Cols * tc.elemSize; + size_t dstFileSize = tc.dstRows * tc.dstCols * tc.elemSize; + + std::printf("[INFO] === case: %s (src0=%zux%zu, src1=%zux%zu, dst=%zux%zu) ===\n", + tc.name, tc.src0Rows, tc.src0Cols, tc.src1Rows, tc.src1Cols, tc.dstRows, tc.dstCols); + + std::string caseDir = std::string("./") + tc.name; + + void *src0Host = nullptr, *src1Host = nullptr, 
*dstHost = nullptr; + void *src0Device = nullptr, *src1Device = nullptr, *dstDevice = nullptr; + + aclrtMallocHost((void **)(&src0Host), src0FileSize); + aclrtMallocHost((void **)(&src1Host), src1FileSize); + aclrtMallocHost((void **)(&dstHost), dstFileSize); + + aclrtMalloc((void **)&src0Device, src0FileSize, ACL_MEM_MALLOC_HUGE_FIRST); + aclrtMalloc((void **)&src1Device, src1FileSize, ACL_MEM_MALLOC_HUGE_FIRST); + aclrtMalloc((void **)&dstDevice, dstFileSize, ACL_MEM_MALLOC_HUGE_FIRST); + + if (!ReadFile((caseDir + "/input1.bin").c_str(), src0FileSize, src0Host, src0FileSize)) { + std::fprintf(stderr, "[ERROR] failed to read %s/input1.bin\n", caseDir.c_str()); + rc = 1; + } + if (rc == 0 && !ReadFile((caseDir + "/input2.bin").c_str(), src1FileSize, src1Host, src1FileSize)) { + std::fprintf(stderr, "[ERROR] failed to read %s/input2.bin\n", caseDir.c_str()); + rc = 1; + } + + if (rc == 0) { + aclrtMemcpy(src0Device, src0FileSize, src0Host, src0FileSize, ACL_MEMCPY_HOST_TO_DEVICE); + aclrtMemcpy(src1Device, src1FileSize, src1Host, src1FileSize, ACL_MEMCPY_HOST_TO_DEVICE); + + tc.launch(src0Device, src1Device, dstDevice, stream); + + aclrtSynchronizeStream(stream); + aclrtMemcpy(dstHost, dstFileSize, dstDevice, dstFileSize, ACL_MEMCPY_DEVICE_TO_HOST); + } + + if (rc == 0 && !WriteFile((caseDir + "/output.bin").c_str(), dstHost, dstFileSize)) { + std::fprintf(stderr, "[ERROR] failed to write %s/output.bin\n", caseDir.c_str()); + rc = 1; + } + + if (src0Device != nullptr) + aclrtFree(src0Device); + if (src1Device != nullptr) + aclrtFree(src1Device); + if (dstDevice != nullptr) + aclrtFree(dstDevice); + if (src0Host != nullptr) + aclrtFreeHost(src0Host); + if (src1Host != nullptr) + aclrtFreeHost(src1Host); + if (dstHost != nullptr) + aclrtFreeHost(dstHost); + + if (rc == 0) + std::printf("[INFO] case %s done\n", tc.name); + return rc; +} + +int main(int argc, char *argv[]) { + const char *caseFilter = (argc > 1) ? 
argv[1] : nullptr; + + int rc = 0; + int deviceId = 0; + aclrtStream stream = nullptr; + + aclInit(nullptr); + if (const char *envDevice = std::getenv("ACL_DEVICE_ID")) { + deviceId = std::atoi(envDevice); + } + aclrtSetDevice(deviceId); + aclrtCreateStream(&stream); + + for (size_t i = 0; i < kNumCases; ++i) { + if (caseFilter != nullptr && std::strcmp(kCases[i].name, caseFilter) != 0) { + continue; + } + int ret = RunCase(kCases[i], deviceId, stream); + if (ret != 0) { + std::fprintf(stderr, "[ERROR] case %s failed\n", kCases[i].name); + rc = 1; + break; + } + } + + if (stream != nullptr) + aclrtDestroyStream(stream); + aclrtResetDevice(deviceId); + aclFinalize(); + + return rc; +} diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/trowexpandadd/trowexpandadd.pto b/test/tilelang_st/npu/a5/src/st/smoke/testcase/trowexpandadd/trowexpandadd.pto new file mode 100644 index 000000000..8117451eb --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/trowexpandadd/trowexpandadd.pto @@ -0,0 +1,131 @@ +// Copyright (c) 2026 Huawei Technologies Co., Ltd. +// This program is free software, you can redistribute it and/or modify it under the terms and conditions of +// CANN Open Software License Agreement Version 2.0 (the "License"). +// Please refer to the License for details. You can not use this file except in compliance with the License. +// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +// See LICENSE in the root of the software repository for the full text of the License. + +// TileLang ST kernels for pto.trowexpandadd: row-wise broadcast addition. +// dst = src0 + broadcast(src1) where src1 is expanded across columns. 
+// src1 physical cols = 32/sizeof(dtype) for NPU alignment +// src1 v_col = src1Col from template (1 or 8) + +module attributes {pto.target_arch = "a5", pto.kernel_kind = #pto.kernel_kind} { + // f32_16x32: dstRow=16, dstCol=32, src1Row=16, src1Col=1 + func.func @TROWEXPANDADD_f32_16x32(%src0_ptr: !pto.ptr, %src1_ptr: !pto.ptr, %dst_ptr: !pto.ptr) attributes {pto.kernel} { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c8 = arith.constant 8 : index + %c16 = arith.constant 16 : index + %c32 = arith.constant 32 : index + %c128 = arith.constant 128 : index + %c512 = arith.constant 512 : index + + %src0_view = pto.make_tensor_view %src0_ptr, + shape = [%c1, %c1, %c1, %c16, %c32], + strides = [%c512, %c512, %c512, %c32, %c1] + : !pto.tensor_view<1x1x1x16x32xf32> + %src1_view = pto.make_tensor_view %src1_ptr, + shape = [%c1, %c1, %c1, %c16, %c8], + strides = [%c128, %c128, %c128, %c8, %c1] + : !pto.tensor_view<1x1x1x16x8xf32> + %dst_view = pto.make_tensor_view %dst_ptr, + shape = [%c1, %c1, %c1, %c16, %c32], + strides = [%c512, %c512, %c512, %c32, %c1] + : !pto.tensor_view<1x1x1x16x32xf32> + + %src0_part = pto.partition_view %src0_view, + offsets = [%c0, %c0, %c0, %c0, %c0], + sizes = [%c1, %c1, %c1, %c16, %c32] + : !pto.tensor_view<1x1x1x16x32xf32> -> !pto.partition_tensor_view<1x1x1x16x32xf32> + %src1_part = pto.partition_view %src1_view, + offsets = [%c0, %c0, %c0, %c0, %c0], + sizes = [%c1, %c1, %c1, %c16, %c8] + : !pto.tensor_view<1x1x1x16x8xf32> -> !pto.partition_tensor_view<1x1x1x16x8xf32> + %dst_part = pto.partition_view %dst_view, + offsets = [%c0, %c0, %c0, %c0, %c0], + sizes = [%c1, %c1, %c1, %c16, %c32] + : !pto.tensor_view<1x1x1x16x32xf32> -> !pto.partition_tensor_view<1x1x1x16x32xf32> + + %src0 = pto.alloc_tile + : !pto.tile_buf + %src1 = pto.alloc_tile + : !pto.tile_buf + %dst = pto.alloc_tile + : !pto.tile_buf + + pto.tload ins(%src0_part : !pto.partition_tensor_view<1x1x1x16x32xf32>) + outs(%src0 : !pto.tile_buf) + pto.tload 
ins(%src1_part : !pto.partition_tensor_view<1x1x1x16x8xf32>) + outs(%src1 : !pto.tile_buf) + + pto.trowexpandadd ins(%src0, %src1 : !pto.tile_buf, + !pto.tile_buf) + outs(%dst : !pto.tile_buf) + + pto.tstore ins(%dst : !pto.tile_buf) + outs(%dst_part : !pto.partition_tensor_view<1x1x1x16x32xf32>) + return + } + + // i32_16x32: dstRow=16, dstCol=32, src1Row=16, src1Col=1 + + func.func @TROWEXPANDADD_i32_16x32(%src0_ptr: !pto.ptr, %src1_ptr: !pto.ptr, %dst_ptr: !pto.ptr) attributes {pto.kernel} { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c8 = arith.constant 8 : index + %c16 = arith.constant 16 : index + %c32 = arith.constant 32 : index + %c128 = arith.constant 128 : index + %c512 = arith.constant 512 : index + + %src0_view = pto.make_tensor_view %src0_ptr, + shape = [%c1, %c1, %c1, %c16, %c32], + strides = [%c512, %c512, %c512, %c32, %c1] + : !pto.tensor_view<1x1x1x16x32xi32> + %src1_view = pto.make_tensor_view %src1_ptr, + shape = [%c1, %c1, %c1, %c16, %c8], + strides = [%c128, %c128, %c128, %c8, %c1] + : !pto.tensor_view<1x1x1x16x8xi32> + %dst_view = pto.make_tensor_view %dst_ptr, + shape = [%c1, %c1, %c1, %c16, %c32], + strides = [%c512, %c512, %c512, %c32, %c1] + : !pto.tensor_view<1x1x1x16x32xi32> + + %src0_part = pto.partition_view %src0_view, + offsets = [%c0, %c0, %c0, %c0, %c0], + sizes = [%c1, %c1, %c1, %c16, %c32] + : !pto.tensor_view<1x1x1x16x32xi32> -> !pto.partition_tensor_view<1x1x1x16x32xi32> + %src1_part = pto.partition_view %src1_view, + offsets = [%c0, %c0, %c0, %c0, %c0], + sizes = [%c1, %c1, %c1, %c16, %c8] + : !pto.tensor_view<1x1x1x16x8xi32> -> !pto.partition_tensor_view<1x1x1x16x8xi32> + %dst_part = pto.partition_view %dst_view, + offsets = [%c0, %c0, %c0, %c0, %c0], + sizes = [%c1, %c1, %c1, %c16, %c32] + : !pto.tensor_view<1x1x1x16x32xi32> -> !pto.partition_tensor_view<1x1x1x16x32xi32> + + %src0 = pto.alloc_tile + : !pto.tile_buf + %src1 = pto.alloc_tile + : !pto.tile_buf + %dst = pto.alloc_tile + : !pto.tile_buf 
+ + pto.tload ins(%src0_part : !pto.partition_tensor_view<1x1x1x16x32xi32>) + outs(%src0 : !pto.tile_buf) + pto.tload ins(%src1_part : !pto.partition_tensor_view<1x1x1x16x8xi32>) + outs(%src1 : !pto.tile_buf) + + pto.trowexpandadd ins(%src0, %src1 : !pto.tile_buf, + !pto.tile_buf) + outs(%dst : !pto.tile_buf) + + pto.tstore ins(%dst : !pto.tile_buf) + outs(%dst_part : !pto.partition_tensor_view<1x1x1x16x32xi32>) + return + } + + // i16_16x64 (dstRow=16, dstCol=64, src1Row=16, src1Col=1) is not in the smoke subset; no kernel is defined for it here. +} diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/trowexpanddiv/CMakeLists.txt b/test/tilelang_st/npu/a5/src/st/smoke/testcase/trowexpanddiv/CMakeLists.txt new file mode 100644 index 000000000..d181bdb94 --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/trowexpanddiv/CMakeLists.txt @@ -0,0 +1,9 @@ +# Copyright (c) 2026 Huawei Technologies Co., Ltd. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. + +pto_tilelang_vec_st(trowexpanddiv) diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/trowexpanddiv/cases.py b/test/tilelang_st/npu/a5/src/st/smoke/testcase/trowexpanddiv/cases.py new file mode 100644 index 000000000..4f8462e52 --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/trowexpanddiv/cases.py @@ -0,0 +1,137 @@ +#!/usr/bin/python3 +# Copyright (c) 2026 Huawei Technologies Co., Ltd. 
+# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. + +# coding=utf-8 + +"""Single source of truth for trowexpanddiv ST test cases. + +trowexpanddiv: dst = src0 / broadcast(src1) across columns. +- src1Col determines how src1 is broadcast: + - src1Col=1: only first column is valid, broadcast to dstCols + - src1Col>1: each src1 column maps to a block of dst columns (dstCol/src1Col columns per src1 value) +- src1 physical cols = 32/sizeof(dtype) for NPU alignment +- highPrecision: use high precision mode for computation +""" + +import numpy as np + +CASES = [ + # launchTRowExpandDiv + { + "name": "f16_16x32", + "dtype": np.float16, + "src0_shape": (16, 32), + "src0_valid_shape": (16, 32), + "src1_shape": (16, 16), # physical: 32/sizeof(f16)=16 + "src1_valid_shape": (16, 1), + "dst_shape": (16, 32), + "dst_valid_shape": (16, 32), + "eps": 1e-3, + "high_precision": False, + }, + # launchTRowExpandDiv + { + "name": "f32_40x64", + "dtype": np.float32, + "src0_shape": (40, 64), # src0eqdst=true + "src0_valid_shape": (40, 64), + "src1_shape": (40, 8), # physical: 32/sizeof(f32)=8 + "src1_valid_shape": (40, 1), # src1Col=1 + "dst_shape": (40, 64), + "dst_valid_shape": (40, 64), + "eps": 1e-6, + "high_precision": False, + }, + # launchTRowExpandDiv + { + "name": "f32_16x256", + "dtype": np.float32, + "src0_shape": (16, 256), + "src0_valid_shape": (16, 256), + "src1_shape": (16, 8), + "src1_valid_shape": (16, 1), + "dst_shape": (16, 256), + 
"dst_valid_shape": (16, 256), + "eps": 1e-6, + "high_precision": False, + }, + # launchTRowExpandDiv + { + "name": "f16_32x512", + "dtype": np.float16, + "src0_shape": (32, 512), + "src0_valid_shape": (32, 512), + "src1_shape": (32, 16), + "src1_valid_shape": (32, 1), + "dst_shape": (32, 512), + "dst_valid_shape": (32, 512), + "eps": 1e-3, + "high_precision": False, + }, + # launchTRowExpandDiv + { + "name": "f32_16x128_noeq", + "dtype": np.float32, + "src0_shape": (16, 128), # src0eqdst=false + "src0_valid_shape": (16, 128), + "src1_shape": (16, 8), + "src1_valid_shape": (16, 1), + "dst_shape": (16, 128), + "dst_valid_shape": (16, 128), + "eps": 1e-6, + "high_precision": False, + }, + # launchTRowExpandDiv + { + "name": "f16_32x64_noeq", + "dtype": np.float16, + "src0_shape": (32, 64), + "src0_valid_shape": (32, 64), + "src1_shape": (32, 16), + "src1_valid_shape": (32, 1), + "dst_shape": (32, 64), + "dst_valid_shape": (32, 64), + "eps": 1e-3, + "high_precision": False, + }, + # launchTRowExpandDiv + { + "name": "f32_40x32_hp", + "dtype": np.float32, + "src0_shape": (40, 32), + "src0_valid_shape": (40, 32), + "src1_shape": (40, 8), + "src1_valid_shape": (40, 1), + "dst_shape": (40, 32), + "dst_valid_shape": (40, 32), + "eps": 1e-6, + "high_precision": True, + }, + # launchTRowExpandDiv + { + "name": "f16_16x128_hp", + "dtype": np.float16, + "src0_shape": (16, 128), + "src0_valid_shape": (16, 128), + "src1_shape": (16, 16), + "src1_valid_shape": (16, 1), + "dst_shape": (16, 128), + "dst_valid_shape": (16, 128), + "eps": 1e-3, + "high_precision": True, + }, + # Note: launchTRowExpandDiv2 with src1Col>1 has different semantics - TBD +] + +_SMOKE_CASE_NAMES = ['f16_16x32', 'f32_40x32_hp'] +_SMOKE_CASE_NAME_SET = set(_SMOKE_CASE_NAMES) +_missing = [name for name in _SMOKE_CASE_NAMES if name not in {case["name"] for case in CASES}] +if _missing: + raise RuntimeError("unknown smoke case(s): " + ", ".join(_missing)) +CASES = [case for case in CASES if case["name"] in 
_SMOKE_CASE_NAME_SET] diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/trowexpanddiv/compare.py b/test/tilelang_st/npu/a5/src/st/smoke/testcase/trowexpanddiv/compare.py new file mode 100644 index 000000000..4a3b5edc3 --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/trowexpanddiv/compare.py @@ -0,0 +1,61 @@ +#!/usr/bin/python3 +# Copyright (c) 2026 Huawei Technologies Co., Ltd. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. + +# coding=utf-8 + +"""Compare golden and output for trowexpanddiv ST test cases.""" + +import os +import sys +import numpy as np + +from cases import CASES +from st_common import result_cmp, style_fail, style_pass + +# Inline validation for multi-input format (trowexpanddiv uses src0/src1/dst) +REQUIRED_KEYS = {"name", "dtype", "src0_shape", "src0_valid_shape", "src1_shape", + "src1_valid_shape", "dst_shape", "dst_valid_shape"} +for i, case in enumerate(CASES): + missing = REQUIRED_KEYS - case.keys() + if missing: + raise ValueError(f"cases[{i}] ({case.get('name', '?')}) missing keys: {missing}") + + +def main(): + case_filter = sys.argv[1] if len(sys.argv) > 1 else None + + all_passed = True + for case in CASES: + if case_filter is not None and case["name"] != case_filter: + continue + + case_dir = case["name"] + dst_shape = case["dst_shape"] + dst_valid_shape = case["dst_valid_shape"] + dtype = case["dtype"] + + vr, vc = dst_valid_shape + + golden = np.fromfile(os.path.join(case_dir, 
"golden.bin"), dtype=dtype).reshape(dst_shape) + output = np.fromfile(os.path.join(case_dir, "output.bin"), dtype=dtype).reshape(dst_shape) + + ok = result_cmp(golden[:vr, :vc], output[:vr, :vc], case["eps"]) + if ok: + print(style_pass(f"[INFO] {case['name']}: compare passed")) + else: + print(style_fail(f"[ERROR] {case['name']}: compare failed")) + all_passed = False + + if not all_passed: + sys.exit(2) + print(style_pass("[INFO] all cases passed")) + + +if __name__ == "__main__": + main() diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/trowexpanddiv/gen_data.py b/test/tilelang_st/npu/a5/src/st/smoke/testcase/trowexpanddiv/gen_data.py new file mode 100644 index 000000000..504df0a3d --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/trowexpanddiv/gen_data.py @@ -0,0 +1,77 @@ +#!/usr/bin/python3 +# Copyright (c) 2026 Huawei Technologies Co., Ltd. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. + +# coding=utf-8 + +"""Generate input and golden data for trowexpanddiv ST test cases. 
+ +trowexpanddiv: dst = src0 / broadcast(src1) +""" + +import numpy as np +from cases import CASES +from st_common import setup_case_rng, save_case_data + +# Inline validation for multi-input format (trowexpanddiv uses src0/src1/dst) +REQUIRED_KEYS = {"name", "dtype", "src0_shape", "src0_valid_shape", "src1_shape", + "src1_valid_shape", "dst_shape", "dst_valid_shape"} +for i, case in enumerate(CASES): + missing = REQUIRED_KEYS - case.keys() + if missing: + raise ValueError(f"cases[{i}] ({case.get('name', '?')}) missing keys: {missing}") + +for case in CASES: + setup_case_rng(case) + + dtype = case["dtype"] + src0_shape = case["src0_shape"] + src0_valid_shape = case["src0_valid_shape"] + src1_shape = case["src1_shape"] + src1_valid_shape = case["src1_valid_shape"] + dst_shape = case["dst_shape"] + dst_valid_shape = case["dst_valid_shape"] + + input1 = np.random.randint(1, 10, size=src0_shape).astype(dtype) + input2 = np.random.randint(1, 10, size=src1_shape).astype(dtype) + + golden = np.zeros(dst_shape, dtype=dtype) + dst_vr, dst_vc = dst_valid_shape + src0_vr, src0_vc = src0_valid_shape + src1_vr, src1_vc = src1_valid_shape + + # Compute golden based on src1Col semantics + # src1Col=1: broadcast single column to all dst columns + # src1Col>1: each src1 column broadcasts to dst_vc/src1_vc columns + if dtype in (np.int8, np.int16, np.int32): + if src1_vc == 1: + golden[:dst_vr, :dst_vc] = ( + input1[:src0_vr, :src0_vc] // input2[:src1_vr, 0:1] + ).astype(dtype, copy=False) + else: + # src1Col > 1: each src1 column broadcasts to dst_vc/src1_vc dst columns + block_size = dst_vc // src1_vc + for c in range(src1_vc): + golden[:dst_vr, c*block_size:(c+1)*block_size] = ( + input1[:src0_vr, c*block_size:(c+1)*block_size] // input2[:src1_vr, c:c+1] + ).astype(dtype, copy=False) + else: + if src1_vc == 1: + golden[:dst_vr, :dst_vc] = ( + input1[:src0_vr, :src0_vc] / input2[:src1_vr, 0:1] + ).astype(dtype, copy=False) + else: + # src1Col > 1: each src1 column broadcasts to 
dst_vc/src1_vc dst columns + block_size = dst_vc // src1_vc + for c in range(src1_vc): + golden[:dst_vr, c*block_size:(c+1)*block_size] = ( + input1[:src0_vr, c*block_size:(c+1)*block_size] / input2[:src1_vr, c:c+1] + ).astype(dtype, copy=False) + + save_case_data(case["name"], {"input1": input1, "input2": input2, "golden": golden}) + print(f"[INFO] gen_data: {case['name']} dtype={dtype.__name__}") diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/trowexpanddiv/launch.cpp b/test/tilelang_st/npu/a5/src/st/smoke/testcase/trowexpanddiv/launch.cpp new file mode 100644 index 000000000..db0bb00bd --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/trowexpanddiv/launch.cpp @@ -0,0 +1,28 @@ +// Copyright (c) 2026 Huawei Technologies Co., Ltd. +// This program is free software, you can redistribute it and/or modify it under the terms and conditions of +// CANN Open Software License Agreement Version 2.0 (the "License"). +// Please refer to the License for details. You may not use this file except in compliance with the License. +// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +// See LICENSE in the root of the software repository for the full text of the License. 
+ +#include + +#ifndef AICORE +#define AICORE [aicore] +#endif + +// f32 kernels + +extern "C" __global__ AICORE void TROWEXPANDDIV_f32_40x32_hp(__gm__ float *src0, __gm__ float *src1, __gm__ float *dst); +extern "C" __global__ AICORE void TROWEXPANDDIV_f16_16x32(__gm__ uint16_t *src0, __gm__ uint16_t *src1, __gm__ uint16_t *dst); + +void LaunchTROWEXPANDDIV_f32_40x32_hp(float *src0, float *src1, float *dst, void *stream) { + TROWEXPANDDIV_f32_40x32_hp<<<1, nullptr, stream>>>((__gm__ float *)src0, (__gm__ float *)src1, (__gm__ float *)dst); +} + + + +void LaunchTROWEXPANDDIV_f16_16x32(void *src0, void *src1, void *dst, void *stream) { + TROWEXPANDDIV_f16_16x32<<<1, nullptr, stream>>>((__gm__ uint16_t *)src0, (__gm__ uint16_t *)src1, (__gm__ uint16_t *)dst); +} diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/trowexpanddiv/main.cpp b/test/tilelang_st/npu/a5/src/st/smoke/testcase/trowexpanddiv/main.cpp new file mode 100644 index 000000000..379d76c40 --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/trowexpanddiv/main.cpp @@ -0,0 +1,121 @@ +// Copyright (c) 2026 Huawei Technologies Co., Ltd. +// This program is free software, you can redistribute it and/or modify it under the terms and conditions of +// CANN Open Software License Agreement Version 2.0 (the "License"). +// Please refer to the License for details. You may not use this file except in compliance with the License. +// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +// See LICENSE in the root of the software repository for the full text of the License. + +// Host driver for TileLang trowexpanddiv ST — row-wise broadcast division. 
+// Supports f32, f16 +// Div variants: src1Col=1 (broadcast single value) or src1Col>1 (block broadcast) + +#include "acl/acl.h" +#include "test_common.h" +#include +#include +#include +#include +#include + +using namespace PtoTestCommon; + +// f32 kernels +void LaunchTROWEXPANDDIV_f32_16x256(float *src0, float *src1, float *dst, void *stream); +void LaunchTROWEXPANDDIV_f32_40x32_hp(float *src0, float *src1, float *dst, void *stream); +// f16 kernels (use void* for aclFloat16) +void LaunchTROWEXPANDDIV_f16_16x32(void *src0, void *src1, void *dst, void *stream); +void LaunchTROWEXPANDDIV_f16_32x512(void *src0, void *src1, void *dst, void *stream); +void LaunchTROWEXPANDDIV_f16_16x128_hp(void *src0, void *src1, void *dst, void *stream); + +using LaunchFn = void (*)(void *, void *, void *, void *); + +struct TestCase { + const char *name; + LaunchFn launch; + size_t src0Rows, src0Cols, src1Rows, src1Cols, dstRows, dstCols; + size_t dstValidRows, dstValidCols; + size_t elemSize; +}; + +static const TestCase kCases[] = { + // f32 cases +{"f16_16x32", LaunchTROWEXPANDDIV_f16_16x32, 16, 32, 16, 16, 16, 32, 16, 32, sizeof(uint16_t)}, +{"f32_40x32_hp", (LaunchFn)LaunchTROWEXPANDDIV_f32_40x32_hp, 40, 32, 40, 8, 40, 32, 40, 32, sizeof(float)}, +}; +static constexpr size_t kNumCases = sizeof(kCases) / sizeof(kCases[0]); + +static int RunCase(const TestCase &tc, int deviceId, aclrtStream stream) { + int rc = 0; + size_t src0FileSize = tc.src0Rows * tc.src0Cols * tc.elemSize; + size_t src1FileSize = tc.src1Rows * tc.src1Cols * tc.elemSize; + const size_t dstFileSize = tc.dstRows * tc.dstCols * tc.elemSize; + + std::printf("[INFO] === case: %s (src0=%zux%zu, src1=%zux%zu, dst=%zux%zu) ===\n", + tc.name, tc.src0Rows, tc.src0Cols, tc.src1Rows, tc.src1Cols, tc.dstRows, tc.dstCols); + + std::string caseDir = std::string("./") + tc.name; + + void *src0Host = nullptr, *src1Host = nullptr, *dstHost = nullptr; + void *src0Device = nullptr, *src1Device = nullptr, *dstDevice = nullptr; + 
+ aclrtMallocHost((void **)(&src0Host), src0FileSize); + aclrtMallocHost((void **)(&src1Host), src1FileSize); + aclrtMallocHost((void **)(&dstHost), dstFileSize); + aclrtMalloc((void **)&src0Device, src0FileSize, ACL_MEM_MALLOC_HUGE_FIRST); + aclrtMalloc((void **)&src1Device, src1FileSize, ACL_MEM_MALLOC_HUGE_FIRST); + aclrtMalloc((void **)&dstDevice, dstFileSize, ACL_MEM_MALLOC_HUGE_FIRST); + + if (!ReadFile((caseDir + "/input1.bin").c_str(), src0FileSize, src0Host, src0FileSize)) { + std::fprintf(stderr, "[ERROR] failed to read %s/input1.bin\n", caseDir.c_str()); + rc = 1; + } + if (rc == 0 && !ReadFile((caseDir + "/input2.bin").c_str(), src1FileSize, src1Host, src1FileSize)) { + std::fprintf(stderr, "[ERROR] failed to read %s/input2.bin\n", caseDir.c_str()); + rc = 1; + } + + if (rc == 0) { + aclrtMemcpy(src0Device, src0FileSize, src0Host, src0FileSize, ACL_MEMCPY_HOST_TO_DEVICE); + aclrtMemcpy(src1Device, src1FileSize, src1Host, src1FileSize, ACL_MEMCPY_HOST_TO_DEVICE); + tc.launch(src0Device, src1Device, dstDevice, stream); + aclrtSynchronizeStream(stream); + aclrtMemcpy(dstHost, dstFileSize, dstDevice, dstFileSize, ACL_MEMCPY_DEVICE_TO_HOST); + } + + if (rc == 0 && !WriteFile((caseDir + "/output.bin").c_str(), dstHost, dstFileSize)) { + std::fprintf(stderr, "[ERROR] failed to write %s/output.bin\n", caseDir.c_str()); + rc = 1; + } + + if (src0Device) aclrtFree(src0Device); + if (src1Device) aclrtFree(src1Device); + if (dstDevice) aclrtFree(dstDevice); + if (src0Host) aclrtFreeHost(src0Host); + if (src1Host) aclrtFreeHost(src1Host); + if (dstHost) aclrtFreeHost(dstHost); + + if (rc == 0) std::printf("[INFO] case %s done\n", tc.name); + return rc; +} + +int main(int argc, char *argv[]) { + const char *caseFilter = (argc > 1) ? 
argv[1] : nullptr; + int rc = 0, deviceId = 0; + aclrtStream stream = nullptr; + + aclInit(nullptr); + if (const char *envDevice = std::getenv("ACL_DEVICE_ID")) deviceId = std::atoi(envDevice); + aclrtSetDevice(deviceId); + aclrtCreateStream(&stream); + + for (size_t i = 0; i < kNumCases; ++i) { + if (caseFilter && std::strcmp(kCases[i].name, caseFilter) != 0) continue; + if (RunCase(kCases[i], deviceId, stream) != 0) { rc = 1; break; } + } + + if (stream) aclrtDestroyStream(stream); + aclrtResetDevice(deviceId); + aclFinalize(); + return rc; +} diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/trowexpanddiv/trowexpanddiv.pto b/test/tilelang_st/npu/a5/src/st/smoke/testcase/trowexpanddiv/trowexpanddiv.pto new file mode 100644 index 000000000..d76b1e681 --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/trowexpanddiv/trowexpanddiv.pto @@ -0,0 +1,126 @@ +// Copyright (c) 2026 Huawei Technologies Co., Ltd. +// This program is free software, you can redistribute it and/or modify it under the terms and conditions of +// CANN Open Software License Agreement Version 2.0 (the "License"). +// Please refer to the License for details. You can not use this file except in compliance with the License. +// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +// See LICENSE in the root of the software repository for the full text of the License. + +// TileLang ST kernels for pto.trowexpanddiv: row-wise broadcast division. +// Supports f32, f16 types. 
+// src1Col=1: broadcast single column value to all dst columns +// src1Col>1: each src1 column broadcasts to dstCol/src1Col columns + +module attributes {pto.target_arch = "a5", pto.kernel_kind = #pto.kernel_kind} { + // f32_40x64: launchTRowExpandDiv + func.func @TROWEXPANDDIV_f16_16x32(%src0_ptr: !pto.ptr, %src1_ptr: !pto.ptr, %dst_ptr: !pto.ptr) attributes {pto.kernel} { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c16 = arith.constant 16 : index + %c32 = arith.constant 32 : index + %c256 = arith.constant 256 : index + %c512 = arith.constant 512 : index + + %src0_view = pto.make_tensor_view %src0_ptr, + shape = [%c1, %c1, %c1, %c16, %c32], + strides = [%c512, %c512, %c512, %c32, %c1] + : !pto.tensor_view<1x1x1x16x32xf16> + %src1_view = pto.make_tensor_view %src1_ptr, + shape = [%c1, %c1, %c1, %c16, %c16], + strides = [%c256, %c256, %c256, %c16, %c1] + : !pto.tensor_view<1x1x1x16x16xf16> + %dst_view = pto.make_tensor_view %dst_ptr, + shape = [%c1, %c1, %c1, %c16, %c32], + strides = [%c512, %c512, %c512, %c32, %c1] + : !pto.tensor_view<1x1x1x16x32xf16> + + %src0_part = pto.partition_view %src0_view, + offsets = [%c0, %c0, %c0, %c0, %c0], + sizes = [%c1, %c1, %c1, %c16, %c32] + : !pto.tensor_view<1x1x1x16x32xf16> -> !pto.partition_tensor_view<1x1x1x16x32xf16> + %src1_part = pto.partition_view %src1_view, + offsets = [%c0, %c0, %c0, %c0, %c0], + sizes = [%c1, %c1, %c1, %c16, %c16] + : !pto.tensor_view<1x1x1x16x16xf16> -> !pto.partition_tensor_view<1x1x1x16x16xf16> + %dst_part = pto.partition_view %dst_view, + offsets = [%c0, %c0, %c0, %c0, %c0], + sizes = [%c1, %c1, %c1, %c16, %c32] + : !pto.tensor_view<1x1x1x16x32xf16> -> !pto.partition_tensor_view<1x1x1x16x32xf16> + + %src0 = pto.alloc_tile + : !pto.tile_buf + %src1 = pto.alloc_tile + : !pto.tile_buf + %dst = pto.alloc_tile + : !pto.tile_buf + + pto.tload ins(%src0_part : !pto.partition_tensor_view<1x1x1x16x32xf16>) + outs(%src0 : !pto.tile_buf) + pto.tload ins(%src1_part : 
!pto.partition_tensor_view<1x1x1x16x16xf16>) + outs(%src1 : !pto.tile_buf) + + pto.trowexpanddiv ins(%src0, %src1 : !pto.tile_buf, !pto.tile_buf) outs(%dst : !pto.tile_buf) {src1Col = 1 : i64, src0eqdst = true, highPrecision = false} + + pto.tstore ins(%dst : !pto.tile_buf) + outs(%dst_part : !pto.partition_tensor_view<1x1x1x16x32xf16>) + return + } + + // f16_32x512: launchTRowExpandDiv + + func.func @TROWEXPANDDIV_f32_40x32_hp(%src0_ptr: !pto.ptr, %src1_ptr: !pto.ptr, %dst_ptr: !pto.ptr) attributes {pto.kernel} { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c8 = arith.constant 8 : index + %c32 = arith.constant 32 : index + %c40 = arith.constant 40 : index + %c320 = arith.constant 320 : index + %c1280 = arith.constant 1280 : index + + %src0_view = pto.make_tensor_view %src0_ptr, + shape = [%c1, %c1, %c1, %c40, %c32], + strides = [%c1280, %c1280, %c1280, %c32, %c1] + : !pto.tensor_view<1x1x1x40x32xf32> + %src1_view = pto.make_tensor_view %src1_ptr, + shape = [%c1, %c1, %c1, %c40, %c8], + strides = [%c320, %c320, %c320, %c8, %c1] + : !pto.tensor_view<1x1x1x40x8xf32> + %dst_view = pto.make_tensor_view %dst_ptr, + shape = [%c1, %c1, %c1, %c40, %c32], + strides = [%c1280, %c1280, %c1280, %c32, %c1] + : !pto.tensor_view<1x1x1x40x32xf32> + + %src0_part = pto.partition_view %src0_view, + offsets = [%c0, %c0, %c0, %c0, %c0], + sizes = [%c1, %c1, %c1, %c40, %c32] + : !pto.tensor_view<1x1x1x40x32xf32> -> !pto.partition_tensor_view<1x1x1x40x32xf32> + %src1_part = pto.partition_view %src1_view, + offsets = [%c0, %c0, %c0, %c0, %c0], + sizes = [%c1, %c1, %c1, %c40, %c8] + : !pto.tensor_view<1x1x1x40x8xf32> -> !pto.partition_tensor_view<1x1x1x40x8xf32> + %dst_part = pto.partition_view %dst_view, + offsets = [%c0, %c0, %c0, %c0, %c0], + sizes = [%c1, %c1, %c1, %c40, %c32] + : !pto.tensor_view<1x1x1x40x32xf32> -> !pto.partition_tensor_view<1x1x1x40x32xf32> + + %src0 = pto.alloc_tile + : !pto.tile_buf + %src1 = pto.alloc_tile + : !pto.tile_buf + %dst = 
pto.alloc_tile + : !pto.tile_buf + + pto.tload ins(%src0_part : !pto.partition_tensor_view<1x1x1x40x32xf32>) + outs(%src0 : !pto.tile_buf) + pto.tload ins(%src1_part : !pto.partition_tensor_view<1x1x1x40x8xf32>) + outs(%src1 : !pto.tile_buf) + + pto.trowexpanddiv ins(%src0, %src1 : !pto.tile_buf, !pto.tile_buf) outs(%dst : !pto.tile_buf) {src1Col = 1 : i64, src0eqdst = true, highPrecision = true} + + pto.tstore ins(%dst : !pto.tile_buf) + outs(%dst_part : !pto.partition_tensor_view<1x1x1x40x32xf32>) + return + } + + // f16_16x128_hp: launchTRowExpandDiv (highPrecision) +} diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/trowexpandexpdif/CMakeLists.txt b/test/tilelang_st/npu/a5/src/st/smoke/testcase/trowexpandexpdif/CMakeLists.txt new file mode 100644 index 000000000..adaec0668 --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/trowexpandexpdif/CMakeLists.txt @@ -0,0 +1,9 @@ +# Copyright (c) 2026 Huawei Technologies Co., Ltd. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. + +pto_tilelang_vec_st(trowexpandexpdif) diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/trowexpandexpdif/cases.py b/test/tilelang_st/npu/a5/src/st/smoke/testcase/trowexpandexpdif/cases.py new file mode 100644 index 000000000..d0cfa0a88 --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/trowexpandexpdif/cases.py @@ -0,0 +1,94 @@ +#!/usr/bin/python3 +# Copyright (c) 2026 Huawei Technologies Co., Ltd. 
+# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. + +# coding=utf-8 + +"""Single source of truth for trowexpandexpdif ST test cases. + +trowexpandexpdif: dst = exp(src0 - broadcast(src1)) +- src1Col=1: only first column of src1 is valid, broadcast to dstCols +- src1Col>1: launchTRowExpandExpdif2 with different semantics (TBD) +- src1 physical cols = 32/sizeof(dtype) for NPU alignment +- src0eqdst: true means src0 shape equals dst shape +""" + +import numpy as np + +CASES = [ + # launchTRowExpandExpdif + { + "name": "f32_16x32", + "dtype": np.float32, + "src0_shape": (16, 32), + "src0_valid_shape": (16, 32), + "src1_shape": (16, 8), + "src1_valid_shape": (16, 1), + "dst_shape": (16, 32), + "dst_valid_shape": (16, 32), + "eps": 1e-5, + }, + # launchTRowExpandExpdif + { + "name": "f32_32x64", + "dtype": np.float32, + "src0_shape": (32, 64), + "src0_valid_shape": (32, 64), + "src1_shape": (32, 8), # physical: 32/sizeof(f32)=8 + "src1_valid_shape": (32, 1), # src1Col=1 + "dst_shape": (32, 64), + "dst_valid_shape": (32, 64), + "eps": 1e-5, + }, + # launchTRowExpandExpdif + { + "name": "f16_16x32", + "dtype": np.float16, + "src0_shape": (16, 32), + "src0_valid_shape": (16, 32), + "src1_shape": (16, 16), # physical: 32/sizeof(f16)=16 + "src1_valid_shape": (16, 1), + "dst_shape": (16, 32), + "dst_valid_shape": (16, 32), + "eps": 1e-3, + }, + # launchTRowExpandExpdif + { + "name": "f16_48x64", + "dtype": np.float16, + "src0_shape": (48, 64), + 
"src0_valid_shape": (48, 64), + "src1_shape": (48, 16), + "src1_valid_shape": (48, 1), + "dst_shape": (48, 64), + "dst_valid_shape": (48, 64), + "eps": 1e-3, + }, + # launchTRowExpandExpdif + { + "name": "f32_16x128_noeq", + "dtype": np.float32, + "src0_shape": (16, 128), # src0eqdst=false + "src0_valid_shape": (16, 128), + "src1_shape": (16, 8), + "src1_valid_shape": (16, 1), + "dst_shape": (16, 128), + "dst_valid_shape": (16, 128), + "eps": 1e-5, + }, + # Note: launchTRowExpandExpdif2 with src1Col>1 has different semantics - TBD + # - float, 24, 64, 24, 8, true (src1Col=8) + # - aclFloat16, 16, 64, 16, 16, false (src1Col=16) +] + +_SMOKE_CASE_NAMES = ['f32_16x32', 'f16_16x32'] +_SMOKE_CASE_NAME_SET = set(_SMOKE_CASE_NAMES) +_missing = [name for name in _SMOKE_CASE_NAMES if name not in {case["name"] for case in CASES}] +if _missing: + raise RuntimeError("unknown smoke case(s): " + ", ".join(_missing)) +CASES = [case for case in CASES if case["name"] in _SMOKE_CASE_NAME_SET] diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/trowexpandexpdif/compare.py b/test/tilelang_st/npu/a5/src/st/smoke/testcase/trowexpandexpdif/compare.py new file mode 100644 index 000000000..982658309 --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/trowexpandexpdif/compare.py @@ -0,0 +1,61 @@ +#!/usr/bin/python3 +# Copyright (c) 2026 Huawei Technologies Co., Ltd. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. 
+ +# coding=utf-8 + +"""Compare golden and output for trowexpandexpdif ST test cases.""" + +import os +import sys +import numpy as np + +from cases import CASES +from st_common import result_cmp, style_fail, style_pass + +# Inline validation for multi-input format (trowexpandexpdif uses src0/src1/dst) +REQUIRED_KEYS = {"name", "dtype", "src0_shape", "src0_valid_shape", "src1_shape", + "src1_valid_shape", "dst_shape", "dst_valid_shape"} +for i, case in enumerate(CASES): + missing = REQUIRED_KEYS - case.keys() + if missing: + raise ValueError(f"cases[{i}] ({case.get('name', '?')}) missing keys: {missing}") + + +def main(): + case_filter = sys.argv[1] if len(sys.argv) > 1 else None + + all_passed = True + for case in CASES: + if case_filter is not None and case["name"] != case_filter: + continue + + case_dir = case["name"] + dst_shape = case["dst_shape"] + dst_valid_shape = case["dst_valid_shape"] + dtype = case["dtype"] + + vr, vc = dst_valid_shape + + golden = np.fromfile(os.path.join(case_dir, "golden.bin"), dtype=dtype).reshape(dst_shape) + output = np.fromfile(os.path.join(case_dir, "output.bin"), dtype=dtype).reshape(dst_shape) + + ok = result_cmp(golden[:vr, :vc], output[:vr, :vc], case["eps"]) + if ok: + print(style_pass(f"[INFO] {case['name']}: compare passed")) + else: + print(style_fail(f"[ERROR] {case['name']}: compare failed")) + all_passed = False + + if not all_passed: + sys.exit(2) + print(style_pass("[INFO] all cases passed")) + + +if __name__ == "__main__": + main() diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/trowexpandexpdif/gen_data.py b/test/tilelang_st/npu/a5/src/st/smoke/testcase/trowexpandexpdif/gen_data.py new file mode 100644 index 000000000..a58fa394a --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/trowexpandexpdif/gen_data.py @@ -0,0 +1,54 @@ +#!/usr/bin/python3 +# Copyright (c) 2026 Huawei Technologies Co., Ltd. 
+# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. + +# coding=utf-8 + +"""Generate input and golden data for trowexpandexpdif ST test cases. + +trowexpandexpdif: dst = exp(src0 - broadcast(src1)) +""" + +import numpy as np +from cases import CASES +from st_common import setup_case_rng, save_case_data + +# Inline validation for multi-input format (trowexpandexpdif uses src0/src1/dst) +REQUIRED_KEYS = {"name", "dtype", "src0_shape", "src0_valid_shape", "src1_shape", + "src1_valid_shape", "dst_shape", "dst_valid_shape"} +for i, case in enumerate(CASES): + missing = REQUIRED_KEYS - case.keys() + if missing: + raise ValueError(f"cases[{i}] ({case.get('name', '?')}) missing keys: {missing}") + +for case in CASES: + setup_case_rng(case) + + dtype = case["dtype"] + src0_shape = case["src0_shape"] + src0_valid_shape = case["src0_valid_shape"] + src1_shape = case["src1_shape"] + src1_valid_shape = case["src1_valid_shape"] + dst_shape = case["dst_shape"] + dst_valid_shape = case["dst_valid_shape"] + + # Use small values to avoid overflow in exp + input1 = np.random.randint(1, 5, size=src0_shape).astype(dtype) + input2 = np.random.randint(1, 5, size=src1_shape).astype(dtype) + + golden = np.zeros(dst_shape, dtype=dtype) + dst_vr, dst_vc = dst_valid_shape + src0_vr, src0_vc = src0_valid_shape + src1_vr = src1_valid_shape[0] + + # exp(src0 - src1_scalar) + diff = input1[:src0_vr, :src0_vc] - input2[:src1_vr, 0:1] + golden[:dst_vr, :dst_vc] = 
np.exp(diff).astype(dtype, copy=False) + + save_case_data(case["name"], {"input1": input1, "input2": input2, "golden": golden}) + print(f"[INFO] gen_data: {case['name']} dtype={dtype.__name__}") diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/trowexpandexpdif/launch.cpp b/test/tilelang_st/npu/a5/src/st/smoke/testcase/trowexpandexpdif/launch.cpp new file mode 100644 index 000000000..b33a3640d --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/trowexpandexpdif/launch.cpp @@ -0,0 +1,27 @@ +// Copyright (c) 2026 Huawei Technologies Co., Ltd. +// This program is free software, you can redistribute it and/or modify it under the terms and conditions of +// CANN Open Software License Agreement Version 2.0 (the "License"). +// Please refer to the License for details. You may not use this file except in compliance with the License. +// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +// See LICENSE in the root of the software repository for the full text of the License. 
+ +#include + +#ifndef AICORE +#define AICORE [aicore] +#endif + +// f32 kernels + +extern "C" __global__ AICORE void TROWEXPANDEXPDIF_f32_16x32(__gm__ float *src0, __gm__ float *src1, __gm__ float *dst); +extern "C" __global__ AICORE void TROWEXPANDEXPDIF_f16_16x32(__gm__ uint16_t *src0, __gm__ uint16_t *src1, __gm__ uint16_t *dst); + +void LaunchTROWEXPANDEXPDIF_f16_16x32(void *src0, void *src1, void *dst, void *stream) { + TROWEXPANDEXPDIF_f16_16x32<<<1, nullptr, stream>>>((__gm__ uint16_t *)src0, (__gm__ uint16_t *)src1, (__gm__ uint16_t *)dst); +} + + +void LaunchTROWEXPANDEXPDIF_f32_16x32(float *src0, float *src1, float *dst, void *stream) { + TROWEXPANDEXPDIF_f32_16x32<<<1, nullptr, stream>>>((__gm__ float *)src0, (__gm__ float *)src1, (__gm__ float *)dst); +} diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/trowexpandexpdif/main.cpp b/test/tilelang_st/npu/a5/src/st/smoke/testcase/trowexpandexpdif/main.cpp new file mode 100644 index 000000000..948a790fe --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/trowexpandexpdif/main.cpp @@ -0,0 +1,120 @@ +// Copyright (c) 2026 Huawei Technologies Co., Ltd. +// This program is free software, you can redistribute it and/or modify it under the terms and conditions of +// CANN Open Software License Agreement Version 2.0 (the "License"). +// Please refer to the License for details. You may not use this file except in compliance with the License. +// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +// See LICENSE in the root of the software repository for the full text of the License. + +// Host driver for TileLang trowexpandexpdif ST — row-wise broadcast exponential difference. 
+// Supports f32, f16 + +#include "acl/acl.h" +#include "test_common.h" +#include +#include +#include +#include +#include + +using namespace PtoTestCommon; + +// f32 kernels +void LaunchTROWEXPANDEXPDIF_f32_16x32(float *src0, float *src1, float *dst, void *stream); +// f16 kernels (use void* for aclFloat16) +void LaunchTROWEXPANDEXPDIF_f16_16x32(void *src0, void *src1, void *dst, void *stream); +void LaunchTROWEXPANDEXPDIF_f16_48x64(void *src0, void *src1, void *dst, void *stream); + +// Note: launchTRowExpandExpdif2 with src1Col>1 has different semantics - TBD + +using LaunchFn = void (*)(void *, void *, void *, void *); + +struct TestCase { + const char *name; + LaunchFn launch; + size_t src0Rows, src0Cols, src1Rows, src1Cols, dstRows, dstCols; + size_t dstValidRows, dstValidCols; + size_t elemSize; +}; + +static const TestCase kCases[] = { + // f32 cases +{"f32_16x32", (LaunchFn)LaunchTROWEXPANDEXPDIF_f32_16x32, 16, 32, 16, 8, 16, 32, 16, 32, sizeof(float)}, +{"f16_16x32", LaunchTROWEXPANDEXPDIF_f16_16x32, 16, 32, 16, 16, 16, 32, 16, 32, sizeof(uint16_t)}, +}; +static constexpr size_t kNumCases = sizeof(kCases) / sizeof(kCases[0]); + +static int RunCase(const TestCase &tc, int deviceId, aclrtStream stream) { + int rc = 0; + size_t src0FileSize = tc.src0Rows * tc.src0Cols * tc.elemSize; + size_t src1FileSize = tc.src1Rows * tc.src1Cols * tc.elemSize; + const size_t dstFileSize = tc.dstRows * tc.dstCols * tc.elemSize; + + std::printf("[INFO] === case: %s (src0=%zux%zu, src1=%zux%zu, dst=%zux%zu) ===\n", + tc.name, tc.src0Rows, tc.src0Cols, tc.src1Rows, tc.src1Cols, tc.dstRows, tc.dstCols); + + std::string caseDir = std::string("./") + tc.name; + + void *src0Host = nullptr, *src1Host = nullptr, *dstHost = nullptr; + void *src0Device = nullptr, *src1Device = nullptr, *dstDevice = nullptr; + + aclrtMallocHost((void **)(&src0Host), src0FileSize); + aclrtMallocHost((void **)(&src1Host), src1FileSize); + aclrtMallocHost((void **)(&dstHost), dstFileSize); + 
aclrtMalloc((void **)&src0Device, src0FileSize, ACL_MEM_MALLOC_HUGE_FIRST); + aclrtMalloc((void **)&src1Device, src1FileSize, ACL_MEM_MALLOC_HUGE_FIRST); + aclrtMalloc((void **)&dstDevice, dstFileSize, ACL_MEM_MALLOC_HUGE_FIRST); + + if (!ReadFile((caseDir + "/input1.bin").c_str(), src0FileSize, src0Host, src0FileSize)) { + std::fprintf(stderr, "[ERROR] failed to read %s/input1.bin\n", caseDir.c_str()); + rc = 1; + } + if (rc == 0 && !ReadFile((caseDir + "/input2.bin").c_str(), src1FileSize, src1Host, src1FileSize)) { + std::fprintf(stderr, "[ERROR] failed to read %s/input2.bin\n", caseDir.c_str()); + rc = 1; + } + + if (rc == 0) { + aclrtMemcpy(src0Device, src0FileSize, src0Host, src0FileSize, ACL_MEMCPY_HOST_TO_DEVICE); + aclrtMemcpy(src1Device, src1FileSize, src1Host, src1FileSize, ACL_MEMCPY_HOST_TO_DEVICE); + tc.launch(src0Device, src1Device, dstDevice, stream); + aclrtSynchronizeStream(stream); + aclrtMemcpy(dstHost, dstFileSize, dstDevice, dstFileSize, ACL_MEMCPY_DEVICE_TO_HOST); + } + + if (rc == 0 && !WriteFile((caseDir + "/output.bin").c_str(), dstHost, dstFileSize)) { + std::fprintf(stderr, "[ERROR] failed to write %s/output.bin\n", caseDir.c_str()); + rc = 1; + } + + if (src0Device) aclrtFree(src0Device); + if (src1Device) aclrtFree(src1Device); + if (dstDevice) aclrtFree(dstDevice); + if (src0Host) aclrtFreeHost(src0Host); + if (src1Host) aclrtFreeHost(src1Host); + if (dstHost) aclrtFreeHost(dstHost); + + if (rc == 0) std::printf("[INFO] case %s done\n", tc.name); + return rc; +} + +int main(int argc, char *argv[]) { + const char *caseFilter = (argc > 1) ? 
argv[1] : nullptr; + int rc = 0, deviceId = 0; + aclrtStream stream = nullptr; + + aclInit(nullptr); + if (const char *envDevice = std::getenv("ACL_DEVICE_ID")) deviceId = std::atoi(envDevice); + aclrtSetDevice(deviceId); + aclrtCreateStream(&stream); + + for (size_t i = 0; i < kNumCases; ++i) { + if (caseFilter && std::strcmp(kCases[i].name, caseFilter) != 0) continue; + if (RunCase(kCases[i], deviceId, stream) != 0) { rc = 1; break; } + } + + if (stream) aclrtDestroyStream(stream); + aclrtResetDevice(deviceId); + aclFinalize(); + return rc; +} diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/trowexpandexpdif/trowexpandexpdif.pto b/test/tilelang_st/npu/a5/src/st/smoke/testcase/trowexpandexpdif/trowexpandexpdif.pto new file mode 100644 index 000000000..a0ed5a461 --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/trowexpandexpdif/trowexpandexpdif.pto @@ -0,0 +1,125 @@ +// Copyright (c) 2026 Huawei Technologies Co., Ltd. +// This program is free software, you can redistribute it and/or modify it under the terms and conditions of +// CANN Open Software License Agreement Version 2.0 (the "License"). +// Please refer to the License for details. You can not use this file except in compliance with the License. +// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +// See LICENSE in the root of the software repository for the full text of the License. + +// TileLang ST kernels for pto.trowexpandexpdif: row-wise broadcast exponential difference. 
// Smoke kernels for pto.trowexpandexpdif. Each kernel wraps its three pointers
// in 5-D tensor views (leading dims are 1), takes a full-size partition of each
// view, loads src0/src1 into tiles, applies the op with src1Col = 1 and stores
// the dst tile back.
module attributes {pto.target_arch = "a5", pto.kernel_kind = #pto.kernel_kind} {
  // f32_16x32: launchTRowExpandExpdif
  // src0/dst are 16x32 (row stride 32, 512 elements in total); src1 is 16x8
  // (row stride 8, 128 elements in total).
  func.func @TROWEXPANDEXPDIF_f32_16x32(%src0_ptr: !pto.ptr, %src1_ptr: !pto.ptr, %dst_ptr: !pto.ptr) attributes {pto.kernel} {
    // Index constants used for shapes, strides and offsets below.
    %c0 = arith.constant 0 : index
    %c1 = arith.constant 1 : index
    %c8 = arith.constant 8 : index
    %c16 = arith.constant 16 : index
    %c32 = arith.constant 32 : index
    %c128 = arith.constant 128 : index
    %c512 = arith.constant 512 : index

    // Tensor views over the three kernel arguments.
    %src0_view = pto.make_tensor_view %src0_ptr,
      shape = [%c1, %c1, %c1, %c16, %c32],
      strides = [%c512, %c512, %c512, %c32, %c1]
      : !pto.tensor_view<1x1x1x16x32xf32>
    %src1_view = pto.make_tensor_view %src1_ptr,
      shape = [%c1, %c1, %c1, %c16, %c8],
      strides = [%c128, %c128, %c128, %c8, %c1]
      : !pto.tensor_view<1x1x1x16x8xf32>
    %dst_view = pto.make_tensor_view %dst_ptr,
      shape = [%c1, %c1, %c1, %c16, %c32],
      strides = [%c512, %c512, %c512, %c32, %c1]
      : !pto.tensor_view<1x1x1x16x32xf32>

    // Partitions start at offset 0 and span the whole view.
    %src0_part = pto.partition_view %src0_view,
      offsets = [%c0, %c0, %c0, %c0, %c0],
      sizes = [%c1, %c1, %c1, %c16, %c32]
      : !pto.tensor_view<1x1x1x16x32xf32> -> !pto.partition_tensor_view<1x1x1x16x32xf32>
    %src1_part = pto.partition_view %src1_view,
      offsets = [%c0, %c0, %c0, %c0, %c0],
      sizes = [%c1, %c1, %c1, %c16, %c8]
      : !pto.tensor_view<1x1x1x16x8xf32> -> !pto.partition_tensor_view<1x1x1x16x8xf32>
    %dst_part = pto.partition_view %dst_view,
      offsets = [%c0, %c0, %c0, %c0, %c0],
      sizes = [%c1, %c1, %c1, %c16, %c32]
      : !pto.tensor_view<1x1x1x16x32xf32> -> !pto.partition_tensor_view<1x1x1x16x32xf32>

    // One tile per operand.
    %src0 = pto.alloc_tile
      : !pto.tile_buf
    %src1 = pto.alloc_tile
      : !pto.tile_buf
    %dst = pto.alloc_tile
      : !pto.tile_buf

    // Load both inputs into their tiles.
    pto.tload ins(%src0_part : !pto.partition_tensor_view<1x1x1x16x32xf32>)
      outs(%src0 : !pto.tile_buf)
    pto.tload ins(%src1_part : !pto.partition_tensor_view<1x1x1x16x8xf32>)
      outs(%src1 : !pto.tile_buf)

    // Op under test; the file header describes it as dst = exp(src0 - broadcast(src1)).
    pto.trowexpandexpdif ins(%src0, %src1 : !pto.tile_buf, !pto.tile_buf) outs(%dst : !pto.tile_buf) {src1Col = 1 : i64, src0eqdst = true}

    // Write the result tile back through the dst partition.
    pto.tstore ins(%dst : !pto.tile_buf)
      outs(%dst_part : !pto.partition_tensor_view<1x1x1x16x32xf32>)
    return
  }

  // f16_16x32: launchTRowExpandExpdif
  // src0/dst are 16x32 (row stride 32, 512 elements in total); src1 is 16x16
  // (row stride 16, 256 elements in total).
  func.func @TROWEXPANDEXPDIF_f16_16x32(%src0_ptr: !pto.ptr, %src1_ptr: !pto.ptr, %dst_ptr: !pto.ptr) attributes {pto.kernel} {
    // Index constants used for shapes, strides and offsets below.
    %c0 = arith.constant 0 : index
    %c1 = arith.constant 1 : index
    %c16 = arith.constant 16 : index
    %c32 = arith.constant 32 : index
    %c256 = arith.constant 256 : index
    %c512 = arith.constant 512 : index

    // Tensor views over the three kernel arguments.
    %src0_view = pto.make_tensor_view %src0_ptr,
      shape = [%c1, %c1, %c1, %c16, %c32],
      strides = [%c512, %c512, %c512, %c32, %c1]
      : !pto.tensor_view<1x1x1x16x32xf16>
    %src1_view = pto.make_tensor_view %src1_ptr,
      shape = [%c1, %c1, %c1, %c16, %c16],
      strides = [%c256, %c256, %c256, %c16, %c1]
      : !pto.tensor_view<1x1x1x16x16xf16>
    %dst_view = pto.make_tensor_view %dst_ptr,
      shape = [%c1, %c1, %c1, %c16, %c32],
      strides = [%c512, %c512, %c512, %c32, %c1]
      : !pto.tensor_view<1x1x1x16x32xf16>

    // Partitions start at offset 0 and span the whole view.
    %src0_part = pto.partition_view %src0_view,
      offsets = [%c0, %c0, %c0, %c0, %c0],
      sizes = [%c1, %c1, %c1, %c16, %c32]
      : !pto.tensor_view<1x1x1x16x32xf16> -> !pto.partition_tensor_view<1x1x1x16x32xf16>
    %src1_part = pto.partition_view %src1_view,
      offsets = [%c0, %c0, %c0, %c0, %c0],
      sizes = [%c1, %c1, %c1, %c16, %c16]
      : !pto.tensor_view<1x1x1x16x16xf16> -> !pto.partition_tensor_view<1x1x1x16x16xf16>
    %dst_part = pto.partition_view %dst_view,
      offsets = [%c0, %c0, %c0, %c0, %c0],
      sizes = [%c1, %c1, %c1, %c16, %c32]
      : !pto.tensor_view<1x1x1x16x32xf16> -> !pto.partition_tensor_view<1x1x1x16x32xf16>

    // One tile per operand.
    %src0 = pto.alloc_tile
      : !pto.tile_buf
    %src1 = pto.alloc_tile
      : !pto.tile_buf
    %dst = pto.alloc_tile
      : !pto.tile_buf

    // Load both inputs into their tiles.
    pto.tload ins(%src0_part : !pto.partition_tensor_view<1x1x1x16x32xf16>)
      outs(%src0 : !pto.tile_buf)
    pto.tload ins(%src1_part : !pto.partition_tensor_view<1x1x1x16x16xf16>)
      outs(%src1 : !pto.tile_buf)

    // Op under test, same attributes as the f32 kernel.
    pto.trowexpandexpdif ins(%src0, %src1 : !pto.tile_buf, !pto.tile_buf) outs(%dst : !pto.tile_buf) {src1Col = 1 : i64, src0eqdst = true}

    // Write the result tile back through the dst partition.
    pto.tstore ins(%dst : !pto.tile_buf)
      outs(%dst_part : !pto.partition_tensor_view<1x1x1x16x32xf16>)
    return
  }

  // f16_48x64: launchTRowExpandExpdif -- no kernel is defined for this case in this module.
}
def _row_expand_case(name, dtype, rows, cols, eps):
    """Return one trowexpandmax case dict for a rows x cols tile.

    src0 and dst share the full (rows, cols) shape and are fully valid.
    src1 is stored with 32 // itemsize columns (8 for 4-byte dtypes, 16 for
    2-byte dtypes), of which only the first column is valid (src1Col=1).
    """
    full_shape = (rows, cols)
    return {
        "name": name,
        "dtype": dtype,
        "src0_shape": full_shape,
        "src0_valid_shape": full_shape,
        "src1_shape": (rows, 32 // np.dtype(dtype).itemsize),
        "src1_valid_shape": (rows, 1),
        "dst_shape": full_shape,
        "dst_valid_shape": full_shape,
        "eps": eps,
    }


# All entries correspond to launchTRowExpandMax.
CASES = [
    _row_expand_case("f32_16x32", np.float32, 16, 32, 1e-6),
    _row_expand_case("f32_56x128", np.float32, 56, 128, 1e-6),
    _row_expand_case("f16_48x64", np.float16, 48, 64, 1e-3),
    _row_expand_case("f16_16x128", np.float16, 16, 128, 1e-3),
    _row_expand_case("f16_32x64_noeq", np.float16, 32, 64, 1e-3),  # src0eqdst=false
    _row_expand_case("i32_16x32", np.int32, 16, 32, 0),
    _row_expand_case("i16_16x64", np.int16, 16, 64, 0),
    # Note: launchTRowExpandMax2 with src1Col>1 has different semantics - TBD
    #   - float, 24, 64, 24, 8, true  (src1Col=8)
    #   - float, 20, 64, 20, 8, false (src1Col=8)
]

# Smoke subset: only these cases are kept; an unknown name aborts the import.
_SMOKE_CASE_NAMES = ['f32_16x32', 'i32_16x32']
_SMOKE_CASE_NAME_SET = set(_SMOKE_CASE_NAMES)
_known_case_names = {entry["name"] for entry in CASES}
_missing = [wanted for wanted in _SMOKE_CASE_NAMES if wanted not in _known_case_names]
if _missing:
    raise RuntimeError("unknown smoke case(s): " + ", ".join(_missing))
CASES = [entry for entry in CASES if entry["name"] in _SMOKE_CASE_NAME_SET]
def main():
    """Compare golden.bin against output.bin for each selected case.

    An optional first command-line argument selects a single case by name;
    without it every entry of CASES is compared. Both files are read from the
    directory named after the case, reshaped to ``dst_shape`` and compared over
    the ``dst_valid_shape`` region with the case's ``eps``.

    Exits with status 2 when any comparison fails, or when a filter was given
    but names no known case (otherwise an unknown name would be reported as
    "all cases passed" without anything having been compared).
    """
    case_filter = sys.argv[1] if len(sys.argv) > 1 else None

    selected = [case for case in CASES if case_filter is None or case["name"] == case_filter]
    if case_filter is not None and not selected:
        print(style_fail(f"[ERROR] unknown case: {case_filter}"))
        sys.exit(2)

    all_passed = True
    for case in selected:
        case_dir = case["name"]
        dst_shape = case["dst_shape"]
        dtype = case["dtype"]
        vr, vc = case["dst_valid_shape"]

        golden = np.fromfile(os.path.join(case_dir, "golden.bin"), dtype=dtype).reshape(dst_shape)
        output = np.fromfile(os.path.join(case_dir, "output.bin"), dtype=dtype).reshape(dst_shape)

        # Only the valid region takes part in the comparison.
        if result_cmp(golden[:vr, :vc], output[:vr, :vc], case["eps"]):
            print(style_pass(f"[INFO] {case['name']}: compare passed"))
        else:
            print(style_fail(f"[ERROR] {case['name']}: compare failed"))
            all_passed = False

    if not all_passed:
        sys.exit(2)
    print(style_pass("[INFO] all cases passed"))
# Inline validation for the multi-input format (trowexpandmax uses src0/src1/dst):
# every case must provide these keys before any data is generated.
REQUIRED_KEYS = {"name", "dtype", "src0_shape", "src0_valid_shape", "src1_shape",
                 "src1_valid_shape", "dst_shape", "dst_valid_shape"}
for index, spec in enumerate(CASES):
    absent = REQUIRED_KEYS - spec.keys()
    if absent:
        raise ValueError(f"cases[{index}] ({spec.get('name', '?')}) missing keys: {absent}")

# Per case: draw both inputs, build golden = max(src0, first column of src1
# broadcast across the valid dst region), then save input1/input2/golden.
for spec in CASES:
    setup_case_rng(spec)

    elem_type = spec["dtype"]
    out_rows, out_cols = spec["dst_valid_shape"]
    lhs_rows, lhs_cols = spec["src0_valid_shape"]
    rhs_rows = spec["src1_valid_shape"][0]

    # Keep this draw order (input1, then input2): both come from the shared
    # np.random state. NOTE(review): presumably seeded by setup_case_rng - confirm.
    lhs = np.random.randint(1, 10, size=spec["src0_shape"]).astype(elem_type)
    rhs = np.random.randint(1, 10, size=spec["src1_shape"]).astype(elem_type)

    # Only column 0 of src1 contributes; it is expanded to the valid dst extent.
    per_row = np.broadcast_to(rhs[:rhs_rows, 0:1], (out_rows, out_cols))
    expected = np.zeros(spec["dst_shape"], dtype=elem_type)
    expected[:out_rows, :out_cols] = np.maximum(lhs[:lhs_rows, :lhs_cols], per_row).astype(
        elem_type, copy=False
    )

    save_case_data(spec["name"], {"input1": lhs, "input2": rhs, "golden": expected})
    print(f"[INFO] gen_data: {spec['name']} dtype={elem_type.__name__}")
// Kernel entry points (f32 and i32). Only declared here; the definitions are
// not part of this file.

extern "C" __global__ AICORE void TROWEXPANDMAX_f32_16x32(__gm__ float *src0, __gm__ float *src1, __gm__ float *dst);
extern "C" __global__ AICORE void TROWEXPANDMAX_i32_16x32(__gm__ int32_t *src0, __gm__ int32_t *src1, __gm__ int32_t *dst);

// Host-side wrapper for the i32 16x32 kernel: casts the untyped pointers to
// __gm__ int32_t* and launches the kernel on `stream`.
// NOTE(review): the launch arguments <<<1, nullptr, stream>>> presumably mean
// block count 1 and no L2 control - confirm against the Ascend C launch syntax.
void LaunchTROWEXPANDMAX_i32_16x32(void *src0, void *src1, void *dst, void *stream) {
    TROWEXPANDMAX_i32_16x32<<<1, nullptr, stream>>>((__gm__ int32_t *)src0, (__gm__ int32_t *)src1, (__gm__ int32_t *)dst);
}

// Host-side wrapper for the f32 16x32 kernel. Unlike the i32 wrapper it takes
// typed float* arguments; it casts them to __gm__ float* and launches the
// kernel on `stream` with the same launch arguments.
void LaunchTROWEXPANDMAX_f32_16x32(float *src0, float *src1, float *dst, void *stream) {
    TROWEXPANDMAX_f32_16x32<<<1, nullptr, stream>>>((__gm__ float *)src0, (__gm__ float *)src1, (__gm__ float *)dst);
}
You may not use this file except in compliance with the License. +// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +// See LICENSE in the root of the software repository for the full text of the License. + +// Host driver for TileLang trowexpandmax ST — row-wise broadcast maximum. +// Supports f32, f16, i32, i16 + +#include "acl/acl.h" +#include "test_common.h" +#include +#include +#include +#include +#include + +using namespace PtoTestCommon; + +// f32 kernels +void LaunchTROWEXPANDMAX_f32_16x32(float *src0, float *src1, float *dst, void *stream); +void LaunchTROWEXPANDMAX_f32_56x128(float *src0, float *src1, float *dst, void *stream); +// f16 kernels (use void* for aclFloat16) +void LaunchTROWEXPANDMAX_f16_16x128(void *src0, void *src1, void *dst, void *stream); +// i32 kernels +void LaunchTROWEXPANDMAX_i32_16x32(void *src0, void *src1, void *dst, void *stream); +// i16 kernels + +// Note: launchTRowExpandMax2 with src1Col>1 has different semantics - TBD + +using LaunchFn = void (*)(void *, void *, void *, void *); + +struct TestCase { + const char *name; + LaunchFn launch; + size_t src0Rows, src0Cols, src1Rows, src1Cols, dstRows, dstCols; + size_t dstValidRows, dstValidCols; + size_t elemSize; +}; + +static const TestCase kCases[] = { + // f32 cases +{"f32_16x32", (LaunchFn)LaunchTROWEXPANDMAX_f32_16x32, 16, 32, 16, 8, 16, 32, 16, 32, sizeof(float)}, +{"i32_16x32", LaunchTROWEXPANDMAX_i32_16x32, 16, 32, 16, 8, 16, 32, 16, 32, sizeof(int32_t)}, +}; +static constexpr size_t kNumCases = sizeof(kCases) / sizeof(kCases[0]); + +static int RunCase(const TestCase &tc, int deviceId, aclrtStream stream) { + int rc = 0; + size_t src0FileSize = tc.src0Rows * tc.src0Cols * tc.elemSize; + size_t src1FileSize = tc.src1Rows * tc.src1Cols * tc.elemSize; + const size_t dstFileSize = tc.dstRows * tc.dstCols * 
tc.elemSize; + + std::printf("[INFO] === case: %s (src0=%zux%zu, src1=%zux%zu, dst=%zux%zu) ===\n", + tc.name, tc.src0Rows, tc.src0Cols, tc.src1Rows, tc.src1Cols, tc.dstRows, tc.dstCols); + + std::string caseDir = std::string("./") + tc.name; + + void *src0Host = nullptr, *src1Host = nullptr, *dstHost = nullptr; + void *src0Device = nullptr, *src1Device = nullptr, *dstDevice = nullptr; + + aclrtMallocHost((void **)(&src0Host), src0FileSize); + aclrtMallocHost((void **)(&src1Host), src1FileSize); + aclrtMallocHost((void **)(&dstHost), dstFileSize); + aclrtMalloc((void **)&src0Device, src0FileSize, ACL_MEM_MALLOC_HUGE_FIRST); + aclrtMalloc((void **)&src1Device, src1FileSize, ACL_MEM_MALLOC_HUGE_FIRST); + aclrtMalloc((void **)&dstDevice, dstFileSize, ACL_MEM_MALLOC_HUGE_FIRST); + + if (!ReadFile((caseDir + "/input1.bin").c_str(), src0FileSize, src0Host, src0FileSize)) { + std::fprintf(stderr, "[ERROR] failed to read %s/input1.bin\n", caseDir.c_str()); + rc = 1; + } + if (rc == 0 && !ReadFile((caseDir + "/input2.bin").c_str(), src1FileSize, src1Host, src1FileSize)) { + std::fprintf(stderr, "[ERROR] failed to read %s/input2.bin\n", caseDir.c_str()); + rc = 1; + } + + if (rc == 0) { + aclrtMemcpy(src0Device, src0FileSize, src0Host, src0FileSize, ACL_MEMCPY_HOST_TO_DEVICE); + aclrtMemcpy(src1Device, src1FileSize, src1Host, src1FileSize, ACL_MEMCPY_HOST_TO_DEVICE); + tc.launch(src0Device, src1Device, dstDevice, stream); + aclrtSynchronizeStream(stream); + aclrtMemcpy(dstHost, dstFileSize, dstDevice, dstFileSize, ACL_MEMCPY_DEVICE_TO_HOST); + } + + if (rc == 0 && !WriteFile((caseDir + "/output.bin").c_str(), dstHost, dstFileSize)) { + std::fprintf(stderr, "[ERROR] failed to write %s/output.bin\n", caseDir.c_str()); + rc = 1; + } + + if (src0Device) aclrtFree(src0Device); + if (src1Device) aclrtFree(src1Device); + if (dstDevice) aclrtFree(dstDevice); + if (src0Host) aclrtFreeHost(src0Host); + if (src1Host) aclrtFreeHost(src1Host); + if (dstHost) aclrtFreeHost(dstHost); + + 
if (rc == 0) std::printf("[INFO] case %s done\n", tc.name); + return rc; +} + +int main(int argc, char *argv[]) { + const char *caseFilter = (argc > 1) ? argv[1] : nullptr; + int rc = 0, deviceId = 0; + aclrtStream stream = nullptr; + + aclInit(nullptr); + if (const char *envDevice = std::getenv("ACL_DEVICE_ID")) deviceId = std::atoi(envDevice); + aclrtSetDevice(deviceId); + aclrtCreateStream(&stream); + + for (size_t i = 0; i < kNumCases; ++i) { + if (caseFilter && std::strcmp(kCases[i].name, caseFilter) != 0) continue; + if (RunCase(kCases[i], deviceId, stream) != 0) { rc = 1; break; } + } + + if (stream) aclrtDestroyStream(stream); + aclrtResetDevice(deviceId); + aclFinalize(); + return rc; +} diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/trowexpandmax/trowexpandmax.pto b/test/tilelang_st/npu/a5/src/st/smoke/testcase/trowexpandmax/trowexpandmax.pto new file mode 100644 index 000000000..0ac36b4e6 --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/trowexpandmax/trowexpandmax.pto @@ -0,0 +1,125 @@ +// Copyright (c) 2026 Huawei Technologies Co., Ltd. +// This program is free software, you can redistribute it and/or modify it under the terms and conditions of +// CANN Open Software License Agreement Version 2.0 (the "License"). +// Please refer to the License for details. You can not use this file except in compliance with the License. +// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +// See LICENSE in the root of the software repository for the full text of the License. + +// TileLang ST kernels for pto.trowexpandmax: row-wise broadcast maximum. 
// Smoke kernels for pto.trowexpandmax. Each kernel wraps its three pointers in
// 5-D tensor views (leading dims are 1), takes a full-size partition of each
// view, loads src0/src1 into tiles, applies the op with src1Col = 1 and stores
// the dst tile back.
module attributes {pto.target_arch = "a5", pto.kernel_kind = #pto.kernel_kind} {
  // f32_16x32: launchTRowExpandMax
  // src0/dst are 16x32 (row stride 32, 512 elements in total); src1 is 16x8
  // (row stride 8, 128 elements in total).
  func.func @TROWEXPANDMAX_f32_16x32(%src0_ptr: !pto.ptr, %src1_ptr: !pto.ptr, %dst_ptr: !pto.ptr) attributes {pto.kernel} {
    // Index constants used for shapes, strides and offsets below.
    %c0 = arith.constant 0 : index
    %c1 = arith.constant 1 : index
    %c8 = arith.constant 8 : index
    %c16 = arith.constant 16 : index
    %c32 = arith.constant 32 : index
    %c128 = arith.constant 128 : index
    %c512 = arith.constant 512 : index

    // Tensor views over the three kernel arguments.
    %src0_view = pto.make_tensor_view %src0_ptr,
      shape = [%c1, %c1, %c1, %c16, %c32],
      strides = [%c512, %c512, %c512, %c32, %c1]
      : !pto.tensor_view<1x1x1x16x32xf32>
    %src1_view = pto.make_tensor_view %src1_ptr,
      shape = [%c1, %c1, %c1, %c16, %c8],
      strides = [%c128, %c128, %c128, %c8, %c1]
      : !pto.tensor_view<1x1x1x16x8xf32>
    %dst_view = pto.make_tensor_view %dst_ptr,
      shape = [%c1, %c1, %c1, %c16, %c32],
      strides = [%c512, %c512, %c512, %c32, %c1]
      : !pto.tensor_view<1x1x1x16x32xf32>

    // Partitions start at offset 0 and span the whole view.
    %src0_part = pto.partition_view %src0_view,
      offsets = [%c0, %c0, %c0, %c0, %c0],
      sizes = [%c1, %c1, %c1, %c16, %c32]
      : !pto.tensor_view<1x1x1x16x32xf32> -> !pto.partition_tensor_view<1x1x1x16x32xf32>
    %src1_part = pto.partition_view %src1_view,
      offsets = [%c0, %c0, %c0, %c0, %c0],
      sizes = [%c1, %c1, %c1, %c16, %c8]
      : !pto.tensor_view<1x1x1x16x8xf32> -> !pto.partition_tensor_view<1x1x1x16x8xf32>
    %dst_part = pto.partition_view %dst_view,
      offsets = [%c0, %c0, %c0, %c0, %c0],
      sizes = [%c1, %c1, %c1, %c16, %c32]
      : !pto.tensor_view<1x1x1x16x32xf32> -> !pto.partition_tensor_view<1x1x1x16x32xf32>

    // One tile per operand.
    %src0 = pto.alloc_tile
      : !pto.tile_buf
    %src1 = pto.alloc_tile
      : !pto.tile_buf
    %dst = pto.alloc_tile
      : !pto.tile_buf

    // Load both inputs into their tiles.
    pto.tload ins(%src0_part : !pto.partition_tensor_view<1x1x1x16x32xf32>)
      outs(%src0 : !pto.tile_buf)
    pto.tload ins(%src1_part : !pto.partition_tensor_view<1x1x1x16x8xf32>)
      outs(%src1 : !pto.tile_buf)

    // Op under test: row-wise broadcast maximum.
    pto.trowexpandmax ins(%src0, %src1 : !pto.tile_buf, !pto.tile_buf) outs(%dst : !pto.tile_buf) {src1Col = 1 : i64, src0eqdst = true}

    // Write the result tile back through the dst partition.
    pto.tstore ins(%dst : !pto.tile_buf)
      outs(%dst_part : !pto.partition_tensor_view<1x1x1x16x32xf32>)
    return
  }

  // i32_16x32: launchTRowExpandMax
  // Same layout as the f32 kernel with i32 elements: src0/dst are 16x32,
  // src1 is 16x8.
  func.func @TROWEXPANDMAX_i32_16x32(%src0_ptr: !pto.ptr, %src1_ptr: !pto.ptr, %dst_ptr: !pto.ptr) attributes {pto.kernel} {
    // Index constants used for shapes, strides and offsets below.
    %c0 = arith.constant 0 : index
    %c1 = arith.constant 1 : index
    %c8 = arith.constant 8 : index
    %c16 = arith.constant 16 : index
    %c32 = arith.constant 32 : index
    %c128 = arith.constant 128 : index
    %c512 = arith.constant 512 : index

    // Tensor views over the three kernel arguments.
    %src0_view = pto.make_tensor_view %src0_ptr,
      shape = [%c1, %c1, %c1, %c16, %c32],
      strides = [%c512, %c512, %c512, %c32, %c1]
      : !pto.tensor_view<1x1x1x16x32xi32>
    %src1_view = pto.make_tensor_view %src1_ptr,
      shape = [%c1, %c1, %c1, %c16, %c8],
      strides = [%c128, %c128, %c128, %c8, %c1]
      : !pto.tensor_view<1x1x1x16x8xi32>
    %dst_view = pto.make_tensor_view %dst_ptr,
      shape = [%c1, %c1, %c1, %c16, %c32],
      strides = [%c512, %c512, %c512, %c32, %c1]
      : !pto.tensor_view<1x1x1x16x32xi32>

    // Partitions start at offset 0 and span the whole view.
    %src0_part = pto.partition_view %src0_view,
      offsets = [%c0, %c0, %c0, %c0, %c0],
      sizes = [%c1, %c1, %c1, %c16, %c32]
      : !pto.tensor_view<1x1x1x16x32xi32> -> !pto.partition_tensor_view<1x1x1x16x32xi32>
    %src1_part = pto.partition_view %src1_view,
      offsets = [%c0, %c0, %c0, %c0, %c0],
      sizes = [%c1, %c1, %c1, %c16, %c8]
      : !pto.tensor_view<1x1x1x16x8xi32> -> !pto.partition_tensor_view<1x1x1x16x8xi32>
    %dst_part = pto.partition_view %dst_view,
      offsets = [%c0, %c0, %c0, %c0, %c0],
      sizes = [%c1, %c1, %c1, %c16, %c32]
      : !pto.tensor_view<1x1x1x16x32xi32> -> !pto.partition_tensor_view<1x1x1x16x32xi32>

    // One tile per operand.
    %src0 = pto.alloc_tile
      : !pto.tile_buf
    %src1 = pto.alloc_tile
      : !pto.tile_buf
    %dst = pto.alloc_tile
      : !pto.tile_buf

    // Load both inputs into their tiles.
    pto.tload ins(%src0_part : !pto.partition_tensor_view<1x1x1x16x32xi32>)
      outs(%src0 : !pto.tile_buf)
    pto.tload ins(%src1_part : !pto.partition_tensor_view<1x1x1x16x8xi32>)
      outs(%src1 : !pto.tile_buf)

    // Op under test, same attributes as the f32 kernel.
    pto.trowexpandmax ins(%src0, %src1 : !pto.tile_buf, !pto.tile_buf) outs(%dst : !pto.tile_buf) {src1Col = 1 : i64, src0eqdst = true}

    // Write the result tile back through the dst partition.
    pto.tstore ins(%dst : !pto.tile_buf)
      outs(%dst_part : !pto.partition_tensor_view<1x1x1x16x32xi32>)
    return
  }

  // f32_56x128 and i16_16x64 (launchTRowExpandMax): no kernels are defined for these cases in this module.
}
def _row_expand_case(name, dtype, rows, cols, eps):
    """Return one trowexpandmin case dict for a rows x cols tile.

    src0 and dst share the full (rows, cols) shape and are fully valid.
    src1 is stored with 32 // itemsize columns (8 for 4-byte dtypes, 16 for
    2-byte dtypes), of which only the first column is valid (src1Col=1).
    """
    full_shape = (rows, cols)
    return {
        "name": name,
        "dtype": dtype,
        "src0_shape": full_shape,
        "src0_valid_shape": full_shape,
        "src1_shape": (rows, 32 // np.dtype(dtype).itemsize),
        "src1_valid_shape": (rows, 1),
        "dst_shape": full_shape,
        "dst_valid_shape": full_shape,
        "eps": eps,
    }


# All entries correspond to launchTRowExpandMin.
CASES = [
    _row_expand_case("f32_16x32", np.float32, 16, 32, 1e-6),
    _row_expand_case("f32_56x128", np.float32, 56, 128, 1e-6),
    _row_expand_case("f16_48x64", np.float16, 48, 64, 1e-3),
    _row_expand_case("f16_16x128", np.float16, 16, 128, 1e-3),
    _row_expand_case("f16_32x64_noeq", np.float16, 32, 64, 1e-3),  # src0eqdst=false
    _row_expand_case("i32_16x32", np.int32, 16, 32, 0),
    _row_expand_case("i16_16x64", np.int16, 16, 64, 0),
    # Note: launchTRowExpandMin2 with src1Col>1 has different semantics - TBD
    #   - float, 24, 64, 24, 8, true  (src1Col=8)
    #   - float, 20, 64, 20, 8, false (src1Col=8)
]

# Smoke subset: only these cases are kept; an unknown name aborts the import.
_SMOKE_CASE_NAMES = ['f32_16x32', 'i32_16x32']
_SMOKE_CASE_NAME_SET = set(_SMOKE_CASE_NAMES)
_known_case_names = {entry["name"] for entry in CASES}
_missing = [wanted for wanted in _SMOKE_CASE_NAMES if wanted not in _known_case_names]
if _missing:
    raise RuntimeError("unknown smoke case(s): " + ", ".join(_missing))
CASES = [entry for entry in CASES if entry["name"] in _SMOKE_CASE_NAME_SET]
You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. + +# coding=utf-8 + +"""Compare golden and output for trowexpandmin ST test cases.""" + +import os +import sys +import numpy as np + +from cases import CASES +from st_common import result_cmp, style_fail, style_pass + +# Inline validation for multi-input format (trowexpandmin uses src0/src1/dst) +REQUIRED_KEYS = {"name", "dtype", "src0_shape", "src0_valid_shape", "src1_shape", + "src1_valid_shape", "dst_shape", "dst_valid_shape"} +for i, case in enumerate(CASES): + missing = REQUIRED_KEYS - case.keys() + if missing: + raise ValueError(f"cases[{i}] ({case.get('name', '?')}) missing keys: {missing}") + + +def main(): + case_filter = sys.argv[1] if len(sys.argv) > 1 else None + + all_passed = True + for case in CASES: + if case_filter is not None and case["name"] != case_filter: + continue + + case_dir = case["name"] + dst_shape = case["dst_shape"] + dst_valid_shape = case["dst_valid_shape"] + dtype = case["dtype"] + + vr, vc = dst_valid_shape + + golden = np.fromfile(os.path.join(case_dir, "golden.bin"), dtype=dtype).reshape(dst_shape) + output = np.fromfile(os.path.join(case_dir, "output.bin"), dtype=dtype).reshape(dst_shape) + + ok = result_cmp(golden[:vr, :vc], output[:vr, :vc], case["eps"]) + if ok: + print(style_pass(f"[INFO] {case['name']}: compare passed")) + else: + print(style_fail(f"[ERROR] {case['name']}: compare failed")) + all_passed = False + + if not all_passed: + sys.exit(2) + print(style_pass("[INFO] all cases passed")) + + +if __name__ == "__main__": + main() diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/trowexpandmin/gen_data.py 
b/test/tilelang_st/npu/a5/src/st/smoke/testcase/trowexpandmin/gen_data.py new file mode 100644 index 000000000..a6cbae105 --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/trowexpandmin/gen_data.py @@ -0,0 +1,53 @@ +#!/usr/bin/python3 +# Copyright (c) 2026 Huawei Technologies Co., Ltd. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. + +# coding=utf-8 + +"""Generate input and golden data for trowexpandmin ST test cases. + +trowexpandmin: dst = min(src0, broadcast(src1)) +""" + +import numpy as np +from cases import CASES +from st_common import setup_case_rng, save_case_data + +# Inline validation for multi-input format (trowexpandmin uses src0/src1/dst) +REQUIRED_KEYS = {"name", "dtype", "src0_shape", "src0_valid_shape", "src1_shape", + "src1_valid_shape", "dst_shape", "dst_valid_shape"} +for i, case in enumerate(CASES): + missing = REQUIRED_KEYS - case.keys() + if missing: + raise ValueError(f"cases[{i}] ({case.get('name', '?')}) missing keys: {missing}") + +for case in CASES: + setup_case_rng(case) + + dtype = case["dtype"] + src0_shape = case["src0_shape"] + src0_valid_shape = case["src0_valid_shape"] + src1_shape = case["src1_shape"] + src1_valid_shape = case["src1_valid_shape"] + dst_shape = case["dst_shape"] + dst_valid_shape = case["dst_valid_shape"] + + input1 = np.random.randint(1, 10, size=src0_shape).astype(dtype) + input2 = np.random.randint(1, 10, size=src1_shape).astype(dtype) + + golden = 
np.zeros(dst_shape, dtype=dtype) + dst_vr, dst_vc = dst_valid_shape + src0_vr, src0_vc = src0_valid_shape + src1_vr = src1_valid_shape[0] + + golden[:dst_vr, :dst_vc] = np.minimum( + input1[:src0_vr, :src0_vc], np.broadcast_to(input2[:src1_vr, 0:1], (dst_vr, dst_vc)) + ).astype(dtype, copy=False) + + save_case_data(case["name"], {"input1": input1, "input2": input2, "golden": golden}) + print(f"[INFO] gen_data: {case['name']} dtype={dtype.__name__}") diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/trowexpandmin/launch.cpp b/test/tilelang_st/npu/a5/src/st/smoke/testcase/trowexpandmin/launch.cpp new file mode 100644 index 000000000..b70c51920 --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/trowexpandmin/launch.cpp @@ -0,0 +1,28 @@ +// Copyright (c) 2026 Huawei Technologies Co., Ltd. +// This program is free software, you can redistribute it and/or modify it under the terms and conditions of +// CANN Open Software License Agreement Version 2.0 (the "License"). +// Please refer to the License for details. You may not use this file except in compliance with the License. +// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +// See LICENSE in the root of the software repository for the full text of the License. 
+ +#include + +#ifndef AICORE +#define AICORE [aicore] +#endif + +// f32 kernels + +extern "C" __global__ AICORE void TROWEXPANDMIN_f32_16x32(__gm__ float *src0, __gm__ float *src1, __gm__ float *dst); +extern "C" __global__ AICORE void TROWEXPANDMIN_i32_16x32(__gm__ int32_t *src0, __gm__ int32_t *src1, __gm__ int32_t *dst); + +void LaunchTROWEXPANDMIN_i32_16x32(void *src0, void *src1, void *dst, void *stream) { + TROWEXPANDMIN_i32_16x32<<<1, nullptr, stream>>>((__gm__ int32_t *)src0, (__gm__ int32_t *)src1, (__gm__ int32_t *)dst); +} + + + +void LaunchTROWEXPANDMIN_f32_16x32(float *src0, float *src1, float *dst, void *stream) { + TROWEXPANDMIN_f32_16x32<<<1, nullptr, stream>>>((__gm__ float *)src0, (__gm__ float *)src1, (__gm__ float *)dst); +} diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/trowexpandmin/main.cpp b/test/tilelang_st/npu/a5/src/st/smoke/testcase/trowexpandmin/main.cpp new file mode 100644 index 000000000..22c136d50 --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/trowexpandmin/main.cpp @@ -0,0 +1,123 @@ +// Copyright (c) 2026 Huawei Technologies Co., Ltd. +// This program is free software, you can redistribute it and/or modify it under the terms and conditions of +// CANN Open Software License Agreement Version 2.0 (the "License"). +// Please refer to the License for details. You may not use this file except in compliance with the License. +// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +// See LICENSE in the root of the software repository for the full text of the License. + +// Host driver for TileLang trowexpandmin ST — row-wise broadcast minimum. 
+// Supports f32, f16, i32, i16 + +#include "acl/acl.h" +#include "test_common.h" +#include +#include +#include +#include +#include + +using namespace PtoTestCommon; + +// f32 kernels +void LaunchTROWEXPANDMIN_f32_16x32(float *src0, float *src1, float *dst, void *stream); +void LaunchTROWEXPANDMIN_f32_56x128(float *src0, float *src1, float *dst, void *stream); +// f16 kernels (use void* for aclFloat16) +void LaunchTROWEXPANDMIN_f16_16x128(void *src0, void *src1, void *dst, void *stream); +// i32 kernels +void LaunchTROWEXPANDMIN_i32_16x32(void *src0, void *src1, void *dst, void *stream); +// i16 kernels + +// Note: launchTRowExpandMin2 with src1Col>1 has different semantics - TBD + +using LaunchFn = void (*)(void *, void *, void *, void *); + +struct TestCase { + const char *name; + LaunchFn launch; + size_t src0Rows, src0Cols, src1Rows, src1Cols, dstRows, dstCols; + size_t dstValidRows, dstValidCols; + size_t elemSize; +}; + +static const TestCase kCases[] = { + // f32 cases +{"f32_16x32", (LaunchFn)LaunchTROWEXPANDMIN_f32_16x32, 16, 32, 16, 8, 16, 32, 16, 32, sizeof(float)}, +{"i32_16x32", LaunchTROWEXPANDMIN_i32_16x32, 16, 32, 16, 8, 16, 32, 16, 32, sizeof(int32_t)}, +}; +static constexpr size_t kNumCases = sizeof(kCases) / sizeof(kCases[0]); + +static int RunCase(const TestCase &tc, int deviceId, aclrtStream stream) { + int rc = 0; + size_t src0FileSize = tc.src0Rows * tc.src0Cols * tc.elemSize; + size_t src1FileSize = tc.src1Rows * tc.src1Cols * tc.elemSize; + const size_t dstFileSize = tc.dstRows * tc.dstCols * tc.elemSize; + + std::printf("[INFO] === case: %s (src0=%zux%zu, src1=%zux%zu, dst=%zux%zu) ===\n", + tc.name, tc.src0Rows, tc.src0Cols, tc.src1Rows, tc.src1Cols, tc.dstRows, tc.dstCols); + + std::string caseDir = std::string("./") + tc.name; + + void *src0Host = nullptr, *src1Host = nullptr, *dstHost = nullptr; + void *src0Device = nullptr, *src1Device = nullptr, *dstDevice = nullptr; + + aclrtMallocHost((void **)(&src0Host), src0FileSize); + 
aclrtMallocHost((void **)(&src1Host), src1FileSize); + aclrtMallocHost((void **)(&dstHost), dstFileSize); + aclrtMalloc((void **)&src0Device, src0FileSize, ACL_MEM_MALLOC_HUGE_FIRST); + aclrtMalloc((void **)&src1Device, src1FileSize, ACL_MEM_MALLOC_HUGE_FIRST); + aclrtMalloc((void **)&dstDevice, dstFileSize, ACL_MEM_MALLOC_HUGE_FIRST); + + if (!ReadFile((caseDir + "/input1.bin").c_str(), src0FileSize, src0Host, src0FileSize)) { + std::fprintf(stderr, "[ERROR] failed to read %s/input1.bin\n", caseDir.c_str()); + rc = 1; + } + if (rc == 0 && !ReadFile((caseDir + "/input2.bin").c_str(), src1FileSize, src1Host, src1FileSize)) { + std::fprintf(stderr, "[ERROR] failed to read %s/input2.bin\n", caseDir.c_str()); + rc = 1; + } + + if (rc == 0) { + aclrtMemcpy(src0Device, src0FileSize, src0Host, src0FileSize, ACL_MEMCPY_HOST_TO_DEVICE); + aclrtMemcpy(src1Device, src1FileSize, src1Host, src1FileSize, ACL_MEMCPY_HOST_TO_DEVICE); + tc.launch(src0Device, src1Device, dstDevice, stream); + aclrtSynchronizeStream(stream); + aclrtMemcpy(dstHost, dstFileSize, dstDevice, dstFileSize, ACL_MEMCPY_DEVICE_TO_HOST); + } + + if (rc == 0 && !WriteFile((caseDir + "/output.bin").c_str(), dstHost, dstFileSize)) { + std::fprintf(stderr, "[ERROR] failed to write %s/output.bin\n", caseDir.c_str()); + rc = 1; + } + + if (src0Device) aclrtFree(src0Device); + if (src1Device) aclrtFree(src1Device); + if (dstDevice) aclrtFree(dstDevice); + if (src0Host) aclrtFreeHost(src0Host); + if (src1Host) aclrtFreeHost(src1Host); + if (dstHost) aclrtFreeHost(dstHost); + + if (rc == 0) std::printf("[INFO] case %s done\n", tc.name); + return rc; +} + +int main(int argc, char *argv[]) { + const char *caseFilter = (argc > 1) ? 
argv[1] : nullptr; + int rc = 0, deviceId = 0; + aclrtStream stream = nullptr; + + aclInit(nullptr); + if (const char *envDevice = std::getenv("ACL_DEVICE_ID")) deviceId = std::atoi(envDevice); + aclrtSetDevice(deviceId); + aclrtCreateStream(&stream); + + for (size_t i = 0; i < kNumCases; ++i) { + if (caseFilter && std::strcmp(kCases[i].name, caseFilter) != 0) continue; + if (RunCase(kCases[i], deviceId, stream) != 0) { rc = 1; break; } + } + + if (stream) aclrtDestroyStream(stream); + aclrtResetDevice(deviceId); + aclFinalize(); + return rc; +} diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/trowexpandmin/trowexpandmin.pto b/test/tilelang_st/npu/a5/src/st/smoke/testcase/trowexpandmin/trowexpandmin.pto new file mode 100644 index 000000000..b718657b3 --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/trowexpandmin/trowexpandmin.pto @@ -0,0 +1,125 @@ +// Copyright (c) 2026 Huawei Technologies Co., Ltd. +// This program is free software, you can redistribute it and/or modify it under the terms and conditions of +// CANN Open Software License Agreement Version 2.0 (the "License"). +// Please refer to the License for details. You can not use this file except in compliance with the License. +// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +// See LICENSE in the root of the software repository for the full text of the License. + +// TileLang ST kernels for pto.trowexpandmin: row-wise broadcast minimum. 
+// Supports f32, f16, i32, i16 + +module attributes {pto.target_arch = "a5", pto.kernel_kind = #pto.kernel_kind} { + // f32_16x32: launchTRowExpandMin + func.func @TROWEXPANDMIN_f32_16x32(%src0_ptr: !pto.ptr, %src1_ptr: !pto.ptr, %dst_ptr: !pto.ptr) attributes {pto.kernel} { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c8 = arith.constant 8 : index + %c16 = arith.constant 16 : index + %c32 = arith.constant 32 : index + %c128 = arith.constant 128 : index + %c512 = arith.constant 512 : index + + %src0_view = pto.make_tensor_view %src0_ptr, + shape = [%c1, %c1, %c1, %c16, %c32], + strides = [%c512, %c512, %c512, %c32, %c1] + : !pto.tensor_view<1x1x1x16x32xf32> + %src1_view = pto.make_tensor_view %src1_ptr, + shape = [%c1, %c1, %c1, %c16, %c8], + strides = [%c128, %c128, %c128, %c8, %c1] + : !pto.tensor_view<1x1x1x16x8xf32> + %dst_view = pto.make_tensor_view %dst_ptr, + shape = [%c1, %c1, %c1, %c16, %c32], + strides = [%c512, %c512, %c512, %c32, %c1] + : !pto.tensor_view<1x1x1x16x32xf32> + + %src0_part = pto.partition_view %src0_view, + offsets = [%c0, %c0, %c0, %c0, %c0], + sizes = [%c1, %c1, %c1, %c16, %c32] + : !pto.tensor_view<1x1x1x16x32xf32> -> !pto.partition_tensor_view<1x1x1x16x32xf32> + %src1_part = pto.partition_view %src1_view, + offsets = [%c0, %c0, %c0, %c0, %c0], + sizes = [%c1, %c1, %c1, %c16, %c8] + : !pto.tensor_view<1x1x1x16x8xf32> -> !pto.partition_tensor_view<1x1x1x16x8xf32> + %dst_part = pto.partition_view %dst_view, + offsets = [%c0, %c0, %c0, %c0, %c0], + sizes = [%c1, %c1, %c1, %c16, %c32] + : !pto.tensor_view<1x1x1x16x32xf32> -> !pto.partition_tensor_view<1x1x1x16x32xf32> + + %src0 = pto.alloc_tile + : !pto.tile_buf + %src1 = pto.alloc_tile + : !pto.tile_buf + %dst = pto.alloc_tile + : !pto.tile_buf + + pto.tload ins(%src0_part : !pto.partition_tensor_view<1x1x1x16x32xf32>) + outs(%src0 : !pto.tile_buf) + pto.tload ins(%src1_part : !pto.partition_tensor_view<1x1x1x16x8xf32>) + outs(%src1 : !pto.tile_buf) + + 
pto.trowexpandmin ins(%src0, %src1 : !pto.tile_buf, !pto.tile_buf) outs(%dst : !pto.tile_buf) {src1Col = 1 : i64, src0eqdst = true} + + pto.tstore ins(%dst : !pto.tile_buf) + outs(%dst_part : !pto.partition_tensor_view<1x1x1x16x32xf32>) + return + } + + // f32_56x128: launchTRowExpandMin + + func.func @TROWEXPANDMIN_i32_16x32(%src0_ptr: !pto.ptr, %src1_ptr: !pto.ptr, %dst_ptr: !pto.ptr) attributes {pto.kernel} { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c8 = arith.constant 8 : index + %c16 = arith.constant 16 : index + %c32 = arith.constant 32 : index + %c128 = arith.constant 128 : index + %c512 = arith.constant 512 : index + + %src0_view = pto.make_tensor_view %src0_ptr, + shape = [%c1, %c1, %c1, %c16, %c32], + strides = [%c512, %c512, %c512, %c32, %c1] + : !pto.tensor_view<1x1x1x16x32xi32> + %src1_view = pto.make_tensor_view %src1_ptr, + shape = [%c1, %c1, %c1, %c16, %c8], + strides = [%c128, %c128, %c128, %c8, %c1] + : !pto.tensor_view<1x1x1x16x8xi32> + %dst_view = pto.make_tensor_view %dst_ptr, + shape = [%c1, %c1, %c1, %c16, %c32], + strides = [%c512, %c512, %c512, %c32, %c1] + : !pto.tensor_view<1x1x1x16x32xi32> + + %src0_part = pto.partition_view %src0_view, + offsets = [%c0, %c0, %c0, %c0, %c0], + sizes = [%c1, %c1, %c1, %c16, %c32] + : !pto.tensor_view<1x1x1x16x32xi32> -> !pto.partition_tensor_view<1x1x1x16x32xi32> + %src1_part = pto.partition_view %src1_view, + offsets = [%c0, %c0, %c0, %c0, %c0], + sizes = [%c1, %c1, %c1, %c16, %c8] + : !pto.tensor_view<1x1x1x16x8xi32> -> !pto.partition_tensor_view<1x1x1x16x8xi32> + %dst_part = pto.partition_view %dst_view, + offsets = [%c0, %c0, %c0, %c0, %c0], + sizes = [%c1, %c1, %c1, %c16, %c32] + : !pto.tensor_view<1x1x1x16x32xi32> -> !pto.partition_tensor_view<1x1x1x16x32xi32> + + %src0 = pto.alloc_tile + : !pto.tile_buf + %src1 = pto.alloc_tile + : !pto.tile_buf + %dst = pto.alloc_tile + : !pto.tile_buf + + pto.tload ins(%src0_part : !pto.partition_tensor_view<1x1x1x16x32xi32>) + 
outs(%src0 : !pto.tile_buf) + pto.tload ins(%src1_part : !pto.partition_tensor_view<1x1x1x16x8xi32>) + outs(%src1 : !pto.tile_buf) + + pto.trowexpandmin ins(%src0, %src1 : !pto.tile_buf, !pto.tile_buf) outs(%dst : !pto.tile_buf) {src1Col = 1 : i64, src0eqdst = true} + + pto.tstore ins(%dst : !pto.tile_buf) + outs(%dst_part : !pto.partition_tensor_view<1x1x1x16x32xi32>) + return + } + + // i16_16x64: launchTRowExpandMin +} diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/trowexpandmul/CMakeLists.txt b/test/tilelang_st/npu/a5/src/st/smoke/testcase/trowexpandmul/CMakeLists.txt new file mode 100644 index 000000000..e85881c67 --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/trowexpandmul/CMakeLists.txt @@ -0,0 +1,9 @@ +# Copyright (c) 2026 Huawei Technologies Co., Ltd. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. + +pto_tilelang_vec_st(trowexpandmul) diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/trowexpandmul/cases.py b/test/tilelang_st/npu/a5/src/st/smoke/testcase/trowexpandmul/cases.py new file mode 100644 index 000000000..c9c6f3742 --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/trowexpandmul/cases.py @@ -0,0 +1,118 @@ +#!/usr/bin/python3 +# Copyright (c) 2026 Huawei Technologies Co., Ltd. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). 
+# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. + +# coding=utf-8 + +"""Single source of truth for trowexpandmul ST test cases. + +trowexpandmul: row-wise broadcast multiplication. +- src1Col=1: only first column of src1 is valid, broadcast to dstCols +- src1Col>1: launchTRowExpandMul2 with different semantics (TBD) +- src1 physical cols = 32/sizeof(dtype) for NPU alignment +- src0eqdst: true means src0 shape equals dst shape +""" + +import numpy as np + +CASES = [ + # launchTRowExpandMul + { + "name": "f32_16x32", + "dtype": np.float32, + "src0_shape": (16, 32), + "src0_valid_shape": (16, 32), + "src1_shape": (16, 8), # physical: 32/sizeof(f32)=8 + "src1_valid_shape": (16, 1), # src1Col=1 + "dst_shape": (16, 32), + "dst_valid_shape": (16, 32), + "eps": 1e-6, + }, + # launchTRowExpandMul + { + "name": "f32_56x128", + "dtype": np.float32, + "src0_shape": (56, 128), + "src0_valid_shape": (56, 128), + "src1_shape": (56, 8), + "src1_valid_shape": (56, 1), + "dst_shape": (56, 128), + "dst_valid_shape": (56, 128), + "eps": 1e-6, + }, + # launchTRowExpandMul + { + "name": "f16_48x64", + "dtype": np.float16, + "src0_shape": (48, 64), + "src0_valid_shape": (48, 64), + "src1_shape": (48, 16), # physical: 32/sizeof(f16)=16 + "src1_valid_shape": (48, 1), + "dst_shape": (48, 64), + "dst_valid_shape": (48, 64), + "eps": 1e-3, + }, + # launchTRowExpandMul + { + "name": "f16_16x128", + "dtype": np.float16, + "src0_shape": (16, 128), + "src0_valid_shape": (16, 128), + "src1_shape": (16, 16), + "src1_valid_shape": (16, 1), + "dst_shape": (16, 128), + "dst_valid_shape": (16, 128), + "eps": 1e-3, + }, + # launchTRowExpandMul + { 
+ "name": "f16_32x64_noeq", + "dtype": np.float16, + "src0_shape": (32, 64), # src0eqdst=false + "src0_valid_shape": (32, 64), + "src1_shape": (32, 16), + "src1_valid_shape": (32, 1), + "dst_shape": (32, 64), + "dst_valid_shape": (32, 64), + "eps": 1e-3, + }, + # launchTRowExpandMul + { + "name": "i32_16x32", + "dtype": np.int32, + "src0_shape": (16, 32), + "src0_valid_shape": (16, 32), + "src1_shape": (16, 8), # physical: 32/sizeof(i32)=8 + "src1_valid_shape": (16, 1), + "dst_shape": (16, 32), + "dst_valid_shape": (16, 32), + "eps": 0, + }, + # launchTRowExpandMul + { + "name": "i16_16x64", + "dtype": np.int16, + "src0_shape": (16, 64), + "src0_valid_shape": (16, 64), + "src1_shape": (16, 16), # physical: 32/sizeof(i16)=16 + "src1_valid_shape": (16, 1), + "dst_shape": (16, 64), + "dst_valid_shape": (16, 64), + "eps": 0, + }, + # Note: launchTRowExpandMul2 with src1Col>1 has different semantics - TBD + # - float, 24, 64, 24, 8, true (src1Col=8) + # - float, 20, 64, 20, 8, false (src1Col=8) +] + +_SMOKE_CASE_NAMES = ['f32_16x32', 'i32_16x32'] +_SMOKE_CASE_NAME_SET = set(_SMOKE_CASE_NAMES) +_missing = [name for name in _SMOKE_CASE_NAMES if name not in {case["name"] for case in CASES}] +if _missing: + raise RuntimeError("unknown smoke case(s): " + ", ".join(_missing)) +CASES = [case for case in CASES if case["name"] in _SMOKE_CASE_NAME_SET] diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/trowexpandmul/compare.py b/test/tilelang_st/npu/a5/src/st/smoke/testcase/trowexpandmul/compare.py new file mode 100644 index 000000000..f0098f607 --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/trowexpandmul/compare.py @@ -0,0 +1,61 @@ +#!/usr/bin/python3 +# Copyright (c) 2026 Huawei Technologies Co., Ltd. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. 
You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. + +# coding=utf-8 + +"""Compare golden and output for trowexpandmul ST test cases.""" + +import os +import sys +import numpy as np + +from cases import CASES +from st_common import result_cmp, style_fail, style_pass + +# Inline validation for multi-input format (trowexpandmul uses src0/src1/dst) +REQUIRED_KEYS = {"name", "dtype", "src0_shape", "src0_valid_shape", "src1_shape", + "src1_valid_shape", "dst_shape", "dst_valid_shape"} +for i, case in enumerate(CASES): + missing = REQUIRED_KEYS - case.keys() + if missing: + raise ValueError(f"cases[{i}] ({case.get('name', '?')}) missing keys: {missing}") + + +def main(): + case_filter = sys.argv[1] if len(sys.argv) > 1 else None + + all_passed = True + for case in CASES: + if case_filter is not None and case["name"] != case_filter: + continue + + case_dir = case["name"] + dst_shape = case["dst_shape"] + dst_valid_shape = case["dst_valid_shape"] + dtype = case["dtype"] + + vr, vc = dst_valid_shape + + golden = np.fromfile(os.path.join(case_dir, "golden.bin"), dtype=dtype).reshape(dst_shape) + output = np.fromfile(os.path.join(case_dir, "output.bin"), dtype=dtype).reshape(dst_shape) + + ok = result_cmp(golden[:vr, :vc], output[:vr, :vc], case["eps"]) + if ok: + print(style_pass(f"[INFO] {case['name']}: compare passed")) + else: + print(style_fail(f"[ERROR] {case['name']}: compare failed")) + all_passed = False + + if not all_passed: + sys.exit(2) + print(style_pass("[INFO] all cases passed")) + + +if __name__ == "__main__": + main() diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/trowexpandmul/gen_data.py 
b/test/tilelang_st/npu/a5/src/st/smoke/testcase/trowexpandmul/gen_data.py new file mode 100644 index 000000000..7cd58f7d1 --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/trowexpandmul/gen_data.py @@ -0,0 +1,53 @@ +#!/usr/bin/python3 +# Copyright (c) 2026 Huawei Technologies Co., Ltd. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. + +# coding=utf-8 + +"""Generate input and golden data for trowexpandmul ST test cases. + +trowexpandmul: dst = src0 * broadcast(src1) +""" + +import numpy as np +from cases import CASES +from st_common import setup_case_rng, save_case_data + +# Inline validation for multi-input format (trowexpandmul uses src0/src1/dst) +REQUIRED_KEYS = {"name", "dtype", "src0_shape", "src0_valid_shape", "src1_shape", + "src1_valid_shape", "dst_shape", "dst_valid_shape"} +for i, case in enumerate(CASES): + missing = REQUIRED_KEYS - case.keys() + if missing: + raise ValueError(f"cases[{i}] ({case.get('name', '?')}) missing keys: {missing}") + +for case in CASES: + setup_case_rng(case) + + dtype = case["dtype"] + src0_shape = case["src0_shape"] + src0_valid_shape = case["src0_valid_shape"] + src1_shape = case["src1_shape"] + src1_valid_shape = case["src1_valid_shape"] + dst_shape = case["dst_shape"] + dst_valid_shape = case["dst_valid_shape"] + + input1 = np.random.randint(1, 10, size=src0_shape).astype(dtype) + input2 = np.random.randint(1, 10, size=src1_shape).astype(dtype) + + golden = np.zeros(dst_shape, 
dtype=dtype) + dst_vr, dst_vc = dst_valid_shape + src0_vr, src0_vc = src0_valid_shape + src1_vr = src1_valid_shape[0] + + golden[:dst_vr, :dst_vc] = ( + input1[:src0_vr, :src0_vc] * input2[:src1_vr, 0:1] + ).astype(dtype, copy=False) + + save_case_data(case["name"], {"input1": input1, "input2": input2, "golden": golden}) + print(f"[INFO] gen_data: {case['name']} dtype={dtype.__name__}") diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/trowexpandmul/launch.cpp b/test/tilelang_st/npu/a5/src/st/smoke/testcase/trowexpandmul/launch.cpp new file mode 100644 index 000000000..b11e6e40f --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/trowexpandmul/launch.cpp @@ -0,0 +1,27 @@ +// Copyright (c) 2026 Huawei Technologies Co., Ltd. +// This program is free software, you can redistribute it and/or modify it under the terms and conditions of +// CANN Open Software License Agreement Version 2.0 (the "License"). +// Please refer to the License for details. You may not use this file except in compliance with the License. +// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +// See LICENSE in the root of the software repository for the full text of the License. 
+ +#include + +#ifndef AICORE +#define AICORE [aicore] +#endif + +// f32 kernels + +extern "C" __global__ AICORE void TROWEXPANDMUL_f32_16x32(__gm__ float *src0, __gm__ float *src1, __gm__ float *dst); +extern "C" __global__ AICORE void TROWEXPANDMUL_i32_16x32(__gm__ int32_t *src0, __gm__ int32_t *src1, __gm__ int32_t *dst); + +void LaunchTROWEXPANDMUL_f32_16x32(float *src0, float *src1, float *dst, void *stream) { + TROWEXPANDMUL_f32_16x32<<<1, nullptr, stream>>>((__gm__ float *)src0, (__gm__ float *)src1, (__gm__ float *)dst); +} + + +void LaunchTROWEXPANDMUL_i32_16x32(void *src0, void *src1, void *dst, void *stream) { + TROWEXPANDMUL_i32_16x32<<<1, nullptr, stream>>>((__gm__ int32_t *)src0, (__gm__ int32_t *)src1, (__gm__ int32_t *)dst); +} diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/trowexpandmul/main.cpp b/test/tilelang_st/npu/a5/src/st/smoke/testcase/trowexpandmul/main.cpp new file mode 100644 index 000000000..96173a26f --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/trowexpandmul/main.cpp @@ -0,0 +1,123 @@ +// Copyright (c) 2026 Huawei Technologies Co., Ltd. +// This program is free software, you can redistribute it and/or modify it under the terms and conditions of +// CANN Open Software License Agreement Version 2.0 (the "License"). +// Please refer to the License for details. You may not use this file except in compliance with the License. +// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +// See LICENSE in the root of the software repository for the full text of the License. + +// Host driver for TileLang trowexpandmul ST — row-wise broadcast multiplication. 
+// Supports f32, f16, i32, i16 + +#include "acl/acl.h" +#include "test_common.h" +#include +#include +#include +#include +#include + +using namespace PtoTestCommon; + +// f32 kernels +void LaunchTROWEXPANDMUL_f32_16x32(float *src0, float *src1, float *dst, void *stream); +void LaunchTROWEXPANDMUL_f32_56x128(float *src0, float *src1, float *dst, void *stream); +// f16 kernels (use void* for aclFloat16) +void LaunchTROWEXPANDMUL_f16_16x128(void *src0, void *src1, void *dst, void *stream); +// i32 kernels +void LaunchTROWEXPANDMUL_i32_16x32(void *src0, void *src1, void *dst, void *stream); +// i16 kernels + +// Note: launchTRowExpandMul2 with src1Col>1 has different semantics - TBD + +using LaunchFn = void (*)(void *, void *, void *, void *); + +struct TestCase { + const char *name; + LaunchFn launch; + size_t src0Rows, src0Cols, src1Rows, src1Cols, dstRows, dstCols; + size_t dstValidRows, dstValidCols; + size_t elemSize; +}; + +static const TestCase kCases[] = { + // f32 cases +{"f32_16x32", (LaunchFn)LaunchTROWEXPANDMUL_f32_16x32, 16, 32, 16, 8, 16, 32, 16, 32, sizeof(float)}, +{"i32_16x32", LaunchTROWEXPANDMUL_i32_16x32, 16, 32, 16, 8, 16, 32, 16, 32, sizeof(int32_t)}, +}; +static constexpr size_t kNumCases = sizeof(kCases) / sizeof(kCases[0]); + +static int RunCase(const TestCase &tc, int deviceId, aclrtStream stream) { + int rc = 0; + size_t src0FileSize = tc.src0Rows * tc.src0Cols * tc.elemSize; + size_t src1FileSize = tc.src1Rows * tc.src1Cols * tc.elemSize; + const size_t dstFileSize = tc.dstRows * tc.dstCols * tc.elemSize; + + std::printf("[INFO] === case: %s (src0=%zux%zu, src1=%zux%zu, dst=%zux%zu) ===\n", + tc.name, tc.src0Rows, tc.src0Cols, tc.src1Rows, tc.src1Cols, tc.dstRows, tc.dstCols); + + std::string caseDir = std::string("./") + tc.name; + + void *src0Host = nullptr, *src1Host = nullptr, *dstHost = nullptr; + void *src0Device = nullptr, *src1Device = nullptr, *dstDevice = nullptr; + + aclrtMallocHost((void **)(&src0Host), src0FileSize); + 
aclrtMallocHost((void **)(&src1Host), src1FileSize); + aclrtMallocHost((void **)(&dstHost), dstFileSize); + aclrtMalloc((void **)&src0Device, src0FileSize, ACL_MEM_MALLOC_HUGE_FIRST); + aclrtMalloc((void **)&src1Device, src1FileSize, ACL_MEM_MALLOC_HUGE_FIRST); + aclrtMalloc((void **)&dstDevice, dstFileSize, ACL_MEM_MALLOC_HUGE_FIRST); + + if (!ReadFile((caseDir + "/input1.bin").c_str(), src0FileSize, src0Host, src0FileSize)) { + std::fprintf(stderr, "[ERROR] failed to read %s/input1.bin\n", caseDir.c_str()); + rc = 1; + } + if (rc == 0 && !ReadFile((caseDir + "/input2.bin").c_str(), src1FileSize, src1Host, src1FileSize)) { + std::fprintf(stderr, "[ERROR] failed to read %s/input2.bin\n", caseDir.c_str()); + rc = 1; + } + + if (rc == 0) { + aclrtMemcpy(src0Device, src0FileSize, src0Host, src0FileSize, ACL_MEMCPY_HOST_TO_DEVICE); + aclrtMemcpy(src1Device, src1FileSize, src1Host, src1FileSize, ACL_MEMCPY_HOST_TO_DEVICE); + tc.launch(src0Device, src1Device, dstDevice, stream); + aclrtSynchronizeStream(stream); + aclrtMemcpy(dstHost, dstFileSize, dstDevice, dstFileSize, ACL_MEMCPY_DEVICE_TO_HOST); + } + + if (rc == 0 && !WriteFile((caseDir + "/output.bin").c_str(), dstHost, dstFileSize)) { + std::fprintf(stderr, "[ERROR] failed to write %s/output.bin\n", caseDir.c_str()); + rc = 1; + } + + if (src0Device) aclrtFree(src0Device); + if (src1Device) aclrtFree(src1Device); + if (dstDevice) aclrtFree(dstDevice); + if (src0Host) aclrtFreeHost(src0Host); + if (src1Host) aclrtFreeHost(src1Host); + if (dstHost) aclrtFreeHost(dstHost); + + if (rc == 0) std::printf("[INFO] case %s done\n", tc.name); + return rc; +} + +int main(int argc, char *argv[]) { + const char *caseFilter = (argc > 1) ? 
argv[1] : nullptr; + int rc = 0, deviceId = 0; + aclrtStream stream = nullptr; + + aclInit(nullptr); + if (const char *envDevice = std::getenv("ACL_DEVICE_ID")) deviceId = std::atoi(envDevice); + aclrtSetDevice(deviceId); + aclrtCreateStream(&stream); + + for (size_t i = 0; i < kNumCases; ++i) { + if (caseFilter && std::strcmp(kCases[i].name, caseFilter) != 0) continue; + if (RunCase(kCases[i], deviceId, stream) != 0) { rc = 1; break; } + } + + if (stream) aclrtDestroyStream(stream); + aclrtResetDevice(deviceId); + aclFinalize(); + return rc; +} diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/trowexpandmul/trowexpandmul.pto b/test/tilelang_st/npu/a5/src/st/smoke/testcase/trowexpandmul/trowexpandmul.pto new file mode 100644 index 000000000..08eb8ac62 --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/trowexpandmul/trowexpandmul.pto @@ -0,0 +1,72 @@ +// Copyright (c) 2026 Huawei Technologies Co., Ltd. +// This program is free software, you can redistribute it and/or modify it under the terms and conditions of +// CANN Open Software License Agreement Version 2.0 (the "License"). +// Please refer to the License for details. You can not use this file except in compliance with the License. +// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +// See LICENSE in the root of the software repository for the full text of the License. + +// TileLang ST kernels for pto.trowexpandmul: row-wise broadcast multiplication. 
+ +module attributes {pto.target_arch = "a5", pto.kernel_kind = #pto.kernel_kind} { + // f32_16x32: dstRow=16, dstCol=32, src1Row=16, src1Col=1, src0eqdst=true + func.func @TROWEXPANDMUL_f32_16x32(%src0_ptr: !pto.ptr, %src1_ptr: !pto.ptr, %dst_ptr: !pto.ptr) attributes {pto.kernel} { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c8 = arith.constant 8 : index + %c16 = arith.constant 16 : index + %c32 = arith.constant 32 : index + %c128 = arith.constant 128 : index + %c512 = arith.constant 512 : index + + %src0_view = pto.make_tensor_view %src0_ptr, shape = [%c1, %c1, %c1, %c16, %c32], strides = [%c512, %c512, %c512, %c32, %c1] : !pto.tensor_view<1x1x1x16x32xf32> + %src1_view = pto.make_tensor_view %src1_ptr, shape = [%c1, %c1, %c1, %c16, %c8], strides = [%c128, %c128, %c128, %c8, %c1] : !pto.tensor_view<1x1x1x16x8xf32> + %dst_view = pto.make_tensor_view %dst_ptr, shape = [%c1, %c1, %c1, %c16, %c32], strides = [%c512, %c512, %c512, %c32, %c1] : !pto.tensor_view<1x1x1x16x32xf32> + + %src0_part = pto.partition_view %src0_view, offsets = [%c0, %c0, %c0, %c0, %c0], sizes = [%c1, %c1, %c1, %c16, %c32] : !pto.tensor_view<1x1x1x16x32xf32> -> !pto.partition_tensor_view<1x1x1x16x32xf32> + %src1_part = pto.partition_view %src1_view, offsets = [%c0, %c0, %c0, %c0, %c0], sizes = [%c1, %c1, %c1, %c16, %c8] : !pto.tensor_view<1x1x1x16x8xf32> -> !pto.partition_tensor_view<1x1x1x16x8xf32> + %dst_part = pto.partition_view %dst_view, offsets = [%c0, %c0, %c0, %c0, %c0], sizes = [%c1, %c1, %c1, %c16, %c32] : !pto.tensor_view<1x1x1x16x32xf32> -> !pto.partition_tensor_view<1x1x1x16x32xf32> + + %src0 = pto.alloc_tile : !pto.tile_buf + %src1 = pto.alloc_tile : !pto.tile_buf + %dst = pto.alloc_tile : !pto.tile_buf + + pto.tload ins(%src0_part : !pto.partition_tensor_view<1x1x1x16x32xf32>) outs(%src0 : !pto.tile_buf) + pto.tload ins(%src1_part : !pto.partition_tensor_view<1x1x1x16x8xf32>) outs(%src1 : !pto.tile_buf) + pto.trowexpandmul ins(%src0, %src1 : !pto.tile_buf, 
!pto.tile_buf) outs(%dst : !pto.tile_buf) {src1Col = 1 : i64, src0eqdst = true, highPrecision = false} + pto.tstore ins(%dst : !pto.tile_buf) outs(%dst_part : !pto.partition_tensor_view<1x1x1x16x32xf32>) + return + } + + // f32_56x128: dstRow=56, dstCol=128, src1Row=56, src1Col=1, src0eqdst=true + + func.func @TROWEXPANDMUL_i32_16x32(%src0_ptr: !pto.ptr, %src1_ptr: !pto.ptr, %dst_ptr: !pto.ptr) attributes {pto.kernel} { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c8 = arith.constant 8 : index + %c16 = arith.constant 16 : index + %c32 = arith.constant 32 : index + %c128 = arith.constant 128 : index + %c512 = arith.constant 512 : index + + %src0_view = pto.make_tensor_view %src0_ptr, shape = [%c1, %c1, %c1, %c16, %c32], strides = [%c512, %c512, %c512, %c32, %c1] : !pto.tensor_view<1x1x1x16x32xi32> + %src1_view = pto.make_tensor_view %src1_ptr, shape = [%c1, %c1, %c1, %c16, %c8], strides = [%c128, %c128, %c128, %c8, %c1] : !pto.tensor_view<1x1x1x16x8xi32> + %dst_view = pto.make_tensor_view %dst_ptr, shape = [%c1, %c1, %c1, %c16, %c32], strides = [%c512, %c512, %c512, %c32, %c1] : !pto.tensor_view<1x1x1x16x32xi32> + + %src0_part = pto.partition_view %src0_view, offsets = [%c0, %c0, %c0, %c0, %c0], sizes = [%c1, %c1, %c1, %c16, %c32] : !pto.tensor_view<1x1x1x16x32xi32> -> !pto.partition_tensor_view<1x1x1x16x32xi32> + %src1_part = pto.partition_view %src1_view, offsets = [%c0, %c0, %c0, %c0, %c0], sizes = [%c1, %c1, %c1, %c16, %c8] : !pto.tensor_view<1x1x1x16x8xi32> -> !pto.partition_tensor_view<1x1x1x16x8xi32> + %dst_part = pto.partition_view %dst_view, offsets = [%c0, %c0, %c0, %c0, %c0], sizes = [%c1, %c1, %c1, %c16, %c32] : !pto.tensor_view<1x1x1x16x32xi32> -> !pto.partition_tensor_view<1x1x1x16x32xi32> + + %src0 = pto.alloc_tile : !pto.tile_buf + %src1 = pto.alloc_tile : !pto.tile_buf + %dst = pto.alloc_tile : !pto.tile_buf + + pto.tload ins(%src0_part : !pto.partition_tensor_view<1x1x1x16x32xi32>) outs(%src0 : !pto.tile_buf) + pto.tload 
ins(%src1_part : !pto.partition_tensor_view<1x1x1x16x8xi32>) outs(%src1 : !pto.tile_buf) + pto.trowexpandmul ins(%src0, %src1 : !pto.tile_buf, !pto.tile_buf) outs(%dst : !pto.tile_buf) {src1Col = 1 : i64, src0eqdst = true, highPrecision = false} + pto.tstore ins(%dst : !pto.tile_buf) outs(%dst_part : !pto.partition_tensor_view<1x1x1x16x32xi32>) + return + } + + // i16_16x64: dstRow=16, dstCol=64, src1Row=16, src1Col=1, src0eqdst=true +} diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/trowexpandsub/CMakeLists.txt b/test/tilelang_st/npu/a5/src/st/smoke/testcase/trowexpandsub/CMakeLists.txt new file mode 100644 index 000000000..2accb3cc7 --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/trowexpandsub/CMakeLists.txt @@ -0,0 +1,9 @@ +# Copyright (c) 2026 Huawei Technologies Co., Ltd. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. + +pto_tilelang_vec_st(trowexpandsub) diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/trowexpandsub/cases.py b/test/tilelang_st/npu/a5/src/st/smoke/testcase/trowexpandsub/cases.py new file mode 100644 index 000000000..33638fc41 --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/trowexpandsub/cases.py @@ -0,0 +1,118 @@ +#!/usr/bin/python3 +# Copyright (c) 2026 Huawei Technologies Co., Ltd. 
def _make_case(name, dtype, rows, cols, src1_cols, eps):
    """Build one case dict for a (rows, cols) tile.

    src0 and dst share the full (rows, cols) shape and are fully valid.
    src1 is stored as (rows, src1_cols) - the physical width, 32/sizeof(dtype) -
    but only its first column is valid (src1Col=1).
    """
    tile = (rows, cols)
    return {
        "name": name,
        "dtype": dtype,
        "src0_shape": tile,
        "src0_valid_shape": tile,
        "src1_shape": (rows, src1_cols),
        "src1_valid_shape": (rows, 1),
        "dst_shape": tile,
        "dst_valid_shape": tile,
        "eps": eps,
    }


# All rows are launchTRowExpandSub cases:
# (name, dtype, rows, cols, physical src1 cols, eps)
_CASE_ROWS = (
    ("f32_8x128", np.float32, 8, 128, 8, 1e-6),         # physical: 32/sizeof(f32)=8
    ("f32_24x32", np.float32, 24, 32, 8, 1e-6),
    ("f16_16x256", np.float16, 16, 256, 16, 1e-3),      # physical: 32/sizeof(f16)=16
    ("f16_32x64", np.float16, 32, 64, 16, 1e-3),
    ("f32_16x128_noeq", np.float32, 16, 128, 8, 1e-6),  # src0eqdst=false
    ("i32_16x32", np.int32, 16, 32, 8, 0),              # physical: 32/sizeof(i32)=8
    ("i16_16x64", np.int16, 16, 64, 16, 0),             # physical: 32/sizeof(i16)=16
    # Note: launchTRowExpandSub2 with src1Col>1 has different semantics - TBD
    #   - float, 24, 64, 24, 8, true (src1Col=8)
    #   - aclFloat16, 16, 64, 16, 16, false (src1Col=16)
)

CASES = [_make_case(*row) for row in _CASE_ROWS]

# Smoke subset: only these names survive; an unknown name is a hard error.
_SMOKE_CASE_NAMES = ['f32_24x32', 'i32_16x32']
_SMOKE_CASE_NAME_SET = set(_SMOKE_CASE_NAMES)
_known_names = {entry["name"] for entry in CASES}
_missing = [wanted for wanted in _SMOKE_CASE_NAMES if wanted not in _known_names]
if _missing:
    raise RuntimeError("unknown smoke case(s): " + ", ".join(_missing))
CASES = [entry for entry in CASES if entry["name"] in _SMOKE_CASE_NAME_SET]
# Inline validation for multi-input format (trowexpandsub uses src0/src1/dst).
# "eps" is included because main() reads it for every compared case; checking it
# here fails fast instead of raising KeyError after earlier cases already ran.
REQUIRED_KEYS = {"name", "dtype", "src0_shape", "src0_valid_shape", "src1_shape",
                 "src1_valid_shape", "dst_shape", "dst_valid_shape", "eps"}
for i, case in enumerate(CASES):
    missing = REQUIRED_KEYS - case.keys()
    if missing:
        raise ValueError(f"cases[{i}] ({case.get('name', '?')}) missing keys: {missing}")


def _load_matrix(path, dtype, shape):
    """Return the binary file at *path* as an array of *shape*.

    Returns None (after printing an error) when the file cannot be read or its
    element count does not match *shape*, so one broken case does not abort the
    comparison of the remaining cases.
    """
    try:
        return np.fromfile(path, dtype=dtype).reshape(shape)
    except (OSError, ValueError) as err:
        print(style_fail(f"[ERROR] cannot load {path}: {err}"))
        return None


def main():
    """Compare golden.bin against output.bin for each case.

    ``sys.argv[1]``, when present, restricts the run to the case with that name.
    Only the valid region (``dst_valid_shape``) is compared. Exits with status 2
    if any compared case fails or its data files cannot be loaded.
    """
    case_filter = sys.argv[1] if len(sys.argv) > 1 else None

    all_passed = True
    for case in CASES:
        if case_filter is not None and case["name"] != case_filter:
            continue

        case_dir = case["name"]  # data files live in a directory named after the case
        dst_shape = case["dst_shape"]
        dtype = case["dtype"]
        vr, vc = case["dst_valid_shape"]

        golden = _load_matrix(os.path.join(case_dir, "golden.bin"), dtype, dst_shape)
        output = _load_matrix(os.path.join(case_dir, "output.bin"), dtype, dst_shape)
        if golden is None or output is None:
            print(style_fail(f"[ERROR] {case['name']}: compare failed"))
            all_passed = False
            continue

        ok = result_cmp(golden[:vr, :vc], output[:vr, :vc], case["eps"])
        if ok:
            print(style_pass(f"[INFO] {case['name']}: compare passed"))
        else:
            print(style_fail(f"[ERROR] {case['name']}: compare failed"))
            all_passed = False

    if not all_passed:
        sys.exit(2)
    print(style_pass("[INFO] all cases passed"))


if __name__ == "__main__":
    main()
# Inline validation for multi-input format (trowexpandsub uses src0/src1/dst)
REQUIRED_KEYS = {"name", "dtype", "src0_shape", "src0_valid_shape", "src1_shape",
                 "src1_valid_shape", "dst_shape", "dst_valid_shape"}
for idx, spec in enumerate(CASES):
    absent = REQUIRED_KEYS - spec.keys()
    if absent:
        raise ValueError(f"cases[{idx}] ({spec.get('name', '?')}) missing keys: {absent}")

# One pass per case: seed, draw both inputs, compute the golden result, save.
for spec in CASES:
    setup_case_rng(spec)

    elem_type = spec["dtype"]
    shape0 = spec["src0_shape"]
    shape1 = spec["src1_shape"]
    shape_out = spec["dst_shape"]

    # Draw order matters for reproducibility: input1 first, then input2.
    lhs = np.random.randint(1, 10, size=shape0).astype(elem_type)
    rhs = np.random.randint(1, 10, size=shape1).astype(elem_type)

    rows_out, cols_out = spec["dst_valid_shape"]
    rows0, cols0 = spec["src0_valid_shape"]
    rows1 = spec["src1_valid_shape"][0]

    # dst = src0 - src1[:, 0] broadcast across the columns; everything outside
    # the valid dst region stays zero.
    difference = lhs[:rows0, :cols0] - rhs[:rows1, 0:1]
    expected = np.zeros(shape_out, dtype=elem_type)
    expected[:rows_out, :cols_out] = difference.astype(elem_type, copy=False)

    save_case_data(spec["name"], {"input1": lhs, "input2": rhs, "golden": expected})
    print(f"[INFO] gen_data: {spec['name']} src0={shape0} src1={shape1} dst={shape_out} dtype={elem_type.__name__}")
#ifndef AICORE
#define AICORE [aicore]
#endif

// Device kernel entry points (one f32, one i32). Only declared here; their
// definitions are not in this file.
// NOTE(review): presumably generated from trowexpandsub.pto, whose kernels carry
// the same names - confirm against the build rules.

extern "C" __global__ AICORE void TROWEXPANDSUB_f32_24x32(__gm__ float *src0, __gm__ float *src1, __gm__ float *dst);
extern "C" __global__ AICORE void TROWEXPANDSUB_i32_16x32(__gm__ int32_t *src0, __gm__ int32_t *src1, __gm__ int32_t *dst);

// Host-side wrapper: launches TROWEXPANDSUB_f32_24x32 on `stream`, casting the
// three buffer pointers to __gm__ float pointers. The launch itself returns
// nothing, so errors can only be observed by the caller on the stream.
// NOTE(review): the first launch argument (1) looks like the block count and the
// second (nullptr) an unused control argument - confirm with the kernel-call docs.
void LaunchTROWEXPANDSUB_f32_24x32(float *src0, float *src1, float *dst, void *stream) {
    TROWEXPANDSUB_f32_24x32<<<1, nullptr, stream>>>((__gm__ float *)src0, (__gm__ float *)src1, (__gm__ float *)dst);
}


// Host-side wrapper for the i32 kernel. Takes untyped pointers (unlike the f32
// wrapper above) and casts them to __gm__ int32_t pointers before launching.
void LaunchTROWEXPANDSUB_i32_16x32(void *src0, void *src1, void *dst, void *stream) {
    TROWEXPANDSUB_i32_16x32<<<1, nullptr, stream>>>((__gm__ int32_t *)src0, (__gm__ int32_t *)src1, (__gm__ int32_t *)dst);
}
+// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +// See LICENSE in the root of the software repository for the full text of the License. + +// Host driver for TileLang trowexpandsub ST — row-wise broadcast subtraction. +// Supports f32, f16, i32, i16 + +#include "acl/acl.h" +#include "test_common.h" +#include +#include +#include +#include +#include + +using namespace PtoTestCommon; + +// f32 kernels +void LaunchTROWEXPANDSUB_f32_24x32(float *src0, float *src1, float *dst, void *stream); +// f16 kernels (use void* for aclFloat16) +void LaunchTROWEXPANDSUB_f16_32x64(void *src0, void *src1, void *dst, void *stream); +// i32 kernels +void LaunchTROWEXPANDSUB_i32_16x32(void *src0, void *src1, void *dst, void *stream); +// i16 kernels + +// Note: launchTRowExpandSub2 with src1Col>1 has different semantics - TBD + +using LaunchFn = void (*)(void *, void *, void *, void *); + +struct TestCase { + const char *name; + LaunchFn launch; + size_t src0Rows, src0Cols, src1Rows, src1Cols, dstRows, dstCols; + size_t dstValidRows, dstValidCols; + size_t elemSize; +}; + +static const TestCase kCases[] = { + // f32 cases +{"f32_24x32", (LaunchFn)LaunchTROWEXPANDSUB_f32_24x32, 24, 32, 24, 8, 24, 32, 24, 32, sizeof(float)}, +{"i32_16x32", LaunchTROWEXPANDSUB_i32_16x32, 16, 32, 16, 8, 16, 32, 16, 32, sizeof(int32_t)}, +}; +static constexpr size_t kNumCases = sizeof(kCases) / sizeof(kCases[0]); + +static int RunCase(const TestCase &tc, int deviceId, aclrtStream stream) { + int rc = 0; + size_t src0FileSize = tc.src0Rows * tc.src0Cols * tc.elemSize; + size_t src1FileSize = tc.src1Rows * tc.src1Cols * tc.elemSize; + const size_t dstFileSize = tc.dstRows * tc.dstCols * tc.elemSize; + + std::printf("[INFO] === case: %s (src0=%zux%zu, src1=%zux%zu, dst=%zux%zu) ===\n", + tc.name, tc.src0Rows, tc.src0Cols, tc.src1Rows, 
tc.src1Cols, tc.dstRows, tc.dstCols); + + std::string caseDir = std::string("./") + tc.name; + + void *src0Host = nullptr, *src1Host = nullptr, *dstHost = nullptr; + void *src0Device = nullptr, *src1Device = nullptr, *dstDevice = nullptr; + + aclrtMallocHost((void **)(&src0Host), src0FileSize); + aclrtMallocHost((void **)(&src1Host), src1FileSize); + aclrtMallocHost((void **)(&dstHost), dstFileSize); + aclrtMalloc((void **)&src0Device, src0FileSize, ACL_MEM_MALLOC_HUGE_FIRST); + aclrtMalloc((void **)&src1Device, src1FileSize, ACL_MEM_MALLOC_HUGE_FIRST); + aclrtMalloc((void **)&dstDevice, dstFileSize, ACL_MEM_MALLOC_HUGE_FIRST); + + if (!ReadFile((caseDir + "/input1.bin").c_str(), src0FileSize, src0Host, src0FileSize)) { + std::fprintf(stderr, "[ERROR] failed to read %s/input1.bin\n", caseDir.c_str()); + rc = 1; + } + if (rc == 0 && !ReadFile((caseDir + "/input2.bin").c_str(), src1FileSize, src1Host, src1FileSize)) { + std::fprintf(stderr, "[ERROR] failed to read %s/input2.bin\n", caseDir.c_str()); + rc = 1; + } + + if (rc == 0) { + aclrtMemcpy(src0Device, src0FileSize, src0Host, src0FileSize, ACL_MEMCPY_HOST_TO_DEVICE); + aclrtMemcpy(src1Device, src1FileSize, src1Host, src1FileSize, ACL_MEMCPY_HOST_TO_DEVICE); + tc.launch(src0Device, src1Device, dstDevice, stream); + aclrtSynchronizeStream(stream); + aclrtMemcpy(dstHost, dstFileSize, dstDevice, dstFileSize, ACL_MEMCPY_DEVICE_TO_HOST); + } + + if (rc == 0 && !WriteFile((caseDir + "/output.bin").c_str(), dstHost, dstFileSize)) { + std::fprintf(stderr, "[ERROR] failed to write %s/output.bin\n", caseDir.c_str()); + rc = 1; + } + + if (src0Device) aclrtFree(src0Device); + if (src1Device) aclrtFree(src1Device); + if (dstDevice) aclrtFree(dstDevice); + if (src0Host) aclrtFreeHost(src0Host); + if (src1Host) aclrtFreeHost(src1Host); + if (dstHost) aclrtFreeHost(dstHost); + + if (rc == 0) std::printf("[INFO] case %s done\n", tc.name); + return rc; +} + +int main(int argc, char *argv[]) { + const char *caseFilter = (argc > 1) 
? argv[1] : nullptr; + int rc = 0, deviceId = 0; + aclrtStream stream = nullptr; + + aclInit(nullptr); + if (const char *envDevice = std::getenv("ACL_DEVICE_ID")) deviceId = std::atoi(envDevice); + aclrtSetDevice(deviceId); + aclrtCreateStream(&stream); + + for (size_t i = 0; i < kNumCases; ++i) { + if (caseFilter && std::strcmp(kCases[i].name, caseFilter) != 0) continue; + if (RunCase(kCases[i], deviceId, stream) != 0) { rc = 1; break; } + } + + if (stream) aclrtDestroyStream(stream); + aclrtResetDevice(deviceId); + aclFinalize(); + return rc; +} diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/trowexpandsub/trowexpandsub.pto b/test/tilelang_st/npu/a5/src/st/smoke/testcase/trowexpandsub/trowexpandsub.pto new file mode 100644 index 000000000..5b9078fa2 --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/trowexpandsub/trowexpandsub.pto @@ -0,0 +1,72 @@ +// Copyright (c) 2026 Huawei Technologies Co., Ltd. +// This program is free software, you can redistribute it and/or modify it under the terms and conditions of +// CANN Open Software License Agreement Version 2.0 (the "License"). +// Please refer to the License for details. You can not use this file except in compliance with the License. +// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +// See LICENSE in the root of the software repository for the full text of the License. + +// TileLang ST kernels for pto.trowexpandsub: row-wise broadcast subtraction. 
+ +module attributes {pto.target_arch = "a5", pto.kernel_kind = #pto.kernel_kind} { + // f32_8x128: dstRow=8, dstCol=128, src1Row=8, src1Col=1, src0eqdst=true + func.func @TROWEXPANDSUB_f32_24x32(%src0_ptr: !pto.ptr, %src1_ptr: !pto.ptr, %dst_ptr: !pto.ptr) attributes {pto.kernel} { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c8 = arith.constant 8 : index + %c24 = arith.constant 24 : index + %c32 = arith.constant 32 : index + %c192 = arith.constant 192 : index + %c768 = arith.constant 768 : index + + %src0_view = pto.make_tensor_view %src0_ptr, shape = [%c1, %c1, %c1, %c24, %c32], strides = [%c768, %c768, %c768, %c32, %c1] : !pto.tensor_view<1x1x1x24x32xf32> + %src1_view = pto.make_tensor_view %src1_ptr, shape = [%c1, %c1, %c1, %c24, %c8], strides = [%c192, %c192, %c192, %c8, %c1] : !pto.tensor_view<1x1x1x24x8xf32> + %dst_view = pto.make_tensor_view %dst_ptr, shape = [%c1, %c1, %c1, %c24, %c32], strides = [%c768, %c768, %c768, %c32, %c1] : !pto.tensor_view<1x1x1x24x32xf32> + + %src0_part = pto.partition_view %src0_view, offsets = [%c0, %c0, %c0, %c0, %c0], sizes = [%c1, %c1, %c1, %c24, %c32] : !pto.tensor_view<1x1x1x24x32xf32> -> !pto.partition_tensor_view<1x1x1x24x32xf32> + %src1_part = pto.partition_view %src1_view, offsets = [%c0, %c0, %c0, %c0, %c0], sizes = [%c1, %c1, %c1, %c24, %c8] : !pto.tensor_view<1x1x1x24x8xf32> -> !pto.partition_tensor_view<1x1x1x24x8xf32> + %dst_part = pto.partition_view %dst_view, offsets = [%c0, %c0, %c0, %c0, %c0], sizes = [%c1, %c1, %c1, %c24, %c32] : !pto.tensor_view<1x1x1x24x32xf32> -> !pto.partition_tensor_view<1x1x1x24x32xf32> + + %src0 = pto.alloc_tile : !pto.tile_buf + %src1 = pto.alloc_tile : !pto.tile_buf + %dst = pto.alloc_tile : !pto.tile_buf + + pto.tload ins(%src0_part : !pto.partition_tensor_view<1x1x1x24x32xf32>) outs(%src0 : !pto.tile_buf) + pto.tload ins(%src1_part : !pto.partition_tensor_view<1x1x1x24x8xf32>) outs(%src1 : !pto.tile_buf) + pto.trowexpandsub ins(%src0, %src1 : !pto.tile_buf, 
!pto.tile_buf) outs(%dst : !pto.tile_buf) {src1Col = 1 : i64, src0eqdst = true, highPrecision = false} + pto.tstore ins(%dst : !pto.tile_buf) outs(%dst_part : !pto.partition_tensor_view<1x1x1x24x32xf32>) + return + } + + // f16_16x256: dstRow=16, dstCol=256, src1Row=16, src1Col=1, src0eqdst=true + + func.func @TROWEXPANDSUB_i32_16x32(%src0_ptr: !pto.ptr, %src1_ptr: !pto.ptr, %dst_ptr: !pto.ptr) attributes {pto.kernel} { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c8 = arith.constant 8 : index + %c16 = arith.constant 16 : index + %c32 = arith.constant 32 : index + %c128 = arith.constant 128 : index + %c512 = arith.constant 512 : index + + %src0_view = pto.make_tensor_view %src0_ptr, shape = [%c1, %c1, %c1, %c16, %c32], strides = [%c512, %c512, %c512, %c32, %c1] : !pto.tensor_view<1x1x1x16x32xi32> + %src1_view = pto.make_tensor_view %src1_ptr, shape = [%c1, %c1, %c1, %c16, %c8], strides = [%c128, %c128, %c128, %c8, %c1] : !pto.tensor_view<1x1x1x16x8xi32> + %dst_view = pto.make_tensor_view %dst_ptr, shape = [%c1, %c1, %c1, %c16, %c32], strides = [%c512, %c512, %c512, %c32, %c1] : !pto.tensor_view<1x1x1x16x32xi32> + + %src0_part = pto.partition_view %src0_view, offsets = [%c0, %c0, %c0, %c0, %c0], sizes = [%c1, %c1, %c1, %c16, %c32] : !pto.tensor_view<1x1x1x16x32xi32> -> !pto.partition_tensor_view<1x1x1x16x32xi32> + %src1_part = pto.partition_view %src1_view, offsets = [%c0, %c0, %c0, %c0, %c0], sizes = [%c1, %c1, %c1, %c16, %c8] : !pto.tensor_view<1x1x1x16x8xi32> -> !pto.partition_tensor_view<1x1x1x16x8xi32> + %dst_part = pto.partition_view %dst_view, offsets = [%c0, %c0, %c0, %c0, %c0], sizes = [%c1, %c1, %c1, %c16, %c32] : !pto.tensor_view<1x1x1x16x32xi32> -> !pto.partition_tensor_view<1x1x1x16x32xi32> + + %src0 = pto.alloc_tile : !pto.tile_buf + %src1 = pto.alloc_tile : !pto.tile_buf + %dst = pto.alloc_tile : !pto.tile_buf + + pto.tload ins(%src0_part : !pto.partition_tensor_view<1x1x1x16x32xi32>) outs(%src0 : !pto.tile_buf) + pto.tload 
ins(%src1_part : !pto.partition_tensor_view<1x1x1x16x8xi32>) outs(%src1 : !pto.tile_buf) + pto.trowexpandsub ins(%src0, %src1 : !pto.tile_buf, !pto.tile_buf) outs(%dst : !pto.tile_buf) {src1Col = 1 : i64, src0eqdst = true, highPrecision = false} + pto.tstore ins(%dst : !pto.tile_buf) outs(%dst_part : !pto.partition_tensor_view<1x1x1x16x32xi32>) + return + } + + // i16_16x64: dstRow=16, dstCol=64, src1Row=16, src1Col=1, src0eqdst=true +} diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/trowmax/CMakeLists.txt b/test/tilelang_st/npu/a5/src/st/smoke/testcase/trowmax/CMakeLists.txt new file mode 100644 index 000000000..62291cfb6 --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/trowmax/CMakeLists.txt @@ -0,0 +1,9 @@ +# Copyright (c) 2026 Huawei Technologies Co., Ltd. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. + +pto_tilelang_vec_st(trowmax) diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/trowmax/cases.py b/test/tilelang_st/npu/a5/src/st/smoke/testcase/trowmax/cases.py new file mode 100644 index 000000000..e55d36532 --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/trowmax/cases.py @@ -0,0 +1,231 @@ +#!/usr/bin/python3 +# Copyright (c) 2026 Huawei Technologies Co., Ltd. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). 
# Field order of every case dict; rows in _CASE_ROWS follow the same order.
_CASE_FIELDS = ("name", "dtype", "shape", "valid_shape", "eps")

_CASE_ROWS = (
    # f32 cases (case1-case5 from pto-isa)
    ("f32_127x64_valid127x63", np.float32, (127, 64), (127, 63), 1e-5),
    ("f32_63x64", np.float32, (63, 64), (63, 64), 1e-5),
    ("f32_31x128_valid31x127", np.float32, (31, 128), (31, 127), 1e-5),
    ("f32_15x192", np.float32, (15, 192), (15, 192), 1e-5),
    ("f32_7x448_valid7x447", np.float32, (7, 448), (7, 447), 1e-5),
    # f16 case (case6 from pto-isa)
    ("f16_256x16_valid256x15", np.float16, (256, 16), (256, 15), 1e-2),
    # f32 more cases (case7-case14 from pto-isa)
    ("f32_30x216", np.float32, (30, 216), (30, 216), 1e-5),
    ("f32_30x216_valid30x24", np.float32, (30, 216), (30, 24), 1e-5),
    ("f32_30x216_valid11x216", np.float32, (30, 216), (11, 216), 1e-5),
    ("f32_30x216_valid11x24", np.float32, (30, 216), (11, 24), 1e-5),
    ("f32_238x40", np.float32, (238, 40), (238, 40), 1e-5),
    ("f32_238x40_valid238x16", np.float32, (238, 40), (238, 16), 1e-5),
    ("f32_238x40_valid121x40", np.float32, (238, 40), (121, 40), 1e-5),
    ("f32_238x40_valid121x16", np.float32, (238, 40), (121, 16), 1e-5),
    # f32 DN dst cases (case15-case18 from pto-isa)
    ("f32_64x128", np.float32, (64, 128), (64, 128), 1e-5),
    ("f32_32x256", np.float32, (32, 256), (32, 256), 1e-5),
    ("f32_16x512", np.float32, (16, 512), (16, 512), 1e-5),
    ("f32_8x1024", np.float32, (8, 1024), (8, 1024), 1e-5),
    # int32 cases (case19-case23 from pto-isa)
    ("i32_127x64_valid127x63", np.int32, (127, 64), (127, 63), 0),
    ("i32_63x64", np.int32, (63, 64), (63, 64), 0),
    ("i32_31x128_valid31x127", np.int32, (31, 128), (31, 127), 0),
    ("i32_15x192", np.int32, (15, 192), (15, 192), 0),
    ("i32_7x448_valid7x447", np.int32, (7, 448), (7, 447), 0),
    # int16 cases (case24-case28 from pto-isa)
    ("i16_128x64", np.int16, (128, 64), (128, 64), 0),
    ("i16_64x64", np.int16, (64, 64), (64, 64), 0),
    ("i16_32x128", np.int16, (32, 128), (32, 128), 0),
    ("i16_16x192", np.int16, (16, 192), (16, 192), 0),
    ("i16_8x448", np.int16, (8, 448), (8, 448), 0),
)

CASES = [dict(zip(_CASE_FIELDS, row)) for row in _CASE_ROWS]

# Smoke subset: only these names survive; an unknown name is a hard error.
_SMOKE_CASE_NAMES = ['f32_15x192', 'i32_7x448_valid7x447']
_SMOKE_CASE_NAME_SET = set(_SMOKE_CASE_NAMES)
_known_names = {entry["name"] for entry in CASES}
_missing = [wanted for wanted in _SMOKE_CASE_NAMES if wanted not in _known_names]
if _missing:
    raise RuntimeError("unknown smoke case(s): " + ", ".join(_missing))
CASES = [entry for entry in CASES if entry["name"] in _SMOKE_CASE_NAME_SET]
+ +# coding=utf-8 + +import os +import sys +import numpy as np + +from cases import CASES +from st_common import result_cmp, style_fail, style_pass, validate_cases + + +def main(): + validate_cases(CASES) + case_filter = sys.argv[1] if len(sys.argv) > 1 else None + + all_passed = True + for case in CASES: + if case_filter is not None and case["name"] != case_filter: + continue + + case_dir = case["name"] + vr, vc = case["valid_shape"] + out_shape = (vr,) + + golden = np.fromfile(os.path.join(case_dir, "golden.bin"), dtype=case["dtype"], count=np.prod(out_shape)).reshape(out_shape) + output = np.fromfile(os.path.join(case_dir, "output.bin"), dtype=case["dtype"], count=np.prod(out_shape)).reshape(out_shape) + + ok = result_cmp(golden, output, case["eps"]) + if ok: + print(style_pass(f"[INFO] {case['name']}: compare passed")) + else: + print(style_fail(f"[ERROR] {case['name']}: compare failed")) + all_passed = False + + if not all_passed: + sys.exit(2) + print(style_pass("[INFO] all cases passed")) + + +if __name__ == "__main__": + main() diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/trowmax/gen_data.py b/test/tilelang_st/npu/a5/src/st/smoke/testcase/trowmax/gen_data.py new file mode 100644 index 000000000..97495c982 --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/trowmax/gen_data.py @@ -0,0 +1,41 @@ +#!/usr/bin/python3 +# Copyright (c) 2026 Huawei Technologies Co., Ltd. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. 
+# See LICENSE in the root of the software repository for the full text of the License. + +# coding=utf-8 + +import numpy as np +from cases import CASES +from st_common import validate_cases, setup_case_rng, save_case_data + +validate_cases(CASES) + +for case in CASES: + setup_case_rng(case) + + dtype = case["dtype"] + shape = case["shape"] + valid_shape = case["valid_shape"] + + if np.issubdtype(dtype, np.integer): + if dtype == np.int32: + input1 = np.random.randint(low=-100, high=100, size=shape).astype(dtype) + else: + input1 = np.random.randint(low=-50, high=50, size=shape).astype(dtype) + else: + input1 = np.random.uniform(low=-16, high=16, size=shape).astype(dtype) + + out_shape = (valid_shape[0],) + golden = np.zeros(out_shape, dtype=dtype) + vr, vc = valid_shape + for i in range(vr): + golden[i] = np.max(input1[i, :vc]) + + golden = golden.astype(dtype, copy=False) + save_case_data(case["name"], {"input1": input1, "golden": golden}) + print(f"[INFO] gen_data: {case['name']} shape={shape} valid_shape={valid_shape} dtype={dtype.__name__}") diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/trowmax/launch.cpp b/test/tilelang_st/npu/a5/src/st/smoke/testcase/trowmax/launch.cpp new file mode 100644 index 000000000..e8ea5ae61 --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/trowmax/launch.cpp @@ -0,0 +1,26 @@ +// Copyright (c) 2026 Huawei Technologies Co., Ltd. +// This program is free software, you can redistribute it and/or modify it under the terms and conditions of +// CANN Open Software License Agreement Version 2.0 (the "License"). +// Please refer to the License for details. You may not use this file except in compliance with the License. +// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. 
+// See LICENSE in the root of the software repository for the full text of the License. + +#include + +#ifndef AICORE +#define AICORE [aicore] +#endif + +extern "C" __global__ AICORE void TROWMAX_f32_15x192(__gm__ float *src, __gm__ float *dst); +extern "C" __global__ AICORE void TROWMAX_i32_7x448_valid7x447(__gm__ int32_t *src, __gm__ int32_t *dst); + +void LaunchTROWMAX_f32_15x192(float *src, float *dst, void *stream) { + TROWMAX_f32_15x192<<<1, nullptr, stream>>>((__gm__ float *)src, (__gm__ float *)dst); +} + + + +void LaunchTROWMAX_i32_7x448_valid7x447(int32_t *src, int32_t *dst, void *stream) { + TROWMAX_i32_7x448_valid7x447<<<1, nullptr, stream>>>((__gm__ int32_t *)src, (__gm__ int32_t *)dst); +} diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/trowmax/main.cpp b/test/tilelang_st/npu/a5/src/st/smoke/testcase/trowmax/main.cpp new file mode 100644 index 000000000..e98780082 --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/trowmax/main.cpp @@ -0,0 +1,163 @@ +// Copyright (c) 2026 Huawei Technologies Co., Ltd. +// This program is free software, you can redistribute it and/or modify it under the terms and conditions of +// CANN Open Software License Agreement Version 2.0 (the "License"). +// Please refer to the License for details. You may not use this file except in compliance with the License. +// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +// See LICENSE in the root of the software repository for the full text of the License. + +// Host driver for TileLang trowmax ST — case-table driven. 
+ +#include "acl/acl.h" +#include "test_common.h" +#include +#include +#include +#include +#include +#include + +using namespace PtoTestCommon; + +// Kernel launch wrappers (defined in launch.cpp) +void LaunchTROWMAX_f32_63x64(float *src, float *dst, void *stream); +void LaunchTROWMAX_f32_15x192(float *src, float *dst, void *stream); +void LaunchTROWMAX_f16_256x16_valid256x15(uint16_t *src, uint16_t *dst, void *stream); +void LaunchTROWMAX_f32_30x216_valid30x24(float *src, float *dst, void *stream); +void LaunchTROWMAX_f32_30x216_valid11x24(float *src, float *dst, void *stream); +void LaunchTROWMAX_f32_238x40_valid238x16(float *src, float *dst, void *stream); +void LaunchTROWMAX_f32_238x40_valid121x16(float *src, float *dst, void *stream); +void LaunchTROWMAX_f32_32x256(float *src, float *dst, void *stream); +void LaunchTROWMAX_f32_8x1024(float *src, float *dst, void *stream); +void LaunchTROWMAX_i32_63x64(int32_t *src, int32_t *dst, void *stream); +void LaunchTROWMAX_i32_15x192(int32_t *src, int32_t *dst, void *stream); +void LaunchTROWMAX_i32_7x448_valid7x447(int32_t *src, int32_t *dst, void *stream); +void LaunchTROWMAX_i16_128x64(int16_t *src, int16_t *dst, void *stream); +void LaunchTROWMAX_i16_32x128(int16_t *src, int16_t *dst, void *stream); +void LaunchTROWMAX_i16_8x448(int16_t *src, int16_t *dst, void *stream); + +using LaunchFnF32 = void (*)(float *, float *, void *); +using LaunchFnF16 = void (*)(uint16_t *, uint16_t *, void *); +using LaunchFnI32 = void (*)(int32_t *, int32_t *, void *); +using LaunchFnI16 = void (*)(int16_t *, int16_t *, void *); + +enum class DType { F32, F16, I32, I16 }; + +struct TestCase { + const char *name; + DType dtype; + union { + LaunchFnF32 launchF32; + LaunchFnF16 launchF16; + LaunchFnI32 launchI32; + LaunchFnI16 launchI16; + }; + size_t rows; // allocated tile rows + size_t cols; // allocated tile cols + size_t validRows; // effective computation rows (<= rows) + size_t validCols; // effective computation cols (<= cols) + 
size_t elemSize; // bytes per element +}; + +static const TestCase kCases[] = { + // f32 cases +{"f32_15x192", DType::F32, .launchF32 = LaunchTROWMAX_f32_15x192, 15, 192, 15, 192, 4}, +{"i32_7x448_valid7x447", DType::I32, .launchI32 = LaunchTROWMAX_i32_7x448_valid7x447, 7, 448, 7, 447, 4}, +}; +static constexpr size_t kNumCases = sizeof(kCases) / sizeof(kCases[0]); + +static int RunCase(const TestCase &tc, int deviceId, aclrtStream stream) { + int rc = 0; + const size_t srcElemCount = tc.rows * tc.cols; + const size_t srcFileSize = srcElemCount * tc.elemSize; + const size_t dstElemCount = tc.validRows * 1; + const size_t dstFileSize = dstElemCount * tc.elemSize; + + std::printf("[INFO] === case: %s (shape=%zux%zu, valid=%zux%zu) ===\n", + tc.name, tc.rows, tc.cols, tc.validRows, tc.validCols); + + // Per-case data directory + std::string caseDir = std::string("./") + tc.name; + size_t src0FileSize = srcFileSize; + + void *src0Host = nullptr, *dstHost = nullptr; + void *src0Device = nullptr, *dstDevice = nullptr; + + aclrtMallocHost(&src0Host, srcFileSize); + aclrtMallocHost(&dstHost, dstFileSize); + + aclrtMalloc(&src0Device, srcFileSize, ACL_MEM_MALLOC_HUGE_FIRST); + aclrtMalloc(&dstDevice, dstFileSize, ACL_MEM_MALLOC_HUGE_FIRST); + + if (!ReadFile((caseDir + "/input1.bin").c_str(), src0FileSize, src0Host, srcFileSize)) { + std::fprintf(stderr, "[ERROR] failed to read %s/input1.bin\n", caseDir.c_str()); + rc = 1; + } + + if (rc == 0) { + aclrtMemcpy(src0Device, srcFileSize, src0Host, srcFileSize, ACL_MEMCPY_HOST_TO_DEVICE); + + switch (tc.dtype) { + case DType::F32: tc.launchF32((float *)src0Device, (float *)dstDevice, stream); break; + case DType::F16: tc.launchF16((uint16_t *)src0Device, (uint16_t *)dstDevice, stream); break; + case DType::I32: tc.launchI32((int32_t *)src0Device, (int32_t *)dstDevice, stream); break; + case DType::I16: tc.launchI16((int16_t *)src0Device, (int16_t *)dstDevice, stream); break; + } + + aclrtSynchronizeStream(stream); + 
aclrtMemcpy(dstHost, dstFileSize, dstDevice, dstFileSize, ACL_MEMCPY_DEVICE_TO_HOST); + } + + if (rc == 0 && !WriteFile((caseDir + "/output.bin").c_str(), dstHost, dstFileSize)) { + std::fprintf(stderr, "[ERROR] failed to write %s/output.bin\n", caseDir.c_str()); + rc = 1; + } + + if (src0Device != nullptr) + aclrtFree(src0Device); + if (dstDevice != nullptr) + aclrtFree(dstDevice); + if (src0Host != nullptr) + aclrtFreeHost(src0Host); + if (dstHost != nullptr) + aclrtFreeHost(dstHost); + + if (rc == 0) + std::printf("[INFO] case %s done\n", tc.name); + return rc; +} + +int main(int argc, char *argv[]) { + // Optional case filter: ./trowmax [case_name] + const char *caseFilter = (argc > 1) ? argv[1] : nullptr; + + int rc = 0; + int deviceId = 0; + aclrtStream stream = nullptr; + + aclInit(nullptr); + if (const char *envDevice = std::getenv("ACL_DEVICE_ID")) { + deviceId = std::atoi(envDevice); + } + aclrtSetDevice(deviceId); + aclrtCreateStream(&stream); + + for (size_t i = 0; i < kNumCases; ++i) { + if (caseFilter != nullptr && std::strcmp(kCases[i].name, caseFilter) != 0) { + continue; + } + int ret = RunCase(kCases[i], deviceId, stream); + if (ret != 0) { + std::fprintf(stderr, "[ERROR] case %s failed\n", kCases[i].name); + rc = 1; + break; + } + } + + if (stream != nullptr) + aclrtDestroyStream(stream); + aclrtResetDevice(deviceId); + aclFinalize(); + + return rc; +} diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/trowmax/trowmax.pto b/test/tilelang_st/npu/a5/src/st/smoke/testcase/trowmax/trowmax.pto new file mode 100644 index 000000000..dc9ecabc8 --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/trowmax/trowmax.pto @@ -0,0 +1,112 @@ +// Copyright (c) 2026 Huawei Technologies Co., Ltd. +// This program is free software, you can redistribute it and/or modify it under the terms and conditions of +// CANN Open Software License Agreement Version 2.0 (the "License"). +// Please refer to the License for details. 
You may not use this file except in compliance with the License. +// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +// See LICENSE in the root of the software repository for the full text of the License. + +// TileLang ST kernels for pto.trowmax: tload(src) + trowmax(src, tmp)->dst + tstore(dst). + +module attributes {pto.target_arch = "a5", pto.kernel_kind = #pto.kernel_kind} { + + // Smoke case: f32 15x192 (valid=15x192) + func.func @TROWMAX_f32_15x192(%src_ptr: !pto.ptr, %dst_ptr: !pto.ptr) attributes {pto.kernel} { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c8 = arith.constant 8 : index + %c15 = arith.constant 15 : index + %c192 = arith.constant 192 : index + %c2880 = arith.constant 2880 : index + + %src_view = pto.make_tensor_view %src_ptr, + shape = [%c1, %c1, %c1, %c15, %c192], + strides = [%c2880, %c2880, %c2880, %c192, %c1] + : !pto.tensor_view<1x1x1x15x192xf32> + %dst_view = pto.make_tensor_view %dst_ptr, + shape = [%c1, %c1, %c1, %c15, %c1], + strides = [%c15, %c15, %c15, %c1, %c1] + : !pto.tensor_view<1x1x1x15x1xf32> + + %src_part = pto.partition_view %src_view, + offsets = [%c0, %c0, %c0, %c0, %c0], + sizes = [%c1, %c1, %c1, %c15, %c192] + : !pto.tensor_view<1x1x1x15x192xf32> -> !pto.partition_tensor_view<1x1x1x15x192xf32> + %dst_part = pto.partition_view %dst_view, + offsets = [%c0, %c0, %c0, %c0, %c0], + sizes = [%c1, %c1, %c1, %c15, %c1] + : !pto.tensor_view<1x1x1x15x1xf32> -> !pto.partition_tensor_view<1x1x1x15x1xf32> + + %src = pto.alloc_tile + : !pto.tile_buf + %tmp = pto.alloc_tile + : !pto.tile_buf + %dst = pto.alloc_tile + : !pto.tile_buf + + pto.tload ins(%src_part : !pto.partition_tensor_view<1x1x1x15x192xf32>) + outs(%src : !pto.tile_buf) + + pto.trowmax ins(%src, %tmp : !pto.tile_buf, + !pto.tile_buf) + outs(%dst : !pto.tile_buf) + + pto.tstore ins(%dst : 
!pto.tile_buf) + outs(%dst_part : !pto.partition_tensor_view<1x1x1x15x1xf32>) + return + } + + // Smoke case: i32 7x448 (valid=7x447) + + func.func @TROWMAX_i32_7x448_valid7x447(%src_ptr: !pto.ptr, %dst_ptr: !pto.ptr) attributes {pto.kernel} { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c8 = arith.constant 8 : index + %c7 = arith.constant 7 : index + %c447 = arith.constant 447 : index + %c448 = arith.constant 448 : index + %c3136 = arith.constant 3136 : index + + %src_view = pto.make_tensor_view %src_ptr, + shape = [%c1, %c1, %c1, %c7, %c448], + strides = [%c3136, %c3136, %c3136, %c448, %c1] + : !pto.tensor_view<1x1x1x7x448xi32> + %dst_view = pto.make_tensor_view %dst_ptr, + shape = [%c1, %c1, %c1, %c7, %c1], + strides = [%c7, %c7, %c7, %c1, %c1] + : !pto.tensor_view<1x1x1x7x1xi32> + + %src_part = pto.partition_view %src_view, + offsets = [%c0, %c0, %c0, %c0, %c0], + sizes = [%c1, %c1, %c1, %c7, %c447] + : !pto.tensor_view<1x1x1x7x448xi32> -> !pto.partition_tensor_view<1x1x1x7x447xi32> + %dst_part = pto.partition_view %dst_view, + offsets = [%c0, %c0, %c0, %c0, %c0], + sizes = [%c1, %c1, %c1, %c7, %c1] + : !pto.tensor_view<1x1x1x7x1xi32> -> !pto.partition_tensor_view<1x1x1x7x1xi32> + + %src = pto.alloc_tile + : !pto.tile_buf + %tmp = pto.alloc_tile + : !pto.tile_buf + %dst = pto.alloc_tile + : !pto.tile_buf + + pto.tload ins(%src_part : !pto.partition_tensor_view<1x1x1x7x447xi32>) + outs(%src : !pto.tile_buf) + + pto.trowmax ins(%src, %tmp : !pto.tile_buf, + !pto.tile_buf) + outs(%dst : !pto.tile_buf) + + pto.tstore ins(%dst : !pto.tile_buf) + outs(%dst_part : !pto.partition_tensor_view<1x1x1x7x1xi32>) + return + } + + // ======================================================================== + // int16 cases (case24-case28) + // ======================================================================== + + // case24: i16 128x64 valid=128x64 +} diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/trowmin/CMakeLists.txt
b/test/tilelang_st/npu/a5/src/st/smoke/testcase/trowmin/CMakeLists.txt new file mode 100644 index 000000000..e88611a82 --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/trowmin/CMakeLists.txt @@ -0,0 +1,9 @@ +# Copyright (c) 2026 Huawei Technologies Co., Ltd. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. + +pto_tilelang_vec_st(trowmin) diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/trowmin/cases.py b/test/tilelang_st/npu/a5/src/st/smoke/testcase/trowmin/cases.py new file mode 100644 index 000000000..5bb718723 --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/trowmin/cases.py @@ -0,0 +1,231 @@ +#!/usr/bin/python3 +# Copyright (c) 2026 Huawei Technologies Co., Ltd. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. + +# coding=utf-8 + +"""Single source of truth for trowmin ST test cases. + +Aligned with pto-isa tests/npu/a2a3/src/st/testcase/trowmin (28 cases). 
+""" + +import numpy as np + +CASES = [ + # f32 cases (case1-case5 from pto-isa) + { + "name": "f32_127x64_valid127x63", + "dtype": np.float32, + "shape": (127, 64), + "valid_shape": (127, 63), + "eps": 1e-5, + }, + { + "name": "f32_63x64", + "dtype": np.float32, + "shape": (63, 64), + "valid_shape": (63, 64), + "eps": 1e-5, + }, + { + "name": "f32_31x128_valid31x127", + "dtype": np.float32, + "shape": (31, 128), + "valid_shape": (31, 127), + "eps": 1e-5, + }, + { + "name": "f32_15x192", + "dtype": np.float32, + "shape": (15, 192), + "valid_shape": (15, 192), + "eps": 1e-5, + }, + { + "name": "f32_7x448_valid7x447", + "dtype": np.float32, + "shape": (7, 448), + "valid_shape": (7, 447), + "eps": 1e-5, + }, + # f16 case (case6 from pto-isa) + { + "name": "f16_256x16_valid256x15", + "dtype": np.float16, + "shape": (256, 16), + "valid_shape": (256, 15), + "eps": 1e-2, + }, + # f32 more cases (case7-case14 from pto-isa) + { + "name": "f32_30x216", + "dtype": np.float32, + "shape": (30, 216), + "valid_shape": (30, 216), + "eps": 1e-5, + }, + { + "name": "f32_30x216_valid30x24", + "dtype": np.float32, + "shape": (30, 216), + "valid_shape": (30, 24), + "eps": 1e-5, + }, + { + "name": "f32_30x216_valid11x216", + "dtype": np.float32, + "shape": (30, 216), + "valid_shape": (11, 216), + "eps": 1e-5, + }, + { + "name": "f32_30x216_valid11x24", + "dtype": np.float32, + "shape": (30, 216), + "valid_shape": (11, 24), + "eps": 1e-5, + }, + { + "name": "f32_238x40", + "dtype": np.float32, + "shape": (238, 40), + "valid_shape": (238, 40), + "eps": 1e-5, + }, + { + "name": "f32_238x40_valid238x16", + "dtype": np.float32, + "shape": (238, 40), + "valid_shape": (238, 16), + "eps": 1e-5, + }, + { + "name": "f32_238x40_valid121x40", + "dtype": np.float32, + "shape": (238, 40), + "valid_shape": (121, 40), + "eps": 1e-5, + }, + { + "name": "f32_238x40_valid121x16", + "dtype": np.float32, + "shape": (238, 40), + "valid_shape": (121, 16), + "eps": 1e-5, + }, + # f32 DN dst cases 
(case15-case18 from pto-isa) + { + "name": "f32_64x128", + "dtype": np.float32, + "shape": (64, 128), + "valid_shape": (64, 128), + "eps": 1e-5, + }, + { + "name": "f32_32x256", + "dtype": np.float32, + "shape": (32, 256), + "valid_shape": (32, 256), + "eps": 1e-5, + }, + { + "name": "f32_16x512", + "dtype": np.float32, + "shape": (16, 512), + "valid_shape": (16, 512), + "eps": 1e-5, + }, + { + "name": "f32_8x1024", + "dtype": np.float32, + "shape": (8, 1024), + "valid_shape": (8, 1024), + "eps": 1e-5, + }, + + # int32 cases (case19-case23 from pto-isa) + { + "name": "i32_127x64_valid127x63", + "dtype": np.int32, + "shape": (127, 64), + "valid_shape": (127, 63), + "eps": 0, + }, + { + "name": "i32_63x64", + "dtype": np.int32, + "shape": (63, 64), + "valid_shape": (63, 64), + "eps": 0, + }, + { + "name": "i32_31x128_valid31x127", + "dtype": np.int32, + "shape": (31, 128), + "valid_shape": (31, 127), + "eps": 0, + }, + { + "name": "i32_15x192", + "dtype": np.int32, + "shape": (15, 192), + "valid_shape": (15, 192), + "eps": 0, + }, + { + "name": "i32_7x448_valid7x447", + "dtype": np.int32, + "shape": (7, 448), + "valid_shape": (7, 447), + "eps": 0, + }, + + # int16 cases (case24-case28 from pto-isa) + { + "name": "i16_128x64", + "dtype": np.int16, + "shape": (128, 64), + "valid_shape": (128, 64), + "eps": 0, + }, + { + "name": "i16_64x64", + "dtype": np.int16, + "shape": (64, 64), + "valid_shape": (64, 64), + "eps": 0, + }, + { + "name": "i16_32x128", + "dtype": np.int16, + "shape": (32, 128), + "valid_shape": (32, 128), + "eps": 0, + }, + { + "name": "i16_16x192", + "dtype": np.int16, + "shape": (16, 192), + "valid_shape": (16, 192), + "eps": 0, + }, + { + "name": "i16_8x448", + "dtype": np.int16, + "shape": (8, 448), + "valid_shape": (8, 448), + "eps": 0, + }, +] + +_SMOKE_CASE_NAMES = ['f32_15x192', 'i32_7x448_valid7x447'] +_SMOKE_CASE_NAME_SET = set(_SMOKE_CASE_NAMES) +_missing = [name for name in _SMOKE_CASE_NAMES if name not in {case["name"] for case in CASES}] 
+if _missing: + raise RuntimeError("unknown smoke case(s): " + ", ".join(_missing)) +CASES = [case for case in CASES if case["name"] in _SMOKE_CASE_NAME_SET] diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/trowmin/compare.py b/test/tilelang_st/npu/a5/src/st/smoke/testcase/trowmin/compare.py new file mode 100644 index 000000000..12d4207bd --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/trowmin/compare.py @@ -0,0 +1,49 @@ +#!/usr/bin/python3 +# Copyright (c) 2026 Huawei Technologies Co., Ltd. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. 
+ +# coding=utf-8 + +import os +import sys +import numpy as np + +from cases import CASES +from st_common import result_cmp, style_fail, style_pass, validate_cases + + +def main(): + validate_cases(CASES) + case_filter = sys.argv[1] if len(sys.argv) > 1 else None + + all_passed = True + for case in CASES: + if case_filter is not None and case["name"] != case_filter: + continue + + case_dir = case["name"] + vr, vc = case["valid_shape"] + out_shape = (vr,) + + golden = np.fromfile(os.path.join(case_dir, "golden.bin"), dtype=case["dtype"], count=np.prod(out_shape)).reshape(out_shape) + output = np.fromfile(os.path.join(case_dir, "output.bin"), dtype=case["dtype"], count=np.prod(out_shape)).reshape(out_shape) + + ok = result_cmp(golden, output, case["eps"]) + if ok: + print(style_pass(f"[INFO] {case['name']}: compare passed")) + else: + print(style_fail(f"[ERROR] {case['name']}: compare failed")) + all_passed = False + + if not all_passed: + sys.exit(2) + print(style_pass("[INFO] all cases passed")) + + +if __name__ == "__main__": + main() diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/trowmin/gen_data.py b/test/tilelang_st/npu/a5/src/st/smoke/testcase/trowmin/gen_data.py new file mode 100644 index 000000000..cf1bed8ac --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/trowmin/gen_data.py @@ -0,0 +1,41 @@ +#!/usr/bin/python3 +# Copyright (c) 2026 Huawei Technologies Co., Ltd. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. 
+# See LICENSE in the root of the software repository for the full text of the License. + +# coding=utf-8 + +import numpy as np +from cases import CASES +from st_common import validate_cases, setup_case_rng, save_case_data + +validate_cases(CASES) + +for case in CASES: + setup_case_rng(case) + + dtype = case["dtype"] + shape = case["shape"] + valid_shape = case["valid_shape"] + + if np.issubdtype(dtype, np.integer): + if dtype == np.int32: + input1 = np.random.randint(low=-100, high=100, size=shape).astype(dtype) + else: + input1 = np.random.randint(low=-50, high=50, size=shape).astype(dtype) + else: + input1 = np.random.uniform(low=-16, high=16, size=shape).astype(dtype) + + out_shape = (valid_shape[0],) + golden = np.zeros(out_shape, dtype=dtype) + vr, vc = valid_shape + for i in range(vr): + golden[i] = np.min(input1[i, :vc]) + + golden = golden.astype(dtype, copy=False) + save_case_data(case["name"], {"input1": input1, "golden": golden}) + print(f"[INFO] gen_data: {case['name']} shape={shape} valid_shape={valid_shape} dtype={dtype.__name__}") diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/trowmin/launch.cpp b/test/tilelang_st/npu/a5/src/st/smoke/testcase/trowmin/launch.cpp new file mode 100644 index 000000000..258d321a0 --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/trowmin/launch.cpp @@ -0,0 +1,26 @@ +// Copyright (c) 2026 Huawei Technologies Co., Ltd. +// This program is free software, you can redistribute it and/or modify it under the terms and conditions of +// CANN Open Software License Agreement Version 2.0 (the "License"). +// Please refer to the License for details. You may not use this file except in compliance with the License. +// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. 
+// See LICENSE in the root of the software repository for the full text of the License. + +#include + +#ifndef AICORE +#define AICORE [aicore] +#endif + +extern "C" __global__ AICORE void TROWMIN_f32_15x192(__gm__ float *src, __gm__ float *dst); +extern "C" __global__ AICORE void TROWMIN_i32_7x448_valid7x447(__gm__ int32_t *src, __gm__ int32_t *dst); + +void LaunchTROWMIN_f32_15x192(float *src, float *dst, void *stream) { + TROWMIN_f32_15x192<<<1, nullptr, stream>>>((__gm__ float *)src, (__gm__ float *)dst); +} + + + +void LaunchTROWMIN_i32_7x448_valid7x447(int32_t *src, int32_t *dst, void *stream) { + TROWMIN_i32_7x448_valid7x447<<<1, nullptr, stream>>>((__gm__ int32_t *)src, (__gm__ int32_t *)dst); +} diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/trowmin/main.cpp b/test/tilelang_st/npu/a5/src/st/smoke/testcase/trowmin/main.cpp new file mode 100644 index 000000000..26e052683 --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/trowmin/main.cpp @@ -0,0 +1,163 @@ +// Copyright (c) 2026 Huawei Technologies Co., Ltd. +// This program is free software, you can redistribute it and/or modify it under the terms and conditions of +// CANN Open Software License Agreement Version 2.0 (the "License"). +// Please refer to the License for details. You may not use this file except in compliance with the License. +// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +// See LICENSE in the root of the software repository for the full text of the License. + +// Host driver for TileLang trowmin ST — case-table driven. 
+ +#include "acl/acl.h" +#include "test_common.h" +#include +#include +#include +#include +#include +#include + +using namespace PtoTestCommon; + +// Kernel launch wrappers (defined in launch.cpp) +void LaunchTROWMIN_f32_63x64(float *src, float *dst, void *stream); +void LaunchTROWMIN_f32_15x192(float *src, float *dst, void *stream); +void LaunchTROWMIN_f16_256x16_valid256x15(uint16_t *src, uint16_t *dst, void *stream); +void LaunchTROWMIN_f32_30x216_valid30x24(float *src, float *dst, void *stream); +void LaunchTROWMIN_f32_30x216_valid11x24(float *src, float *dst, void *stream); +void LaunchTROWMIN_f32_238x40_valid238x16(float *src, float *dst, void *stream); +void LaunchTROWMIN_f32_238x40_valid121x16(float *src, float *dst, void *stream); +void LaunchTROWMIN_f32_32x256(float *src, float *dst, void *stream); +void LaunchTROWMIN_f32_8x1024(float *src, float *dst, void *stream); +void LaunchTROWMIN_i32_63x64(int32_t *src, int32_t *dst, void *stream); +void LaunchTROWMIN_i32_15x192(int32_t *src, int32_t *dst, void *stream); +void LaunchTROWMIN_i32_7x448_valid7x447(int32_t *src, int32_t *dst, void *stream); +void LaunchTROWMIN_i16_128x64(int16_t *src, int16_t *dst, void *stream); +void LaunchTROWMIN_i16_32x128(int16_t *src, int16_t *dst, void *stream); +void LaunchTROWMIN_i16_8x448(int16_t *src, int16_t *dst, void *stream); + +using LaunchFnF32 = void (*)(float *, float *, void *); +using LaunchFnF16 = void (*)(uint16_t *, uint16_t *, void *); +using LaunchFnI32 = void (*)(int32_t *, int32_t *, void *); +using LaunchFnI16 = void (*)(int16_t *, int16_t *, void *); + +enum class DType { F32, F16, I32, I16 }; + +struct TestCase { + const char *name; + DType dtype; + union { + LaunchFnF32 launchF32; + LaunchFnF16 launchF16; + LaunchFnI32 launchI32; + LaunchFnI16 launchI16; + }; + size_t rows; // allocated tile rows + size_t cols; // allocated tile cols + size_t validRows; // effective computation rows (<= rows) + size_t validCols; // effective computation cols (<= cols) + 
size_t elemSize; // bytes per element +}; + +static const TestCase kCases[] = { + // f32 cases +{"f32_15x192", DType::F32, .launchF32 = LaunchTROWMIN_f32_15x192, 15, 192, 15, 192, 4}, +{"i32_7x448_valid7x447", DType::I32, .launchI32 = LaunchTROWMIN_i32_7x448_valid7x447, 7, 448, 7, 447, 4}, +}; +static constexpr size_t kNumCases = sizeof(kCases) / sizeof(kCases[0]); + +static int RunCase(const TestCase &tc, int deviceId, aclrtStream stream) { + int rc = 0; + const size_t srcElemCount = tc.rows * tc.cols; + const size_t srcFileSize = srcElemCount * tc.elemSize; + const size_t dstElemCount = tc.validRows * 1; + const size_t dstFileSize = dstElemCount * tc.elemSize; + + std::printf("[INFO] === case: %s (shape=%zux%zu, valid=%zux%zu) ===\n", + tc.name, tc.rows, tc.cols, tc.validRows, tc.validCols); + + // Per-case data directory + std::string caseDir = std::string("./") + tc.name; + size_t src0FileSize = srcFileSize; + + void *src0Host = nullptr, *dstHost = nullptr; + void *src0Device = nullptr, *dstDevice = nullptr; + + aclrtMallocHost(&src0Host, srcFileSize); + aclrtMallocHost(&dstHost, dstFileSize); + + aclrtMalloc(&src0Device, srcFileSize, ACL_MEM_MALLOC_HUGE_FIRST); + aclrtMalloc(&dstDevice, dstFileSize, ACL_MEM_MALLOC_HUGE_FIRST); + + if (!ReadFile((caseDir + "/input1.bin").c_str(), src0FileSize, src0Host, srcFileSize)) { + std::fprintf(stderr, "[ERROR] failed to read %s/input1.bin\n", caseDir.c_str()); + rc = 1; + } + + if (rc == 0) { + aclrtMemcpy(src0Device, srcFileSize, src0Host, srcFileSize, ACL_MEMCPY_HOST_TO_DEVICE); + + switch (tc.dtype) { + case DType::F32: tc.launchF32((float *)src0Device, (float *)dstDevice, stream); break; + case DType::F16: tc.launchF16((uint16_t *)src0Device, (uint16_t *)dstDevice, stream); break; + case DType::I32: tc.launchI32((int32_t *)src0Device, (int32_t *)dstDevice, stream); break; + case DType::I16: tc.launchI16((int16_t *)src0Device, (int16_t *)dstDevice, stream); break; + } + + aclrtSynchronizeStream(stream); + 
aclrtMemcpy(dstHost, dstFileSize, dstDevice, dstFileSize, ACL_MEMCPY_DEVICE_TO_HOST); + } + + if (rc == 0 && !WriteFile((caseDir + "/output.bin").c_str(), dstHost, dstFileSize)) { + std::fprintf(stderr, "[ERROR] failed to write %s/output.bin\n", caseDir.c_str()); + rc = 1; + } + + if (src0Device != nullptr) + aclrtFree(src0Device); + if (dstDevice != nullptr) + aclrtFree(dstDevice); + if (src0Host != nullptr) + aclrtFreeHost(src0Host); + if (dstHost != nullptr) + aclrtFreeHost(dstHost); + + if (rc == 0) + std::printf("[INFO] case %s done\n", tc.name); + return rc; +} + +int main(int argc, char *argv[]) { + // Optional case filter: ./trowmin [case_name] + const char *caseFilter = (argc > 1) ? argv[1] : nullptr; + + int rc = 0; + int deviceId = 0; + aclrtStream stream = nullptr; + + aclInit(nullptr); + if (const char *envDevice = std::getenv("ACL_DEVICE_ID")) { + deviceId = std::atoi(envDevice); + } + aclrtSetDevice(deviceId); + aclrtCreateStream(&stream); + + for (size_t i = 0; i < kNumCases; ++i) { + if (caseFilter != nullptr && std::strcmp(kCases[i].name, caseFilter) != 0) { + continue; + } + int ret = RunCase(kCases[i], deviceId, stream); + if (ret != 0) { + std::fprintf(stderr, "[ERROR] case %s failed\n", kCases[i].name); + rc = 1; + break; + } + } + + if (stream != nullptr) + aclrtDestroyStream(stream); + aclrtResetDevice(deviceId); + aclFinalize(); + + return rc; +} diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/trowmin/trowmin.pto b/test/tilelang_st/npu/a5/src/st/smoke/testcase/trowmin/trowmin.pto new file mode 100644 index 000000000..3bf90b02a --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/trowmin/trowmin.pto @@ -0,0 +1,112 @@ +// Copyright (c) 2026 Huawei Technologies Co., Ltd. +// This program is free software, you can redistribute it and/or modify it under the terms and conditions of +// CANN Open Software License Agreement Version 2.0 (the "License"). +// Please refer to the License for details. 
You may not use this file except in compliance with the License. +// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +// See LICENSE in the root of the software repository for the full text of the License. + +// TileLang ST kernels for pto.trowmin: tload(src) + trowmin(src, tmp)->dst + tstore(dst). + +module attributes {pto.target_arch = "a5", pto.kernel_kind = #pto.kernel_kind} { + + // Smoke case: f32 15x192 (valid=15x192) + func.func @TROWMIN_f32_15x192(%src_ptr: !pto.ptr, %dst_ptr: !pto.ptr) attributes {pto.kernel} { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c8 = arith.constant 8 : index + %c15 = arith.constant 15 : index + %c192 = arith.constant 192 : index + %c2880 = arith.constant 2880 : index + + %src_view = pto.make_tensor_view %src_ptr, + shape = [%c1, %c1, %c1, %c15, %c192], + strides = [%c2880, %c2880, %c2880, %c192, %c1] + : !pto.tensor_view<1x1x1x15x192xf32> + %dst_view = pto.make_tensor_view %dst_ptr, + shape = [%c1, %c1, %c1, %c15, %c1], + strides = [%c15, %c15, %c15, %c1, %c1] + : !pto.tensor_view<1x1x1x15x1xf32> + + %src_part = pto.partition_view %src_view, + offsets = [%c0, %c0, %c0, %c0, %c0], + sizes = [%c1, %c1, %c1, %c15, %c192] + : !pto.tensor_view<1x1x1x15x192xf32> -> !pto.partition_tensor_view<1x1x1x15x192xf32> + %dst_part = pto.partition_view %dst_view, + offsets = [%c0, %c0, %c0, %c0, %c0], + sizes = [%c1, %c1, %c1, %c15, %c1] + : !pto.tensor_view<1x1x1x15x1xf32> -> !pto.partition_tensor_view<1x1x1x15x1xf32> + + %src = pto.alloc_tile + : !pto.tile_buf + %tmp = pto.alloc_tile + : !pto.tile_buf + %dst = pto.alloc_tile + : !pto.tile_buf + + pto.tload ins(%src_part : !pto.partition_tensor_view<1x1x1x15x192xf32>) + outs(%src : !pto.tile_buf) + + pto.trowmin ins(%src, %tmp : !pto.tile_buf, + !pto.tile_buf) + outs(%dst : !pto.tile_buf) + + pto.tstore ins(%dst : 
!pto.tile_buf) + outs(%dst_part : !pto.partition_tensor_view<1x1x1x15x1xf32>) + return + } + + // Smoke case: i32 7x448 (valid=7x447) + + func.func @TROWMIN_i32_7x448_valid7x447(%src_ptr: !pto.ptr, %dst_ptr: !pto.ptr) attributes {pto.kernel} { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c8 = arith.constant 8 : index + %c7 = arith.constant 7 : index + %c447 = arith.constant 447 : index + %c448 = arith.constant 448 : index + %c3136 = arith.constant 3136 : index + + %src_view = pto.make_tensor_view %src_ptr, + shape = [%c1, %c1, %c1, %c7, %c448], + strides = [%c3136, %c3136, %c3136, %c448, %c1] + : !pto.tensor_view<1x1x1x7x448xi32> + %dst_view = pto.make_tensor_view %dst_ptr, + shape = [%c1, %c1, %c1, %c7, %c1], + strides = [%c7, %c7, %c7, %c1, %c1] + : !pto.tensor_view<1x1x1x7x1xi32> + + %src_part = pto.partition_view %src_view, + offsets = [%c0, %c0, %c0, %c0, %c0], + sizes = [%c1, %c1, %c1, %c7, %c447] + : !pto.tensor_view<1x1x1x7x448xi32> -> !pto.partition_tensor_view<1x1x1x7x447xi32> + %dst_part = pto.partition_view %dst_view, + offsets = [%c0, %c0, %c0, %c0, %c0], + sizes = [%c1, %c1, %c1, %c7, %c1] + : !pto.tensor_view<1x1x1x7x1xi32> -> !pto.partition_tensor_view<1x1x1x7x1xi32> + + %src = pto.alloc_tile + : !pto.tile_buf + %tmp = pto.alloc_tile + : !pto.tile_buf + %dst = pto.alloc_tile + : !pto.tile_buf + + pto.tload ins(%src_part : !pto.partition_tensor_view<1x1x1x7x447xi32>) + outs(%src : !pto.tile_buf) + + pto.trowmin ins(%src, %tmp : !pto.tile_buf, + !pto.tile_buf) + outs(%dst : !pto.tile_buf) + + pto.tstore ins(%dst : !pto.tile_buf) + outs(%dst_part : !pto.partition_tensor_view<1x1x1x7x1xi32>) + return + } + + // ======================================================================== + // int16 cases (case24-case28): no int16 kernels are defined in this module + // ======================================================================== + + // case24: i16 128x64 valid=128x64 (kernel not present in this file) +} diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/trowprod/CMakeLists.txt 
b/test/tilelang_st/npu/a5/src/st/smoke/testcase/trowprod/CMakeLists.txt new file mode 100644 index 000000000..6a30d1293 --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/trowprod/CMakeLists.txt @@ -0,0 +1,9 @@ +# Copyright (c) 2026 Huawei Technologies Co., Ltd. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. + +pto_tilelang_vec_st(trowprod) diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/trowprod/cases.py b/test/tilelang_st/npu/a5/src/st/smoke/testcase/trowprod/cases.py new file mode 100644 index 000000000..f3c666df1 --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/trowprod/cases.py @@ -0,0 +1,160 @@ +#!/usr/bin/python3 +# Copyright (c) 2026 Huawei Technologies Co., Ltd. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. + +# coding=utf-8 + +"""Single source of truth for trowprod ST test cases. 
+ +Aligned with pto-isa tests/npu/a5/src/st/testcase/trowprod (18 cases). +""" + +import numpy as np + +CASES = [ + # f32 cases (case1-case5 from pto-isa) + { + "name": "f32_127x64_valid127x63", + "dtype": np.float32, + "shape": (127, 64), + "valid_shape": (127, 63), + "eps": 1e-3, + }, + { + "name": "f32_63x64", + "dtype": np.float32, + "shape": (63, 64), + "valid_shape": (63, 64), + "eps": 1e-3, + }, + { + "name": "f32_31x128_valid31x127", + "dtype": np.float32, + "shape": (31, 128), + "valid_shape": (31, 127), + "eps": 1e-3, + }, + { + "name": "f32_15x192", + "dtype": np.float32, + "shape": (15, 192), + "valid_shape": (15, 192), + "eps": 1e-3, + }, + { + "name": "f32_7x448_valid7x447", + "dtype": np.float32, + "shape": (7, 448), + "valid_shape": (7, 447), + "eps": 1e-3, + }, + # f16 case (case6 from pto-isa) + { + "name": "f16_256x16_valid256x15", + "dtype": np.float16, + "shape": (256, 16), + "valid_shape": (256, 15), + "eps": 1e-1, + }, + # f32 DN dst cases (case7-case10 from pto-isa) + { + "name": "f32_64x128", + "dtype": np.float32, + "shape": (64, 128), + "valid_shape": (64, 128), + "eps": 1e-3, + }, + { + "name": "f32_32x256", + "dtype": np.float32, + "shape": (32, 256), + "valid_shape": (32, 256), + "eps": 1e-3, + }, + { + "name": "f32_16x512", + "dtype": np.float32, + "shape": (16, 512), + "valid_shape": (16, 512), + "eps": 1e-3, + }, + { + "name": "f32_8x1024", + "dtype": np.float32, + "shape": (8, 1024), + "valid_shape": (8, 1024), + "eps": 1e-3, + }, + + # int32 cases (case11-case15 from pto-isa) + { + "name": "i32_127x64_valid127x63", + "dtype": np.int32, + "shape": (127, 64), + "valid_shape": (127, 63), + "eps": 0, + }, + { + "name": "i32_63x64", + "dtype": np.int32, + "shape": (63, 64), + "valid_shape": (63, 64), + "eps": 0, + }, + { + "name": "i32_31x128_valid31x127", + "dtype": np.int32, + "shape": (31, 128), + "valid_shape": (31, 127), + "eps": 0, + }, + { + "name": "i32_15x192", + "dtype": np.int32, + "shape": (15, 192), + "valid_shape": (15, 
192), + "eps": 0, + }, + { + "name": "i32_7x448_valid7x447", + "dtype": np.int32, + "shape": (7, 448), + "valid_shape": (7, 447), + "eps": 0, + }, + + # int16 cases (case16-case18 from pto-isa) + { + "name": "i16_256x16_valid256x15", + "dtype": np.int16, + "shape": (256, 16), + "valid_shape": (256, 15), + "eps": 0, + }, + { + "name": "i16_63x64", + "dtype": np.int16, + "shape": (63, 64), + "valid_shape": (63, 64), + "eps": 0, + }, + { + "name": "i16_31x128_valid31x127", + "dtype": np.int16, + "shape": (31, 128), + "valid_shape": (31, 127), + "eps": 0, + }, +] + +_SMOKE_CASE_NAMES = ['f32_15x192', 'i32_7x448_valid7x447'] +_SMOKE_CASE_NAME_SET = set(_SMOKE_CASE_NAMES) +_missing = [name for name in _SMOKE_CASE_NAMES if name not in {case["name"] for case in CASES}] +if _missing: + raise RuntimeError("unknown smoke case(s): " + ", ".join(_missing)) +CASES = [case for case in CASES if case["name"] in _SMOKE_CASE_NAME_SET] diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/trowprod/compare.py b/test/tilelang_st/npu/a5/src/st/smoke/testcase/trowprod/compare.py new file mode 100644 index 000000000..12d4207bd --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/trowprod/compare.py @@ -0,0 +1,49 @@ +#!/usr/bin/python3 +# Copyright (c) 2026 Huawei Technologies Co., Ltd. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. 
+ +# coding=utf-8 + +import os +import sys +import numpy as np + +from cases import CASES +from st_common import result_cmp, style_fail, style_pass, validate_cases + + +def main(): + validate_cases(CASES) + case_filter = sys.argv[1] if len(sys.argv) > 1 else None + + all_passed = True + for case in CASES: + if case_filter is not None and case["name"] != case_filter: + continue + + case_dir = case["name"] + vr, vc = case["valid_shape"] + out_shape = (vr,) + + golden = np.fromfile(os.path.join(case_dir, "golden.bin"), dtype=case["dtype"], count=np.prod(out_shape)).reshape(out_shape) + output = np.fromfile(os.path.join(case_dir, "output.bin"), dtype=case["dtype"], count=np.prod(out_shape)).reshape(out_shape) + + ok = result_cmp(golden, output, case["eps"]) + if ok: + print(style_pass(f"[INFO] {case['name']}: compare passed")) + else: + print(style_fail(f"[ERROR] {case['name']}: compare failed")) + all_passed = False + + if not all_passed: + sys.exit(2) + print(style_pass("[INFO] all cases passed")) + + +if __name__ == "__main__": + main() diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/trowprod/gen_data.py b/test/tilelang_st/npu/a5/src/st/smoke/testcase/trowprod/gen_data.py new file mode 100644 index 000000000..b1f6092af --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/trowprod/gen_data.py @@ -0,0 +1,42 @@ +#!/usr/bin/python3 +# Copyright (c) 2026 Huawei Technologies Co., Ltd. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. 
+# See LICENSE in the root of the software repository for the full text of the License. + +# coding=utf-8 + +import numpy as np +from cases import CASES +from st_common import validate_cases, setup_case_rng, save_case_data + +validate_cases(CASES) + +for case in CASES: + setup_case_rng(case) + + dtype = case["dtype"] + shape = case["shape"] + valid_shape = case["valid_shape"] + + if np.issubdtype(dtype, np.integer): + if dtype == np.int32: + input1 = np.random.randint(low=-3, high=4, size=shape).astype(dtype) + else: + input1 = np.random.randint(low=-2, high=3, size=shape).astype(dtype) + else: + input1 = np.random.uniform(low=0.9, high=1.1, size=shape).astype(dtype) + + out_shape = (valid_shape[0],) + golden = np.ones(out_shape, dtype=dtype) + vr, vc = valid_shape + for i in range(vr): + for j in range(vc): + golden[i] *= input1[i, j] + + golden = golden.astype(dtype, copy=False) + save_case_data(case["name"], {"input1": input1, "golden": golden}) + print(f"[INFO] gen_data: {case['name']} shape={shape} valid_shape={valid_shape} dtype={dtype.__name__}") diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/trowprod/launch.cpp b/test/tilelang_st/npu/a5/src/st/smoke/testcase/trowprod/launch.cpp new file mode 100644 index 000000000..70868d73b --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/trowprod/launch.cpp @@ -0,0 +1,26 @@ +// Copyright (c) 2026 Huawei Technologies Co., Ltd. +// This program is free software, you can redistribute it and/or modify it under the terms and conditions of +// CANN Open Software License Agreement Version 2.0 (the "License"). +// Please refer to the License for details. You may not use this file except in compliance with the License. +// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. 
+// See LICENSE in the root of the software repository for the full text of the License. + +#include + +#ifndef AICORE +#define AICORE [aicore] +#endif + +extern "C" __global__ AICORE void TROWPROD_f32_15x192(__gm__ float *src, __gm__ float *dst); +extern "C" __global__ AICORE void TROWPROD_i32_7x448_valid7x447(__gm__ int32_t *src, __gm__ int32_t *dst); + +void LaunchTROWPROD_i32_7x448_valid7x447(int32_t *src, int32_t *dst, void *stream) { + TROWPROD_i32_7x448_valid7x447<<<1, nullptr, stream>>>((__gm__ int32_t *)src, (__gm__ int32_t *)dst); +} + + + +void LaunchTROWPROD_f32_15x192(float *src, float *dst, void *stream) { + TROWPROD_f32_15x192<<<1, nullptr, stream>>>((__gm__ float *)src, (__gm__ float *)dst); +} diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/trowprod/main.cpp b/test/tilelang_st/npu/a5/src/st/smoke/testcase/trowprod/main.cpp new file mode 100644 index 000000000..396a576c3 --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/trowprod/main.cpp @@ -0,0 +1,158 @@ +// Copyright (c) 2026 Huawei Technologies Co., Ltd. +// This program is free software, you can redistribute it and/or modify it under the terms and conditions of +// CANN Open Software License Agreement Version 2.0 (the "License"). +// Please refer to the License for details. You may not use this file except in compliance with the License. +// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +// See LICENSE in the root of the software repository for the full text of the License. + +// Host driver for TileLang trowprod ST — case-table driven. 
+ +#include "acl/acl.h" +#include "test_common.h" +#include +#include +#include +#include +#include +#include + +using namespace PtoTestCommon; + +// Kernel launch wrappers (defined in launch.cpp) +void LaunchTROWPROD_f32_63x64(float *src, float *dst, void *stream); +void LaunchTROWPROD_f32_15x192(float *src, float *dst, void *stream); +void LaunchTROWPROD_f16_256x16_valid256x15(uint16_t *src, uint16_t *dst, void *stream); +void LaunchTROWPROD_f32_32x256(float *src, float *dst, void *stream); +void LaunchTROWPROD_f32_8x1024(float *src, float *dst, void *stream); +void LaunchTROWPROD_i32_63x64(int32_t *src, int32_t *dst, void *stream); +void LaunchTROWPROD_i32_15x192(int32_t *src, int32_t *dst, void *stream); +void LaunchTROWPROD_i32_7x448_valid7x447(int32_t *src, int32_t *dst, void *stream); +void LaunchTROWPROD_i16_256x16_valid256x15(int16_t *src, int16_t *dst, void *stream); +void LaunchTROWPROD_i16_31x128_valid31x127(int16_t *src, int16_t *dst, void *stream); + +using LaunchFnF32 = void (*)(float *, float *, void *); +using LaunchFnF16 = void (*)(uint16_t *, uint16_t *, void *); +using LaunchFnI32 = void (*)(int32_t *, int32_t *, void *); +using LaunchFnI16 = void (*)(int16_t *, int16_t *, void *); + +enum class DType { F32, F16, I32, I16 }; + +struct TestCase { + const char *name; + DType dtype; + union { + LaunchFnF32 launchF32; + LaunchFnF16 launchF16; + LaunchFnI32 launchI32; + LaunchFnI16 launchI16; + }; + size_t rows; // allocated tile rows + size_t cols; // allocated tile cols + size_t validRows; // effective computation rows (<= rows) + size_t validCols; // effective computation cols (<= cols) + size_t elemSize; // bytes per element +}; + +static const TestCase kCases[] = { + // f32 cases +{"f32_15x192", DType::F32, .launchF32 = LaunchTROWPROD_f32_15x192, 15, 192, 15, 192, 4}, +{"i32_7x448_valid7x447", DType::I32, .launchI32 = LaunchTROWPROD_i32_7x448_valid7x447, 7, 448, 7, 447, 4}, +}; +static constexpr size_t kNumCases = sizeof(kCases) / 
sizeof(kCases[0]); + +static int RunCase(const TestCase &tc, int deviceId, aclrtStream stream) { + int rc = 0; + const size_t srcElemCount = tc.rows * tc.cols; + const size_t srcFileSize = srcElemCount * tc.elemSize; + const size_t dstElemCount = tc.validRows * 1; + const size_t dstFileSize = dstElemCount * tc.elemSize; + + std::printf("[INFO] === case: %s (shape=%zux%zu, valid=%zux%zu) ===\n", + tc.name, tc.rows, tc.cols, tc.validRows, tc.validCols); + + // Per-case data directory + std::string caseDir = std::string("./") + tc.name; + size_t src0FileSize = srcFileSize; + + void *src0Host = nullptr, *dstHost = nullptr; + void *src0Device = nullptr, *dstDevice = nullptr; + + aclrtMallocHost(&src0Host, srcFileSize); + aclrtMallocHost(&dstHost, dstFileSize); + + aclrtMalloc(&src0Device, srcFileSize, ACL_MEM_MALLOC_HUGE_FIRST); + aclrtMalloc(&dstDevice, dstFileSize, ACL_MEM_MALLOC_HUGE_FIRST); + + if (!ReadFile((caseDir + "/input1.bin").c_str(), src0FileSize, src0Host, srcFileSize)) { + std::fprintf(stderr, "[ERROR] failed to read %s/input1.bin\n", caseDir.c_str()); + rc = 1; + } + + if (rc == 0) { + aclrtMemcpy(src0Device, srcFileSize, src0Host, srcFileSize, ACL_MEMCPY_HOST_TO_DEVICE); + + switch (tc.dtype) { + case DType::F32: tc.launchF32((float *)src0Device, (float *)dstDevice, stream); break; + case DType::F16: tc.launchF16((uint16_t *)src0Device, (uint16_t *)dstDevice, stream); break; + case DType::I32: tc.launchI32((int32_t *)src0Device, (int32_t *)dstDevice, stream); break; + case DType::I16: tc.launchI16((int16_t *)src0Device, (int16_t *)dstDevice, stream); break; + } + + aclrtSynchronizeStream(stream); + aclrtMemcpy(dstHost, dstFileSize, dstDevice, dstFileSize, ACL_MEMCPY_DEVICE_TO_HOST); + } + + if (rc == 0 && !WriteFile((caseDir + "/output.bin").c_str(), dstHost, dstFileSize)) { + std::fprintf(stderr, "[ERROR] failed to write %s/output.bin\n", caseDir.c_str()); + rc = 1; + } + + if (src0Device != nullptr) + aclrtFree(src0Device); + if (dstDevice != nullptr) 
+ aclrtFree(dstDevice); + if (src0Host != nullptr) + aclrtFreeHost(src0Host); + if (dstHost != nullptr) + aclrtFreeHost(dstHost); + + if (rc == 0) + std::printf("[INFO] case %s done\n", tc.name); + return rc; +} + +int main(int argc, char *argv[]) { + // Optional case filter: ./trowprod [case_name] + const char *caseFilter = (argc > 1) ? argv[1] : nullptr; + + int rc = 0; + int deviceId = 0; + aclrtStream stream = nullptr; + + aclInit(nullptr); + if (const char *envDevice = std::getenv("ACL_DEVICE_ID")) { + deviceId = std::atoi(envDevice); + } + aclrtSetDevice(deviceId); + aclrtCreateStream(&stream); + + for (size_t i = 0; i < kNumCases; ++i) { + if (caseFilter != nullptr && std::strcmp(kCases[i].name, caseFilter) != 0) { + continue; + } + int ret = RunCase(kCases[i], deviceId, stream); + if (ret != 0) { + std::fprintf(stderr, "[ERROR] case %s failed\n", kCases[i].name); + rc = 1; + break; + } + } + + if (stream != nullptr) + aclrtDestroyStream(stream); + aclrtResetDevice(deviceId); + aclFinalize(); + + return rc; +} diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/trowprod/trowprod.pto b/test/tilelang_st/npu/a5/src/st/smoke/testcase/trowprod/trowprod.pto new file mode 100644 index 000000000..50db2ccea --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/trowprod/trowprod.pto @@ -0,0 +1,112 @@ +// Copyright (c) 2026 Huawei Technologies Co., Ltd. +// This program is free software, you can redistribute it and/or modify it under the terms and conditions of +// CANN Open Software License Agreement Version 2.0 (the "License"). +// Please refer to the License for details. You may not use this file except in compliance with the License. +// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +// See LICENSE in the root of the software repository for the full text of the License. 
+ +// TileLang ST kernels for pto.trowprod: tload(src) + trowprod(src, tmp)->dst + tstore(dst). + +module attributes {pto.target_arch = "a5", pto.kernel_kind = #pto.kernel_kind} { + + // Smoke case: f32 15x192 (valid=15x192) + func.func @TROWPROD_f32_15x192(%src_ptr: !pto.ptr, %dst_ptr: !pto.ptr) attributes {pto.kernel} { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c8 = arith.constant 8 : index + %c15 = arith.constant 15 : index + %c192 = arith.constant 192 : index + %c2880 = arith.constant 2880 : index + + %src_view = pto.make_tensor_view %src_ptr, + shape = [%c1, %c1, %c1, %c15, %c192], + strides = [%c2880, %c2880, %c2880, %c192, %c1] + : !pto.tensor_view<1x1x1x15x192xf32> + %dst_view = pto.make_tensor_view %dst_ptr, + shape = [%c1, %c1, %c1, %c15, %c1], + strides = [%c15, %c15, %c15, %c1, %c1] + : !pto.tensor_view<1x1x1x15x1xf32> + + %src_part = pto.partition_view %src_view, + offsets = [%c0, %c0, %c0, %c0, %c0], + sizes = [%c1, %c1, %c1, %c15, %c192] + : !pto.tensor_view<1x1x1x15x192xf32> -> !pto.partition_tensor_view<1x1x1x15x192xf32> + %dst_part = pto.partition_view %dst_view, + offsets = [%c0, %c0, %c0, %c0, %c0], + sizes = [%c1, %c1, %c1, %c15, %c1] + : !pto.tensor_view<1x1x1x15x1xf32> -> !pto.partition_tensor_view<1x1x1x15x1xf32> + + %src = pto.alloc_tile + : !pto.tile_buf + %tmp = pto.alloc_tile + : !pto.tile_buf + %dst = pto.alloc_tile + : !pto.tile_buf + + pto.tload ins(%src_part : !pto.partition_tensor_view<1x1x1x15x192xf32>) + outs(%src : !pto.tile_buf) + + pto.trowprod ins(%src, %tmp : !pto.tile_buf, + !pto.tile_buf) + outs(%dst : !pto.tile_buf) + + pto.tstore ins(%dst : !pto.tile_buf) + outs(%dst_part : !pto.partition_tensor_view<1x1x1x15x1xf32>) + return + } + + // Smoke case: i32 7x448 (valid=7x447) + + func.func @TROWPROD_i32_7x448_valid7x447(%src_ptr: !pto.ptr, %dst_ptr: !pto.ptr) attributes {pto.kernel} { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c8 = arith.constant 8 : index + %c7 = arith.constant 7 
: index + %c447 = arith.constant 447 : index + %c448 = arith.constant 448 : index + %c3136 = arith.constant 3136 : index + + %src_view = pto.make_tensor_view %src_ptr, + shape = [%c1, %c1, %c1, %c7, %c448], + strides = [%c3136, %c3136, %c3136, %c448, %c1] + : !pto.tensor_view<1x1x1x7x448xi32> + %dst_view = pto.make_tensor_view %dst_ptr, + shape = [%c1, %c1, %c1, %c7, %c1], + strides = [%c7, %c7, %c7, %c1, %c1] + : !pto.tensor_view<1x1x1x7x1xi32> + + %src_part = pto.partition_view %src_view, + offsets = [%c0, %c0, %c0, %c0, %c0], + sizes = [%c1, %c1, %c1, %c7, %c447] + : !pto.tensor_view<1x1x1x7x448xi32> -> !pto.partition_tensor_view<1x1x1x7x447xi32> + %dst_part = pto.partition_view %dst_view, + offsets = [%c0, %c0, %c0, %c0, %c0], + sizes = [%c1, %c1, %c1, %c7, %c1] + : !pto.tensor_view<1x1x1x7x1xi32> -> !pto.partition_tensor_view<1x1x1x7x1xi32> + + %src = pto.alloc_tile + : !pto.tile_buf + %tmp = pto.alloc_tile + : !pto.tile_buf + %dst = pto.alloc_tile + : !pto.tile_buf + + pto.tload ins(%src_part : !pto.partition_tensor_view<1x1x1x7x447xi32>) + outs(%src : !pto.tile_buf) + + pto.trowprod ins(%src, %tmp : !pto.tile_buf, + !pto.tile_buf) + outs(%dst : !pto.tile_buf) + + pto.tstore ins(%dst : !pto.tile_buf) + outs(%dst_part : !pto.partition_tensor_view<1x1x1x7x1xi32>) + return + } + + // ======================================================================== + // int16 cases (case16-case18): no int16 kernels are defined in this module + // ======================================================================== + + // case16: i16 256x16 valid=256x15 (kernel not present in this file) +} diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/trowsum/CMakeLists.txt b/test/tilelang_st/npu/a5/src/st/smoke/testcase/trowsum/CMakeLists.txt new file mode 100644 index 000000000..bcb316bcc --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/trowsum/CMakeLists.txt @@ -0,0 +1,9 @@ +# Copyright (c) 2026 Huawei Technologies Co., Ltd. 
+# This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+# CANN Open Software License Agreement Version 2.0 (the "License").
+# Please refer to the License for details. You may not use this file except in compliance with the License.
+# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+# See LICENSE in the root of the software repository for the full text of the License.
+
+pto_tilelang_vec_st(trowsum)
diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/trowsum/cases.py b/test/tilelang_st/npu/a5/src/st/smoke/testcase/trowsum/cases.py
new file mode 100644
index 000000000..cabbf197e
--- /dev/null
+++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/trowsum/cases.py
@@ -0,0 +1,181 @@
+#!/usr/bin/python3
+# Copyright (c) 2026 Huawei Technologies Co., Ltd.
+# This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+# CANN Open Software License Agreement Version 2.0 (the "License").
+# Please refer to the License for details. You may not use this file except in compliance with the License.
+# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+# See LICENSE in the root of the software repository for the full text of the License.
+
+# coding=utf-8
+
+"""Single source of truth for trowsum ST test cases.
+
+The full table mirrors pto-isa tests/npu/a5/src/st/testcase/trowsum (20 cases)
+and adds one int16 overflow case. Only the names listed in ``_SMOKE_CASE_NAMES``
+are exported through ``CASES`` once this module has been imported.
+"""
+
+import numpy as np
+
+
+def _case(name, dtype, shape, valid_shape, eps, **extra):
+    """Build one case record; *extra* carries optional flags such as ``overflow``."""
+    return {"name": name, "dtype": dtype, "shape": shape, "valid_shape": valid_shape, "eps": eps, **extra}
+
+
+CASES = [
+    # floating-point cases (case1-case10 from pto-isa)
+    _case("f32_127x64_valid127x63", np.float32, (127, 64), (127, 63), 1e-3),
+    _case("f32_63x64", np.float32, (63, 64), (63, 64), 1e-3),
+    _case("f32_31x128_valid31x127", np.float32, (31, 128), (31, 127), 1e-3),
+    _case("f32_15x192", np.float32, (15, 192), (15, 192), 1e-3),
+    _case("f32_7x448_valid7x447", np.float32, (7, 448), (7, 447), 1e-3),
+    _case("f16_256x16_valid256x15", np.float16, (256, 16), (256, 15), 5e-3),
+    _case("f32_64x128", np.float32, (64, 128), (64, 128), 1e-3),
+    _case("f32_32x256", np.float32, (32, 256), (32, 256), 1e-3),
+    _case("f32_16x512", np.float32, (16, 512), (16, 512), 1e-3),
+    _case("f32_8x1024", np.float32, (8, 1024), (8, 1024), 1e-3),
+
+    # int32 cases (case11-case15 from pto-isa)
+    _case("i32_127x64_valid127x63", np.int32, (127, 64), (127, 63), 0),
+    _case("i32_63x64", np.int32, (63, 64), (63, 64), 0),
+    _case("i32_31x128_valid31x127", np.int32, (31, 128), (31, 127), 0),
+    _case("i32_15x192", np.int32, (15, 192), (15, 192), 0),
+    _case("i32_7x448_valid7x447", np.int32, (7, 448), (7, 447), 0),
+
+    # int16 cases (case16-case20 from pto-isa)
+    _case("i16_128x64", np.int16, (128, 64), (128, 64), 0),
+    _case("i16_64x64", np.int16, (64, 64), (64, 64), 0),
+    _case("i16_32x128", np.int16, (32, 128), (32, 128), 0),
+    _case("i16_16x192", np.int16, (16, 192), (16, 192), 0),
+    _case("i16_8x448", np.int16, (8, 448), (8, 448), 0),
+    # i16 overflow case to test vcvt NOSAT behavior
+    _case("i16_1x64_overflow", np.int16, (1, 64), (1, 64), 0, overflow=True),
+]
+
+# Smoke subset: fail loudly if a requested name is not in the full table, then
+# narrow CASES to the requested names while keeping the table's order.
+_SMOKE_CASE_NAMES = ['f32_15x192', 'i16_1x64_overflow']
+_SMOKE_CASE_NAME_SET = set(_SMOKE_CASE_NAMES)
+_defined_names = {entry["name"] for entry in CASES}
+_missing = [wanted for wanted in _SMOKE_CASE_NAMES if wanted not in _defined_names]
+if _missing:
+    raise RuntimeError("unknown smoke case(s): " + ", ".join(_missing))
+CASES = [entry for entry in CASES if entry["name"] in _SMOKE_CASE_NAME_SET]
diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/trowsum/compare.py b/test/tilelang_st/npu/a5/src/st/smoke/testcase/trowsum/compare.py
new file mode 100644
index 000000000..b80e2549b
--- /dev/null
+++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/trowsum/compare.py
@@ -0,0 +1,49 @@
+#!/usr/bin/python3
+# Copyright (c) 2026 Huawei Technologies Co., Ltd.
+# This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+# CANN Open Software License Agreement Version 2.0 (the "License").
+# Please refer to the License for details. You may not use this file except in compliance with the License.
+# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+# See LICENSE in the root of the software repository for the full text of the License.
+
+# coding=utf-8
+
+import os
+import sys
+import numpy as np
+
+from cases import CASES
+from st_common import result_cmp, style_fail, style_pass, validate_cases
+
+
+def main():
+    """Compare each selected case's output.bin against its golden.bin.
+
+    An optional first command-line argument restricts the run to the case with
+    that exact name. The process exits with status 2 if any comparison fails.
+    """
+    validate_cases(CASES)
+    wanted = sys.argv[1] if len(sys.argv) > 1 else None
+    selected = [entry for entry in CASES if wanted is None or entry["name"] == wanted]
+
+    failed = False
+    for entry in selected:
+        name = entry["name"]
+        valid_rows, _ = entry["valid_shape"]
+        # One reduced value per valid row; only that many elements are read from each file.
+        out_shape = (valid_rows, 1)
+
+        # golden.bin is read first, then output.bin, both from the per-case directory.
+        golden, output = (
+            np.fromfile(os.path.join(name, fname), dtype=entry["dtype"], count=np.prod(out_shape)).reshape(out_shape)
+            for fname in ("golden.bin", "output.bin")
+        )
+
+        if result_cmp(golden, output, entry["eps"]):
+            print(style_pass(f"[INFO] {name}: compare passed"))
+        else:
+            print(style_fail(f"[ERROR] {name}: compare failed"))
+            failed = True
+
+    if failed:
+        sys.exit(2)
+    print(style_pass("[INFO] all cases passed"))
+
+
+if __name__ == "__main__":
+    main()
diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/trowsum/gen_data.py b/test/tilelang_st/npu/a5/src/st/smoke/testcase/trowsum/gen_data.py
new file mode 100644
index 000000000..0a7041c34
--- /dev/null
+++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/trowsum/gen_data.py
@@ -0,0 +1,50 @@
+#!/usr/bin/python3
+# Copyright (c) 2026 Huawei Technologies Co., Ltd.
+# This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+# CANN Open Software License Agreement Version 2.0 (the "License").
+# Please refer to the License for details.
You may not use this file except in compliance with the License.
+# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+# See LICENSE in the root of the software repository for the full text of the License.
+
+# coding=utf-8
+
+"""Generate the input tile and golden row sums for every trowsum case.
+
+For each case the input has the full allocated ``shape``; the golden vector has
+one entry per allocated row, with rows beyond the valid region left at zero.
+"""
+
+import os
+import numpy as np
+from cases import CASES
+from st_common import validate_cases, save_case_data
+
+validate_cases(CASES)
+
+# Fixed seed: one random draw per case, in CASES order, so data is reproducible.
+np.random.seed(42)
+
+for case in CASES:
+    dtype = case["dtype"]
+    row, col = case["shape"][0], case["shape"][1]
+    valid_row, valid_col = case["valid_shape"][0], case["valid_shape"][1]
+    is_integer = np.issubdtype(dtype, np.integer)
+
+    if is_integer:
+        if dtype == np.int32:
+            input_arr = np.random.randint(low=-100, high=100, size=(row, col)).astype(dtype)
+        elif dtype == np.int16:
+            if case.get("overflow"):
+                # Generate values that cause overflow when summed to test NOSAT behavior
+                # 1000 * 64 = 64000 > 32767, wraps to -1536 in int16
+                input_arr = np.full((row, col), 1000, dtype=dtype)
+            else:
+                input_arr = np.random.randint(low=-50, high=50, size=(row, col)).astype(dtype)
+        else:
+            input_arr = np.random.randint(low=-10, high=10, size=(row, col)).astype(dtype)
+    else:
+        input_arr = np.random.uniform(low=-1, high=1, size=(row, col)).astype(dtype)
+
+    # Reduce the valid region in one vectorized call instead of a Python-level
+    # double loop. Accumulation happens in a wide type (int64 / float64); the final
+    # cast back to the case dtype wraps integer sums, which the overflow case relies on.
+    acc_dtype = np.int64 if is_integer else np.float64
+    output_arr = np.zeros((row,), dtype=acc_dtype)
+    output_arr[:valid_row] = input_arr[:valid_row, :valid_col].sum(axis=1, dtype=acc_dtype)
+    output_arr = output_arr.astype(dtype)
+
+    save_case_data(case["name"], {"input": input_arr, "golden": output_arr})
+    print(f"[INFO] gen_data: {case['name']} shape=({row},{col}) valid=({valid_row},{valid_col}) dtype={dtype.__name__}")
diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/trowsum/launch.cpp b/test/tilelang_st/npu/a5/src/st/smoke/testcase/trowsum/launch.cpp
new file mode 100644
index 000000000..574ec1c78
--- /dev/null
+++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/trowsum/launch.cpp
@@ -0,0 +1,29 @@
+// Copyright (c) 2026 Huawei Technologies Co., Ltd.
+// This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+// CANN Open Software License Agreement Version 2.0 (the "License").
+// Please refer to the License for details. You may not use this file except in compliance with the License.
+// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+// See LICENSE in the root of the software repository for the full text of the License.
+
+// NOTE(review): the include target below is missing in this copy of the patch; the code
+// uses int16_t, so presumably a fixed-width integer header — confirm against the repository.
+#include
+
+// Fallback definition of the AICORE function attribute when the toolchain did not provide one.
+#ifndef AICORE
+#define AICORE [aicore]
+#endif
+
+// ========================================================================
+// Smoke kernels (one f32 case, one i16 overflow case)
+// ========================================================================
+
+// Device entry points; their definitions are not in this file.
+extern "C" __global__ AICORE void TROWSUM_f32_15x192(__gm__ float *src, __gm__ float *dst);
+extern "C" __global__ AICORE void TROWSUM_i16_1x64_overflow(__gm__ int16_t *src, __gm__ int16_t *dst);
+
+// Host wrapper: launches TROWSUM_f32_15x192 with a launch dimension of 1 on `stream`.
+void LaunchTROWSUM_f32_15x192(float *src, float *dst, void *stream) {
+    TROWSUM_f32_15x192<<<1, nullptr, stream>>>((__gm__ float *)src, (__gm__ float *)dst);
+}
+
+
+// Host wrapper: launches TROWSUM_i16_1x64_overflow with a launch dimension of 1 on `stream`.
+void LaunchTROWSUM_i16_1x64_overflow(int16_t *src, int16_t *dst, void *stream) {
+    TROWSUM_i16_1x64_overflow<<<1, nullptr, stream>>>((__gm__ int16_t *)src, (__gm__ int16_t *)dst);
+}
diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/trowsum/main.cpp b/test/tilelang_st/npu/a5/src/st/smoke/testcase/trowsum/main.cpp
new file mode 100644
index 000000000..9a69947f4
--- /dev/null
+++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/trowsum/main.cpp
@@ -0,0 +1,159 @@
+// Copyright (c) 2026 Huawei Technologies Co., Ltd.
+// This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+// CANN Open Software License Agreement Version 2.0 (the "License").
+// Please refer to the License for details. You may not use this file except in compliance with the License.
+// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+// See LICENSE in the root of the software repository for the full text of the License.
+
+// Host driver for TileLang trowsum ST — case-table driven.
+
+#include "acl/acl.h"
+#include "test_common.h"
+// NOTE(review): the six include targets below are missing in this copy of the patch —
+// restore them from the repository before applying.
+#include
+#include
+#include
+#include
+#include
+#include
+
+using namespace PtoTestCommon;
+
+// Kernel launch wrappers (defined in launch.cpp)
+// NOTE(review): the smoke launch.cpp in this patch defines only the f32_15x192 and
+// i16_1x64_overflow wrappers; the other declarations are not referenced by kCases below.
+void LaunchTROWSUM_f32_63x64(float *src, float *dst, void *stream);
+void LaunchTROWSUM_f32_15x192(float *src, float *dst, void *stream);
+void LaunchTROWSUM_f16_256x16_valid256x15(uint16_t *src, uint16_t *dst, void *stream);
+void LaunchTROWSUM_f32_32x256(float *src, float *dst, void *stream);
+void LaunchTROWSUM_f32_8x1024(float *src, float *dst, void *stream);
+void LaunchTROWSUM_i32_63x64(int32_t *src, int32_t *dst, void *stream);
+void LaunchTROWSUM_i32_15x192(int32_t *src, int32_t *dst, void *stream);
+void LaunchTROWSUM_i16_128x64(int16_t *src, int16_t *dst, void *stream);
+void LaunchTROWSUM_i16_32x128(int16_t *src, int16_t *dst, void *stream);
+void LaunchTROWSUM_i16_8x448(int16_t *src, int16_t *dst, void *stream);
+void LaunchTROWSUM_i16_1x64_overflow(int16_t *src, int16_t *dst, void *stream);
+
+// One function-pointer type per element type; f16 buffers are passed as uint16_t.
+using LaunchFnF32 = void (*)(float *, float *, void *);
+using LaunchFnF16 = void (*)(uint16_t *, uint16_t *, void *);
+using LaunchFnI32 = void (*)(int32_t *, int32_t *, void *);
+using LaunchFnI16 = void (*)(int16_t *, int16_t *, void *);
+
+// Tag selecting which member of the launch union in TestCase is active.
+enum class DType { F32, F16, I32, I16 };
+
+// One row of the case table: case name, element type, launch wrapper and tile geometry.
+struct TestCase {
+    const char *name;
+    DType dtype;
+    // Exactly one member is meaningful, chosen by `dtype`.
+    union {
+        LaunchFnF32 launchF32;
+        LaunchFnF16 launchF16;
+        LaunchFnI32 launchI32;
+        LaunchFnI16 launchI16;
+    };
+    size_t rows; // allocated tile rows
+    size_t cols; // allocated tile cols
+    size_t validRows; // effective computation rows (<= rows)
+    size_t validCols; // effective computation cols (<= cols)
+    size_t elemSize; // bytes per element
+};
+
+static const TestCase kCases[] = {
+    // Smoke subset: one f32 case and the i16 overflow case.
+{"f32_15x192", DType::F32, .launchF32 = LaunchTROWSUM_f32_15x192, 15, 192, 15, 192, 4},
+{"i16_1x64_overflow", DType::I16, .launchI16 = LaunchTROWSUM_i16_1x64_overflow, 1, 64, 1, 64, 2},
+};
+static constexpr size_t kNumCases = sizeof(kCases) / sizeof(kCases[0]);
+
+// Runs a single case: reads ./<name>/input.bin, copies it to the device, launches the
+// kernel selected by tc.dtype, copies the result back and writes ./<name>/output.bin.
+// Returns 0 on success and 1 if the input could not be read or the output not written.
+// Return values of the ACL calls are not checked. `deviceId` is accepted but unused here.
+static int RunCase(const TestCase &tc, int deviceId, aclrtStream stream) {
+    int rc = 0;
+    const size_t srcElemCount = tc.rows * tc.cols;
+    const size_t srcFileSize = srcElemCount * tc.elemSize;
+    // The destination holds one element per valid row.
+    const size_t dstElemCount = tc.validRows * 1;
+    const size_t dstFileSize = dstElemCount * tc.elemSize;
+
+    std::printf("[INFO] === case: %s (shape=%zux%zu, valid=%zux%zu) ===\n",
+                tc.name, tc.rows, tc.cols, tc.validRows, tc.validCols);
+
+    // Per-case data directory
+    std::string caseDir = std::string("./") + tc.name;
+    size_t src0FileSize = srcFileSize;
+
+    void *src0Host = nullptr, *dstHost = nullptr;
+    void *src0Device = nullptr, *dstDevice = nullptr;
+
+    aclrtMallocHost(&src0Host, srcFileSize);
+    aclrtMallocHost(&dstHost, dstFileSize);
+
+    aclrtMalloc(&src0Device, srcFileSize, ACL_MEM_MALLOC_HUGE_FIRST);
+    aclrtMalloc(&dstDevice, dstFileSize, ACL_MEM_MALLOC_HUGE_FIRST);
+
+    if (!ReadFile((caseDir + "/input.bin").c_str(), src0FileSize, src0Host, srcFileSize)) {
+        std::fprintf(stderr, "[ERROR] failed to read %s/input.bin\n", caseDir.c_str());
+        rc = 1;
+    }
+
+    if (rc == 0) {
+        aclrtMemcpy(src0Device, srcFileSize, src0Host, srcFileSize, ACL_MEMCPY_HOST_TO_DEVICE);
+
+        // Dispatch through the union member that matches the case's element type.
+        switch (tc.dtype) {
+        case DType::F32: tc.launchF32((float *)src0Device, (float *)dstDevice, stream); break;
+        case DType::F16: tc.launchF16((uint16_t *)src0Device, (uint16_t *)dstDevice, stream); break;
+        case DType::I32: tc.launchI32((int32_t *)src0Device, (int32_t *)dstDevice, stream); break;
+        case DType::I16: tc.launchI16((int16_t *)src0Device, (int16_t *)dstDevice, stream); break;
+        }
+
+        aclrtSynchronizeStream(stream);
+        aclrtMemcpy(dstHost, dstFileSize, dstDevice, dstFileSize, ACL_MEMCPY_DEVICE_TO_HOST);
+    }
+
+    if (rc == 0 && !WriteFile((caseDir + "/output.bin").c_str(), dstHost, dstFileSize)) {
+        std::fprintf(stderr, "[ERROR] failed to write %s/output.bin\n", caseDir.c_str());
+        rc = 1;
+    }
+
+    // Cleanup runs on both the success and the failure path.
+    if (src0Device != nullptr)
+        aclrtFree(src0Device);
+    if (dstDevice != nullptr)
+        aclrtFree(dstDevice);
+    if (src0Host != nullptr)
+        aclrtFreeHost(src0Host);
+    if (dstHost != nullptr)
+        aclrtFreeHost(dstHost);
+
+    if (rc == 0)
+        std::printf("[INFO] case %s done\n", tc.name);
+    return rc;
+}
+
+// Entry point: initializes ACL, runs every case (or only the one named by argv[1]),
+// stops at the first failing case, and returns 0 on success or 1 on failure.
+// A filter that matches no case runs nothing and still returns 0.
+int main(int argc, char *argv[]) {
+    // Optional case filter: ./trowsum [case_name]
+    const char *caseFilter = (argc > 1) ? argv[1] : nullptr;
+
+    int rc = 0;
+    int deviceId = 0;
+    aclrtStream stream = nullptr;
+
+    aclInit(nullptr);
+    // ACL_DEVICE_ID, when set, overrides the default device 0.
+    if (const char *envDevice = std::getenv("ACL_DEVICE_ID")) {
+        deviceId = std::atoi(envDevice);
+    }
+    aclrtSetDevice(deviceId);
+    aclrtCreateStream(&stream);
+
+    for (size_t i = 0; i < kNumCases; ++i) {
+        if (caseFilter != nullptr && std::strcmp(kCases[i].name, caseFilter) != 0) {
+            continue;
+        }
+        int ret = RunCase(kCases[i], deviceId, stream);
+        if (ret != 0) {
+            std::fprintf(stderr, "[ERROR] case %s failed\n", kCases[i].name);
+            rc = 1;
+            break;
+        }
+    }
+
+    if (stream != nullptr)
+        aclrtDestroyStream(stream);
+    aclrtResetDevice(deviceId);
+    aclFinalize();
+
+    return rc;
+}
diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/trowsum/trowsum.pto b/test/tilelang_st/npu/a5/src/st/smoke/testcase/trowsum/trowsum.pto
new file mode 100644
index 000000000..2eacbd9df
--- /dev/null
+++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/trowsum/trowsum.pto
@@ -0,0 +1,114 @@
+// Copyright (c) 2026 Huawei Technologies Co., Ltd.
+// This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+// CANN Open Software License Agreement Version 2.0 (the "License").
+// Please refer to the License for details. You may not use this file except in compliance with the License.
+// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+// See LICENSE in the root of the software repository for the full text of the License.
+
+// TileLang ST kernels for pto.trowsum: tload(src) + trowsum(src, tmp)->dst + tstore(dst).
+// Aligned with pto-isa tests/npu/a5/src/st/testcase/trowsum (20 cases) + 1 overflow case.
+
+// NOTE(review): parameterized types in this copy of the patch (e.g. !pto.ptr, !pto.tile_buf,
+// #pto.kernel_kind) appear without their parameters — restore them from the repository.
+module attributes {pto.target_arch = "a5", pto.kernel_kind = #pto.kernel_kind} {
+
+  // ========================================================================
+  // f32 cases
+  // ========================================================================
+
+  // f32_15x192: loads a 15x192 tile, reduces each row, stores a 15x1 result.
+  func.func @TROWSUM_f32_15x192(%src_ptr: !pto.ptr, %dst_ptr: !pto.ptr) attributes {pto.kernel} {
+    %c0 = arith.constant 0 : index
+    %c1 = arith.constant 1 : index
+    %c15 = arith.constant 15 : index
+    %c192 = arith.constant 192 : index
+    // 2880 = 15 * 192, used as the stride of the three unit-sized outer dimensions.
+    %c2880 = arith.constant 2880 : index
+
+    %src_view = pto.make_tensor_view %src_ptr,
+      shape = [%c1, %c1, %c1, %c15, %c192],
+      strides = [%c2880, %c2880, %c2880, %c192, %c1]
+      : !pto.tensor_view<1x1x1x15x192xf32>
+    %dst_view = pto.make_tensor_view %dst_ptr,
+      shape = [%c1, %c1, %c1, %c15, %c1],
+      strides = [%c15, %c15, %c15, %c1, %c1]
+      : !pto.tensor_view<1x1x1x15x1xf32>
+
+    // Both partitions start at offset 0 and cover the whole view.
+    %src_part = pto.partition_view %src_view,
+      offsets = [%c0, %c0, %c0, %c0, %c0],
+      sizes = [%c1, %c1, %c1, %c15, %c192]
+      : !pto.tensor_view<1x1x1x15x192xf32> -> !pto.partition_tensor_view<1x1x1x15x192xf32>
+    %dst_part = pto.partition_view %dst_view,
+      offsets = [%c0, %c0, %c0, %c0, %c0],
+      sizes = [%c1, %c1, %c1, %c15, %c1]
+      : !pto.tensor_view<1x1x1x15x1xf32> -> !pto.partition_tensor_view<1x1x1x15x1xf32>
+
+    // Three tiles: the loaded source, a scratch tile passed to trowsum, and the result.
+    %src = pto.alloc_tile
+      : !pto.tile_buf
+    %tmp = pto.alloc_tile
+      : !pto.tile_buf
+    %dst = pto.alloc_tile
+      : !pto.tile_buf
+
+    pto.tload ins(%src_part : !pto.partition_tensor_view<1x1x1x15x192xf32>)
+              outs(%src : !pto.tile_buf)
+
+    pto.trowsum ins(%src, %tmp : !pto.tile_buf,
+                    !pto.tile_buf)
+                outs(%dst : !pto.tile_buf)
+
+    pto.tstore ins(%dst : !pto.tile_buf)
+               outs(%dst_part : !pto.partition_tensor_view<1x1x1x15x1xf32>)
+    return
+  }
+
+  // i16_1x64_overflow: a single 64-element i16 row reduced to one i16 value.
+
+  func.func @TROWSUM_i16_1x64_overflow(%src_ptr: !pto.ptr, %dst_ptr: !pto.ptr) attributes { pto.entry , pto.kernel} {
+    %c0 = arith.constant 0 : index
+    %c1 = arith.constant 1 : index
+    %c64 = arith.constant 64 : index
+
+    %src_view = pto.make_tensor_view %src_ptr,
+      shape = [%c1, %c1, %c1, %c1, %c64],
+      strides = [%c64, %c64, %c64, %c64, %c1]
+      : !pto.tensor_view<1x1x1x1x64xi16>
+    %dst_view = pto.make_tensor_view %dst_ptr,
+      shape = [%c1, %c1, %c1, %c1, %c1],
+      strides = [%c1, %c1, %c1, %c1, %c1]
+      : !pto.tensor_view<1x1x1x1x1xi16>
+
+    %src_part = pto.partition_view %src_view,
+      offsets = [%c0, %c0, %c0, %c0, %c0],
+      sizes = [%c1, %c1, %c1, %c1, %c64]
+      : !pto.tensor_view<1x1x1x1x64xi16> -> !pto.partition_tensor_view<1x1x1x1x64xi16>
+    %dst_part = pto.partition_view %dst_view,
+      offsets = [%c0, %c0, %c0, %c0, %c0],
+      sizes = [%c1, %c1, %c1, %c1, %c1]
+      : !pto.tensor_view<1x1x1x1x1xi16> -> !pto.partition_tensor_view<1x1x1x1x1xi16>
+
+    %src = pto.alloc_tile
+      : !pto.tile_buf
+    %tmp = pto.alloc_tile
+      : !pto.tile_buf
+    %dst = pto.alloc_tile
+      : !pto.tile_buf
+
+    pto.tload ins(%src_part : !pto.partition_tensor_view<1x1x1x1x64xi16>)
+              outs(%src : !pto.tile_buf)
+
+    pto.trowsum ins(%src, %tmp : !pto.tile_buf,
+                    !pto.tile_buf)
+                outs(%dst : !pto.tile_buf)
+
+    pto.tstore ins(%dst : !pto.tile_buf)
+               outs(%dst_part : !pto.partition_tensor_view<1x1x1x1x1xi16>)
+    return
+  }
+}
diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/trsqrt/CMakeLists.txt b/test/tilelang_st/npu/a5/src/st/smoke/testcase/trsqrt/CMakeLists.txt
new file mode 100644
index 000000000..7209977f8
--- /dev/null
+++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/trsqrt/CMakeLists.txt
@@ -0,0 +1,9 @@
+# Copyright (c) 2026 Huawei Technologies Co., Ltd.
+# This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+# CANN Open Software License Agreement Version 2.0 (the "License").
+# Please refer to the License for details. You may not use this file except in compliance with the License.
+# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+# See LICENSE in the root of the software repository for the full text of the License.
+
+pto_tilelang_vec_st(trsqrt)
diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/trsqrt/cases.py b/test/tilelang_st/npu/a5/src/st/smoke/testcase/trsqrt/cases.py
new file mode 100644
index 000000000..ce3fcf75b
--- /dev/null
+++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/trsqrt/cases.py
@@ -0,0 +1,62 @@
+#!/usr/bin/python3
+# Copyright (c) 2026 Huawei Technologies Co., Ltd.
+# This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+# CANN Open Software License Agreement Version 2.0 (the "License").
+# Please refer to the License for details. You may not use this file except in compliance with the License.
+# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+# See LICENSE in the root of the software repository for the full text of the License.
+
+# coding=utf-8
+
+"""Single source of truth for trsqrt ST test cases.
+
+Every case record carries:
+  - name: case identifier, used as subdirectory name and by main.cpp kCases[].
+  - dtype: numpy dtype (e.g. np.float32).
+  - shape: (rows, cols) — allocated tile dimensions.
+  - valid_shape: (valid_rows, valid_cols) — effective computation region.
+  - eps: tolerance for numpy.allclose (atol and rtol).
+
+gen_data.py and compare.py both import this list, so the cases are defined once.
+Only the names in ``_SMOKE_CASE_NAMES`` remain in ``CASES`` after import.
+"""
+
+import numpy as np
+
+
+def _case(name, dtype, shape, valid_shape, eps):
+    """Build one case record with the keys described in the module docstring."""
+    return {"name": name, "dtype": dtype, "shape": shape, "valid_shape": valid_shape, "eps": eps}
+
+
+CASES = [
+    _case("f32_16x64", np.float32, (16, 64), (16, 64), 1e-6),
+    _case("f32_32x32", np.float32, (32, 32), (32, 32), 1e-6),
+    _case("f16_16x64", np.float16, (16, 64), (16, 64), 1e-3),
+    _case("f16_32x32", np.float16, (32, 32), (32, 32), 1e-3),
+]
+
+# Smoke subset: reject unknown names, then keep only the requested cases in table order.
+_SMOKE_CASE_NAMES = ['f32_16x64', 'f16_16x64']
+_SMOKE_CASE_NAME_SET = set(_SMOKE_CASE_NAMES)
+_defined_names = {entry["name"] for entry in CASES}
+_missing = [wanted for wanted in _SMOKE_CASE_NAMES if wanted not in _defined_names]
+if _missing:
+    raise RuntimeError("unknown smoke case(s): " + ", ".join(_missing))
+CASES = [entry for entry in CASES if entry["name"] in _SMOKE_CASE_NAME_SET]
diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/trsqrt/compare.py b/test/tilelang_st/npu/a5/src/st/smoke/testcase/trsqrt/compare.py
new file mode 100644
index 000000000..428604929
--- /dev/null
+++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/trsqrt/compare.py
@@ -0,0 +1,49 @@
+#!/usr/bin/python3
+# Copyright (c) 2026 Huawei Technologies Co., Ltd.
+# This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+# CANN Open Software License Agreement Version 2.0 (the "License").
+# Please refer to the License for details. You may not use this file except in compliance with the License.
+# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+# See LICENSE in the root of the software repository for the full text of the License.
+
+# coding=utf-8
+
+import os
+import sys
+import numpy as np
+
+from cases import CASES
+from st_common import result_cmp, style_fail, style_pass, validate_cases
+
+
+def main():
+    """Compare the valid region of each selected case's output.bin with golden.bin.
+
+    An optional first command-line argument restricts the run to the case with
+    that exact name. The process exits with status 2 if any comparison fails.
+    """
+    validate_cases(CASES)
+    wanted = sys.argv[1] if len(sys.argv) > 1 else None
+    selected = [entry for entry in CASES if wanted is None or entry["name"] == wanted]
+
+    failed = False
+    for entry in selected:
+        name = entry["name"]
+        full_shape = entry["shape"]
+        valid_rows, valid_cols = entry["valid_shape"]
+
+        # Both files hold the full allocated tile; golden.bin is read first, then output.bin.
+        golden, output = (
+            np.fromfile(os.path.join(name, fname), dtype=entry["dtype"]).reshape(full_shape)
+            for fname in ("golden.bin", "output.bin")
+        )
+
+        # Only the valid region takes part in the comparison.
+        region = (slice(None, valid_rows), slice(None, valid_cols))
+        if result_cmp(golden[region], output[region], entry["eps"]):
+            print(style_pass(f"[INFO] {name}: compare passed"))
+        else:
+            print(style_fail(f"[ERROR] {name}: compare failed"))
+            failed = True
+
+    if failed:
+        sys.exit(2)
+    print(style_pass("[INFO] all cases passed"))
+
+
+if __name__ == "__main__":
+    main()
diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/trsqrt/gen_data.py b/test/tilelang_st/npu/a5/src/st/smoke/testcase/trsqrt/gen_data.py
new file mode 100644
index 000000000..9ca63c976
--- /dev/null
+++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/trsqrt/gen_data.py
@@ -0,0 +1,34 @@
+#!/usr/bin/python3
+# Copyright (c) 2026 Huawei Technologies Co., Ltd.
+# This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+# CANN Open Software License Agreement Version 2.0 (the "License").
+# Please refer to the License for details. You may not use this file except in compliance with the License.
+# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+# See LICENSE in the root of the software repository for the full text of the License.
+
+# coding=utf-8
+
+"""Generate the input tile and golden rsqrt tile for every trsqrt case.
+
+Both arrays have the full allocated ``shape``; golden values outside the valid
+region are left at zero.
+"""
+
+import numpy as np
+from cases import CASES
+from st_common import validate_cases, setup_case_rng, save_case_data
+
+validate_cases(CASES)
+
+for case in CASES:
+    setup_case_rng(case)
+
+    dtype = case["dtype"]
+    shape = case["shape"]
+    valid_shape = case["valid_shape"]
+
+    # Strictly positive inputs keep 1/sqrt(x) finite. The array is named input_arr
+    # so that it does not shadow the builtin input().
+    input_arr = np.random.uniform(0.1, 100.0, size=shape).astype(dtype)
+
+    # rsqrt = 1 / sqrt(x), computed only inside the valid region.
+    golden = np.zeros(shape, dtype=dtype)
+    vr, vc = valid_shape
+    golden[:vr, :vc] = np.reciprocal(np.sqrt(input_arr[:vr, :vc])).astype(dtype, copy=False)
+
+    # The "input" key is kept unchanged; only the local variable was renamed.
+    save_case_data(case["name"], {"input": input_arr, "golden": golden})
+    print(f"[INFO] gen_data: {case['name']} shape={shape} valid_shape={valid_shape} dtype={dtype.__name__}")
diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/trsqrt/launch.cpp b/test/tilelang_st/npu/a5/src/st/smoke/testcase/trsqrt/launch.cpp
new file mode 100644
index 000000000..750e96ae7
--- /dev/null
+++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/trsqrt/launch.cpp
@@ -0,0 +1,28 @@
+// Copyright (c) 2026 Huawei Technologies Co., Ltd.
+// This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+// CANN Open Software License Agreement Version 2.0 (the "License").
+// Please refer to the License for details. You may not use this file except in compliance with the License.
+// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+// See LICENSE in the root of the software repository for the full text of the License.
+
+// NOTE(review): the include target below is missing in this copy of the patch; the code
+// uses uint16_t, so presumably a fixed-width integer header — confirm against the repository.
+#include
+
+// Fallback definition of the AICORE function attribute when the toolchain did not provide one.
+#ifndef AICORE
+#define AICORE [aicore]
+#endif
+
+// Smoke kernels: f32 16x64 and f16 16x64. Their definitions are not in this file.
+
+extern "C" __global__ AICORE void TRSQRT_f32_16x64(__gm__ float *a, __gm__ float *b);
+extern "C" __global__ AICORE void TRSQRT_f16_16x64(__gm__ uint16_t *a, __gm__ uint16_t *b);
+
+// Host wrapper: casts the untyped buffers to uint16_t (f16 storage) and launches the f16 kernel.
+void LaunchTRSQRT_f16_16x64(void *a, void *b, void *stream) {
+    TRSQRT_f16_16x64<<<1, nullptr, stream>>>((__gm__ uint16_t *)a, (__gm__ uint16_t *)b);
+}
+
+
+
+// Host wrapper: casts the untyped buffers to float and launches the f32 kernel.
+void LaunchTRSQRT_f32_16x64(void *a, void *b, void *stream) {
+    TRSQRT_f32_16x64<<<1, nullptr, stream>>>((__gm__ float *)a, (__gm__ float *)b);
+}
diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/trsqrt/main.cpp b/test/tilelang_st/npu/a5/src/st/smoke/testcase/trsqrt/main.cpp
new file mode 100644
index 000000000..b2b37c886
--- /dev/null
+++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/trsqrt/main.cpp
@@ -0,0 +1,135 @@
+// Copyright (c) 2026 Huawei Technologies Co., Ltd.
+// This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+// CANN Open Software License Agreement Version 2.0 (the "License").
+// Please refer to the License for details. You may not use this file except in compliance with the License.
+// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+// See LICENSE in the root of the software repository for the full text of the License.
+
+// Host driver for TileLang trsqrt ST — case-table driven.
+// Each case launches a different kernel variant, reads/writes from per-case subdirectory.
+// Numerical comparison is done externally by compare.py.
+
+#include "acl/acl.h"
+#include "test_common.h"
+// NOTE(review): the six include targets below are missing in this copy of the patch —
+// restore them from the repository before applying.
+#include
+#include
+#include
+#include
+#include
+#include
+
+using namespace PtoTestCommon;
+
+// Kernel launch wrappers (defined in launch.cpp)
+// NOTE(review): the smoke launch.cpp in this patch defines only the two 16x64 wrappers;
+// the 32x32 declarations are not referenced by kCases below.
+void LaunchTRSQRT_f32_16x64(void *a, void *b, void *stream);
+void LaunchTRSQRT_f32_32x32(void *a, void *b, void *stream);
+void LaunchTRSQRT_f16_16x64(void *a, void *b, void *stream);
+void LaunchTRSQRT_f16_32x32(void *a, void *b, void *stream);
+
+// All wrappers share one untyped signature: (src, dst, stream).
+using LaunchFn = void (*)(void *, void *, void *);
+
+// One row of the case table: case name, launch wrapper and tile geometry.
+struct TestCase {
+    const char *name;
+    LaunchFn launch;
+    size_t rows; // allocated tile rows
+    size_t cols; // allocated tile cols
+    size_t validRows; // effective computation rows (<= rows)
+    size_t validCols; // effective computation cols (<= cols)
+    size_t elemSize; // bytes per element
+};
+
+// Smoke subset: the f32 and f16 16x64 cases.
+static const TestCase kCases[] = {
+{"f32_16x64", LaunchTRSQRT_f32_16x64, 16, 64, 16, 64, sizeof(float)},
+{"f16_16x64", LaunchTRSQRT_f16_16x64, 16, 64, 16, 64, sizeof(uint16_t)},
+};
+static constexpr size_t kNumCases = sizeof(kCases) / sizeof(kCases[0]);
+
+// Runs a single case: reads ./<name>/input.bin, copies it to the device, launches the
+// case's kernel, copies the full tile back and writes ./<name>/output.bin.
+// Returns 0 on success and 1 if the input could not be read or the output not written.
+// Return values of the ACL calls are not checked. `deviceId` is accepted but unused here.
+static int RunCase(const TestCase &tc, int deviceId, aclrtStream stream) {
+    int rc = 0;
+    // Source and destination both use the full allocated tile size.
+    const size_t elemCount = tc.rows * tc.cols;
+    const size_t fileSize = elemCount * tc.elemSize;
+
+    std::printf("[INFO] === case: %s (shape=%zux%zu, valid=%zux%zu) ===\n",
+                tc.name, tc.rows, tc.cols, tc.validRows, tc.validCols);
+
+    // Per-case data directory
+    std::string caseDir = std::string("./") + tc.name;
+    size_t srcFileSize = fileSize;
+
+    void *srcHost = nullptr, *dstHost = nullptr;
+    void *srcDevice = nullptr, *dstDevice = nullptr;
+
+    aclrtMallocHost((void **)(&srcHost), fileSize);
+    aclrtMallocHost((void **)(&dstHost), fileSize);
+
+    aclrtMalloc((void **)&srcDevice, fileSize, ACL_MEM_MALLOC_HUGE_FIRST);
+    aclrtMalloc((void **)&dstDevice, fileSize, ACL_MEM_MALLOC_HUGE_FIRST);
+
+    if (!ReadFile((caseDir + "/input.bin").c_str(), srcFileSize, srcHost, fileSize)) {
+        std::fprintf(stderr, "[ERROR] failed to read %s/input.bin\n", caseDir.c_str());
+        rc = 1;
+    }
+
+    if (rc == 0) {
+        aclrtMemcpy(srcDevice, fileSize, srcHost, fileSize, ACL_MEMCPY_HOST_TO_DEVICE);
+
+        tc.launch(srcDevice, dstDevice, stream);
+
+        aclrtSynchronizeStream(stream);
+        aclrtMemcpy(dstHost, fileSize, dstDevice, fileSize, ACL_MEMCPY_DEVICE_TO_HOST);
+    }
+
+    if (rc == 0 && !WriteFile((caseDir + "/output.bin").c_str(), dstHost, fileSize)) {
+        std::fprintf(stderr, "[ERROR] failed to write %s/output.bin\n", caseDir.c_str());
+        rc = 1;
+    }
+
+    // Cleanup runs on both the success and the failure path.
+    if (srcDevice != nullptr)
+        aclrtFree(srcDevice);
+    if (dstDevice != nullptr)
+        aclrtFree(dstDevice);
+    if (srcHost != nullptr)
+        aclrtFreeHost(srcHost);
+    if (dstHost != nullptr)
+        aclrtFreeHost(dstHost);
+
+    if (rc == 0)
+        std::printf("[INFO] case %s done\n", tc.name);
+    return rc;
+}
+
+// Entry point: initializes ACL, runs every case (or only the one named by argv[1]),
+// stops at the first failing case, and returns 0 on success or 1 on failure.
+// A filter that matches no case runs nothing and still returns 0.
+int main(int argc, char *argv[]) {
+    // Optional case filter: ./trsqrt [case_name]
+    const char *caseFilter = (argc > 1) ? argv[1] : nullptr;
+
+    int rc = 0;
+    int deviceId = 0;
+    aclrtStream stream = nullptr;
+
+    aclInit(nullptr);
+    // ACL_DEVICE_ID, when set, overrides the default device 0.
+    if (const char *envDevice = std::getenv("ACL_DEVICE_ID")) {
+        deviceId = std::atoi(envDevice);
+    }
+    aclrtSetDevice(deviceId);
+    aclrtCreateStream(&stream);
+
+    for (size_t i = 0; i < kNumCases; ++i) {
+        if (caseFilter != nullptr && std::strcmp(kCases[i].name, caseFilter) != 0) {
+            continue;
+        }
+        int ret = RunCase(kCases[i], deviceId, stream);
+        if (ret != 0) {
+            std::fprintf(stderr, "[ERROR] case %s failed\n", kCases[i].name);
+            rc = 1;
+            break;
+        }
+    }
+
+    if (stream != nullptr)
+        aclrtDestroyStream(stream);
+    aclrtResetDevice(deviceId);
+    aclFinalize();
+
+    return rc;
+}
diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/trsqrt/trsqrt.pto b/test/tilelang_st/npu/a5/src/st/smoke/testcase/trsqrt/trsqrt.pto
new file mode 100644
index 000000000..bbc6ebea4
--- /dev/null
+++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/trsqrt/trsqrt.pto
@@ -0,0 +1,102 @@
+// Copyright (c) 2026 Huawei Technologies Co., Ltd.
+// This program is free software, you can redistribute it and/or modify it under the terms and conditions of +// CANN Open Software License Agreement Version 2.0 (the "License"). +// Please refer to the License for details. You may not use this file except in compliance with the License. +// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +// See LICENSE in the root of the software repository for the full text of the License. + +// TileLang ST kernels for pto.trsqrt: 1/sqrt(x) +// trsqrt = vsqrt(x) -> vdiv(1.0, sqrt_result) +// Multiple cases with different shapes in a single module. +// Compiled by ptoas --enable-insert-sync --enable-tile-op-expand --pto-backend=vpto +// to produce a fatobj object. + +module attributes {pto.target_arch = "a5", pto.kernel_kind = #pto.kernel_kind} { + // Case 0: f32 16x64 (1024 elements) + func.func @TRSQRT_f32_16x64(%a_ptr: !pto.ptr, %b_ptr: !pto.ptr) attributes {pto.kernel} { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c16 = arith.constant 16 : index + %c64 = arith.constant 64 : index + %c1024 = arith.constant 1024 : index + + %a_view = pto.make_tensor_view %a_ptr, + shape = [%c1, %c1, %c1, %c16, %c64], + strides = [%c1024, %c1024, %c1024, %c64, %c1] + : !pto.tensor_view<1x1x1x16x64xf32> + %b_view = pto.make_tensor_view %b_ptr, + shape = [%c1, %c1, %c1, %c16, %c64], + strides = [%c1024, %c1024, %c1024, %c64, %c1] + : !pto.tensor_view<1x1x1x16x64xf32> + + %a_part = pto.partition_view %a_view, + offsets = [%c0, %c0, %c0, %c0, %c0], + sizes = [%c1, %c1, %c1, %c16, %c64] + : !pto.tensor_view<1x1x1x16x64xf32> -> !pto.partition_tensor_view<1x1x1x16x64xf32> + %b_part = pto.partition_view %b_view, + offsets = [%c0, %c0, %c0, %c0, %c0], + sizes = [%c1, %c1, %c1, %c16, %c64] + : !pto.tensor_view<1x1x1x16x64xf32> -> 
!pto.partition_tensor_view<1x1x1x16x64xf32> + + %a = pto.alloc_tile + : !pto.tile_buf + %b = pto.alloc_tile + : !pto.tile_buf + + pto.tload ins(%a_part : !pto.partition_tensor_view<1x1x1x16x64xf32>) + outs(%a : !pto.tile_buf) + + pto.trsqrt ins(%a : !pto.tile_buf) + outs(%b : !pto.tile_buf) + + pto.tstore ins(%b : !pto.tile_buf) + outs(%b_part : !pto.partition_tensor_view<1x1x1x16x64xf32>) + return + } + + // Case 1: f16 16x64 (1024 elements) + + func.func @TRSQRT_f16_16x64(%a_ptr: !pto.ptr, %b_ptr: !pto.ptr) attributes {pto.kernel} { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c16 = arith.constant 16 : index + %c64 = arith.constant 64 : index + %c1024 = arith.constant 1024 : index + + %a_view = pto.make_tensor_view %a_ptr, + shape = [%c1, %c1, %c1, %c16, %c64], + strides = [%c1024, %c1024, %c1024, %c64, %c1] + : !pto.tensor_view<1x1x1x16x64xf16> + %b_view = pto.make_tensor_view %b_ptr, + shape = [%c1, %c1, %c1, %c16, %c64], + strides = [%c1024, %c1024, %c1024, %c64, %c1] + : !pto.tensor_view<1x1x1x16x64xf16> + + %a_part = pto.partition_view %a_view, + offsets = [%c0, %c0, %c0, %c0, %c0], + sizes = [%c1, %c1, %c1, %c16, %c64] + : !pto.tensor_view<1x1x1x16x64xf16> -> !pto.partition_tensor_view<1x1x1x16x64xf16> + %b_part = pto.partition_view %b_view, + offsets = [%c0, %c0, %c0, %c0, %c0], + sizes = [%c1, %c1, %c1, %c16, %c64] + : !pto.tensor_view<1x1x1x16x64xf16> -> !pto.partition_tensor_view<1x1x1x16x64xf16> + + %a = pto.alloc_tile + : !pto.tile_buf + %b = pto.alloc_tile + : !pto.tile_buf + + pto.tload ins(%a_part : !pto.partition_tensor_view<1x1x1x16x64xf16>) + outs(%a : !pto.tile_buf) + + pto.trsqrt ins(%a : !pto.tile_buf) + outs(%b : !pto.tile_buf) + + pto.tstore ins(%b : !pto.tile_buf) + outs(%b_part : !pto.partition_tensor_view<1x1x1x16x64xf16>) + return + } + + // No further cases: only the f32 16x64 and f16 16x64 kernels are defined in this module. +} diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tsel/CMakeLists.txt
b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tsel/CMakeLists.txt new file mode 100644 index 000000000..9a9cd3372 --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tsel/CMakeLists.txt @@ -0,0 +1,9 @@ +# Copyright (c) 2026 Huawei Technologies Co., Ltd. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. + +pto_tilelang_vec_st(tsel) diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tsel/cases.py b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tsel/cases.py new file mode 100644 index 000000000..650412a41 --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tsel/cases.py @@ -0,0 +1,104 @@ +#!/usr/bin/python3 +# Copyright (c) 2026 Huawei Technologies Co., Ltd. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. + +# coding=utf-8 + +"""Single source of truth for tsel ST test cases. + +Each case defines: + - name: case identifier, used as subdirectory name and by main.cpp kCases[]. 
+ - dtype: numpy dtype (e.g. np.float32). + - shape: (rows, cols) — allocated tile dimensions. + - valid_shape: (valid_rows, valid_cols) — effective computation region. + - eps: tolerance for numpy.allclose (atol and rtol). + +gen_data.py and compare.py both import this list to avoid redundant definitions. +""" + +import numpy as np + +CASES = [ + { + "name": "f32_2x128", + "dtype": np.float32, + "shape": (2, 128), + "valid_shape": (2, 128), + "eps": 1e-6, + }, + { + "name": "f32_2x32", + "dtype": np.float32, + "shape": (2, 32), + "valid_shape": (2, 32), + "eps": 1e-6, + }, + { + "name": "f32_2x160", + "dtype": np.float32, + "shape": (2, 160), + "valid_shape": (2, 160), + "eps": 1e-6, + }, + { + "name": "f32_2x512", + "dtype": np.float32, + "shape": (2, 512), + "valid_shape": (2, 512), + "eps": 1e-6, + }, + { + "name": "f16_2x128", + "dtype": np.float16, + "shape": (2, 128), + "valid_shape": (2, 128), + "eps": 1e-3, + }, + { + "name": "f16_2x32", + "dtype": np.float16, + "shape": (2, 32), + "valid_shape": (2, 32), + "eps": 1e-3, + }, + { + "name": "f16_2x160", + "dtype": np.float16, + "shape": (2, 160), + "valid_shape": (2, 160), + "eps": 1e-3, + }, + { + "name": "i8_2x128", + "dtype": np.int8, + "shape": (2, 128), + "valid_shape": (2, 128), + "eps": 0, + }, + { + "name": "i8_2x32", + "dtype": np.int8, + "shape": (2, 32), + "valid_shape": (2, 32), + "eps": 0, + }, + { + "name": "i8_2x160", + "dtype": np.int8, + "shape": (2, 160), + "valid_shape": (2, 160), + "eps": 0, + }, +] + +_SMOKE_CASE_NAMES = ['f32_2x32', 'f16_2x32'] +_SMOKE_CASE_NAME_SET = set(_SMOKE_CASE_NAMES) +_missing = [name for name in _SMOKE_CASE_NAMES if name not in {case["name"] for case in CASES}] +if _missing: + raise RuntimeError("unknown smoke case(s): " + ", ".join(_missing)) +CASES = [case for case in CASES if case["name"] in _SMOKE_CASE_NAME_SET] diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tsel/compare.py b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tsel/compare.py new file 
mode 100644 index 000000000..428604929 --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tsel/compare.py @@ -0,0 +1,49 @@ +#!/usr/bin/python3 +# Copyright (c) 2026 Huawei Technologies Co., Ltd. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. + +# coding=utf-8 + +import os +import sys +import numpy as np + +from cases import CASES +from st_common import result_cmp, style_fail, style_pass, validate_cases + + +def main(): + validate_cases(CASES) + case_filter = sys.argv[1] if len(sys.argv) > 1 else None + + all_passed = True + for case in CASES: + if case_filter is not None and case["name"] != case_filter: + continue + + case_dir = case["name"] + shape = case["shape"] + vr, vc = case["valid_shape"] + + golden = np.fromfile(os.path.join(case_dir, "golden.bin"), dtype=case["dtype"]).reshape(shape) + output = np.fromfile(os.path.join(case_dir, "output.bin"), dtype=case["dtype"]).reshape(shape) + + ok = result_cmp(golden[:vr, :vc], output[:vr, :vc], case["eps"]) + if ok: + print(style_pass(f"[INFO] {case['name']}: compare passed")) + else: + print(style_fail(f"[ERROR] {case['name']}: compare failed")) + all_passed = False + + if not all_passed: + sys.exit(2) + print(style_pass("[INFO] all cases passed")) + + +if __name__ == "__main__": + main() diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tsel/gen_data.py b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tsel/gen_data.py new file mode 100644 index 
000000000..ebaa82b70 --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tsel/gen_data.py @@ -0,0 +1,44 @@ +#!/usr/bin/python3 +# Copyright (c) 2026 Huawei Technologies Co., Ltd. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. + +# coding=utf-8 + +import numpy as np +from cases import CASES +from st_common import validate_cases, setup_case_rng, save_case_data + +validate_cases(CASES) + +for case in CASES: + setup_case_rng(case) + + dtype = case["dtype"] + shape = case["shape"] + valid_shape = case["valid_shape"] + vr, vc = valid_shape + mask_cols = (vc + 7) // 8 + + src0 = np.random.randint(1, 10, size=shape).astype(dtype) + src1 = np.random.randint(1, 10, size=shape).astype(dtype) + mask = np.random.randint(0, 256, size=(vr, mask_cols), dtype=np.uint8) + + golden = np.zeros(shape, dtype=dtype) + src0_valid = src0[:vr, :vc] + src1_valid = src1[:vr, :vc] + for row in range(vr): + for packed_col in range(mask_cols): + byte = int(mask[row, packed_col]) + for bit in range(8): + col = packed_col * 8 + bit + if col >= vc: + break + golden[row, col] = src0_valid[row, col] if ((byte >> bit) & 1) else src1_valid[row, col] + + save_case_data(case["name"], {"input1": src0, "input2": src1, "input3": mask, "golden": golden}) + print(f"[INFO] gen_data: {case['name']} shape={shape} valid_shape={valid_shape} dtype={dtype.__name__}") diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tsel/launch.cpp 
b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tsel/launch.cpp new file mode 100644 index 000000000..520a825ab --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tsel/launch.cpp @@ -0,0 +1,28 @@ +// Copyright (c) 2026 Huawei Technologies Co., Ltd. +// This program is free software, you can redistribute it and/or modify it under the terms and conditions of +// CANN Open Software License Agreement Version 2.0 (the "License"). +// Please refer to the License for details. You may not use this file except in compliance with the License. +// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +// See LICENSE in the root of the software repository for the full text of the License. + +#include + +#ifndef AICORE +#define AICORE [aicore] +#endif + +// Smoke cases: f32 2x32 and f16 2x32 + +extern "C" __global__ AICORE void TSEL_f32_2x32(__gm__ uint8_t *mask, __gm__ float *src0, __gm__ float *src1, __gm__ float *dst); +extern "C" __global__ AICORE void TSEL_f16_2x32(__gm__ uint8_t *mask, __gm__ uint16_t *src0, __gm__ uint16_t *src1, __gm__ uint16_t *dst); + +void LaunchTSEL_f32_2x32(uint8_t *mask, float *src0, float *src1, float *dst, void *stream) { + TSEL_f32_2x32<<<1, nullptr, stream>>>((__gm__ uint8_t *)mask, (__gm__ float *)src0, (__gm__ float *)src1, (__gm__ float *)dst); +} + + + +void LaunchTSEL_f16_2x32(uint8_t *mask, uint16_t *src0, uint16_t *src1, uint16_t *dst, void *stream) { + TSEL_f16_2x32<<<1, nullptr, stream>>>((__gm__ uint8_t *)mask, (__gm__ uint16_t *)src0, (__gm__ uint16_t *)src1, (__gm__ uint16_t *)dst); +} diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tsel/main.cpp b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tsel/main.cpp new file mode 100644 index 000000000..3c0f2f51c --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tsel/main.cpp @@ -0,0 +1,299 @@ +// Copyright (c)
2026 Huawei Technologies Co., Ltd. +// This program is free software, you can redistribute it and/or modify it under the terms and conditions of +// CANN Open Software License Agreement Version 2.0 (the "License"). +// Please refer to the License for details. You may not use this file except in compliance with the License. +// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +// See LICENSE in the root of the software repository for the full text of the License. + +// Host driver for TileLang tsel ST — case-table driven. +// Each case launches a different kernel variant, reads/writes from per-case subdirectory. +// Numerical comparison is done externally by compare.py. + +#include "acl/acl.h" +#include "test_common.h" +#include +#include +#include +#include +#include +#include + +using namespace PtoTestCommon; + +// Kernel launch wrappers (defined in launch.cpp) +void LaunchTSEL_f32_2x32(uint8_t *mask, float *src0, float *src1, float *dst, void *stream); +void LaunchTSEL_f32_2x512(uint8_t *mask, float *src0, float *src1, float *dst, void *stream); +void LaunchTSEL_f16_2x32(uint8_t *mask, uint16_t *src0, uint16_t *src1, uint16_t *dst, void *stream); +void LaunchTSEL_i8_2x128(uint8_t *mask, int8_t *src0, int8_t *src1, int8_t *dst, void *stream); +void LaunchTSEL_i8_2x160(uint8_t *mask, int8_t *src0, int8_t *src1, int8_t *dst, void *stream); + +enum DataType { DT_F32, DT_F16, DT_I8 }; + +using LaunchFnF32 = void (*)(uint8_t *, float *, float *, float *, void *); +using LaunchFnF16 = void (*)(uint8_t *, uint16_t *, uint16_t *, uint16_t *, void *); +using LaunchFnI8 = void (*)(uint8_t *, int8_t *, int8_t *, int8_t *, void *); + +struct TestCase { + const char *name; + DataType dtype; + LaunchFnF32 launchF32; + LaunchFnF16 launchF16; + LaunchFnI8 launchI8; + size_t rows; + size_t cols; + size_t validRows; + 
size_t validCols; + size_t elemSize; +}; + +static const TestCase kCases[] = { +{"f32_2x32", DT_F32, LaunchTSEL_f32_2x32, nullptr, nullptr, 2, 32, 2, 32, sizeof(float)}, +{"f16_2x32", DT_F16, nullptr, LaunchTSEL_f16_2x32, nullptr, 2, 32, 2, 32, sizeof(uint16_t)}, +}; +static constexpr size_t kNumCases = sizeof(kCases) / sizeof(kCases[0]); + +static int RunCase(const TestCase &tc, int deviceId, aclrtStream stream) { + int rc = 0; + const size_t elemCount = tc.rows * tc.cols; + const size_t fileSizeConst = elemCount * tc.elemSize; + const size_t maskCols = (tc.validCols + 7) / 8; + const size_t maskFileSizeConst = tc.validRows * maskCols * sizeof(uint8_t); + + std::printf("[INFO] === case: %s (shape=%zux%zu, valid=%zux%zu) ===\n", + tc.name, tc.rows, tc.cols, tc.validRows, tc.validCols); + + std::string caseDir = std::string("./") + tc.name; + + if (tc.dtype == DT_F32) { + uint8_t *maskHost = nullptr; + float *src0Host = nullptr, *src1Host = nullptr, *dstHost = nullptr; + uint8_t *maskDevice = nullptr; + float *src0Device = nullptr, *src1Device = nullptr, *dstDevice = nullptr; + + aclrtMallocHost((void **)(&maskHost), maskFileSizeConst); + aclrtMallocHost((void **)(&src0Host), fileSizeConst); + aclrtMallocHost((void **)(&src1Host), fileSizeConst); + aclrtMallocHost((void **)(&dstHost), fileSizeConst); + + aclrtMalloc((void **)&maskDevice, maskFileSizeConst, ACL_MEM_MALLOC_HUGE_FIRST); + aclrtMalloc((void **)&src0Device, fileSizeConst, ACL_MEM_MALLOC_HUGE_FIRST); + aclrtMalloc((void **)&src1Device, fileSizeConst, ACL_MEM_MALLOC_HUGE_FIRST); + aclrtMalloc((void **)&dstDevice, fileSizeConst, ACL_MEM_MALLOC_HUGE_FIRST); + + size_t fileSize = fileSizeConst; + if (!ReadFile((caseDir + "/input1.bin").c_str(), fileSize, src0Host, fileSizeConst)) { + std::fprintf(stderr, "[ERROR] failed to read %s/input1.bin\n", caseDir.c_str()); + rc = 1; + } + fileSize = fileSizeConst; + if (rc == 0 && !ReadFile((caseDir + "/input2.bin").c_str(), fileSize, src1Host, fileSizeConst)) { + 
std::fprintf(stderr, "[ERROR] failed to read %s/input2.bin\n", caseDir.c_str()); + rc = 1; + } + size_t maskFileSize = maskFileSizeConst; + if (rc == 0 && !ReadFile((caseDir + "/input3.bin").c_str(), maskFileSize, maskHost, maskFileSizeConst)) { + std::fprintf(stderr, "[ERROR] failed to read %s/input3.bin\n", caseDir.c_str()); + rc = 1; + } + + if (rc == 0) { + aclrtMemcpy(maskDevice, maskFileSizeConst, maskHost, maskFileSizeConst, ACL_MEMCPY_HOST_TO_DEVICE); + aclrtMemcpy(src0Device, fileSizeConst, src0Host, fileSizeConst, ACL_MEMCPY_HOST_TO_DEVICE); + aclrtMemcpy(src1Device, fileSizeConst, src1Host, fileSizeConst, ACL_MEMCPY_HOST_TO_DEVICE); + + tc.launchF32(maskDevice, src0Device, src1Device, dstDevice, stream); + + aclrtSynchronizeStream(stream); + aclrtMemcpy(dstHost, fileSizeConst, dstDevice, fileSizeConst, ACL_MEMCPY_DEVICE_TO_HOST); + } + + if (rc == 0 && !WriteFile((caseDir + "/output.bin").c_str(), dstHost, fileSizeConst)) { + std::fprintf(stderr, "[ERROR] failed to write %s/output.bin\n", caseDir.c_str()); + rc = 1; + } + + if (maskDevice != nullptr) + aclrtFree(maskDevice); + if (src0Device != nullptr) + aclrtFree(src0Device); + if (src1Device != nullptr) + aclrtFree(src1Device); + if (dstDevice != nullptr) + aclrtFree(dstDevice); + if (maskHost != nullptr) + aclrtFreeHost(maskHost); + if (src0Host != nullptr) + aclrtFreeHost(src0Host); + if (src1Host != nullptr) + aclrtFreeHost(src1Host); + if (dstHost != nullptr) + aclrtFreeHost(dstHost); + } else if (tc.dtype == DT_F16) { + uint8_t *maskHost = nullptr; + uint16_t *src0Host = nullptr, *src1Host = nullptr, *dstHost = nullptr; + uint8_t *maskDevice = nullptr; + uint16_t *src0Device = nullptr, *src1Device = nullptr, *dstDevice = nullptr; + + aclrtMallocHost((void **)(&maskHost), maskFileSizeConst); + aclrtMallocHost((void **)(&src0Host), fileSizeConst); + aclrtMallocHost((void **)(&src1Host), fileSizeConst); + aclrtMallocHost((void **)(&dstHost), fileSizeConst); + + aclrtMalloc((void **)&maskDevice, 
maskFileSizeConst, ACL_MEM_MALLOC_HUGE_FIRST); + aclrtMalloc((void **)&src0Device, fileSizeConst, ACL_MEM_MALLOC_HUGE_FIRST); + aclrtMalloc((void **)&src1Device, fileSizeConst, ACL_MEM_MALLOC_HUGE_FIRST); + aclrtMalloc((void **)&dstDevice, fileSizeConst, ACL_MEM_MALLOC_HUGE_FIRST); + + size_t fileSize = fileSizeConst; + if (!ReadFile((caseDir + "/input1.bin").c_str(), fileSize, src0Host, fileSizeConst)) { + std::fprintf(stderr, "[ERROR] failed to read %s/input1.bin\n", caseDir.c_str()); + rc = 1; + } + fileSize = fileSizeConst; + if (rc == 0 && !ReadFile((caseDir + "/input2.bin").c_str(), fileSize, src1Host, fileSizeConst)) { + std::fprintf(stderr, "[ERROR] failed to read %s/input2.bin\n", caseDir.c_str()); + rc = 1; + } + size_t maskFileSize = maskFileSizeConst; + if (rc == 0 && !ReadFile((caseDir + "/input3.bin").c_str(), maskFileSize, maskHost, maskFileSizeConst)) { + std::fprintf(stderr, "[ERROR] failed to read %s/input3.bin\n", caseDir.c_str()); + rc = 1; + } + + if (rc == 0) { + aclrtMemcpy(maskDevice, maskFileSizeConst, maskHost, maskFileSizeConst, ACL_MEMCPY_HOST_TO_DEVICE); + aclrtMemcpy(src0Device, fileSizeConst, src0Host, fileSizeConst, ACL_MEMCPY_HOST_TO_DEVICE); + aclrtMemcpy(src1Device, fileSizeConst, src1Host, fileSizeConst, ACL_MEMCPY_HOST_TO_DEVICE); + + tc.launchF16(maskDevice, src0Device, src1Device, dstDevice, stream); + + aclrtSynchronizeStream(stream); + aclrtMemcpy(dstHost, fileSizeConst, dstDevice, fileSizeConst, ACL_MEMCPY_DEVICE_TO_HOST); + } + + if (rc == 0 && !WriteFile((caseDir + "/output.bin").c_str(), dstHost, fileSizeConst)) { + std::fprintf(stderr, "[ERROR] failed to write %s/output.bin\n", caseDir.c_str()); + rc = 1; + } + + if (maskDevice != nullptr) + aclrtFree(maskDevice); + if (src0Device != nullptr) + aclrtFree(src0Device); + if (src1Device != nullptr) + aclrtFree(src1Device); + if (dstDevice != nullptr) + aclrtFree(dstDevice); + if (maskHost != nullptr) + aclrtFreeHost(maskHost); + if (src0Host != nullptr) + 
aclrtFreeHost(src0Host); + if (src1Host != nullptr) + aclrtFreeHost(src1Host); + if (dstHost != nullptr) + aclrtFreeHost(dstHost); + } else { + uint8_t *maskHost = nullptr; + int8_t *src0Host = nullptr, *src1Host = nullptr, *dstHost = nullptr; + uint8_t *maskDevice = nullptr; + int8_t *src0Device = nullptr, *src1Device = nullptr, *dstDevice = nullptr; + + aclrtMallocHost((void **)(&maskHost), maskFileSizeConst); + aclrtMallocHost((void **)(&src0Host), fileSizeConst); + aclrtMallocHost((void **)(&src1Host), fileSizeConst); + aclrtMallocHost((void **)(&dstHost), fileSizeConst); + + aclrtMalloc((void **)&maskDevice, maskFileSizeConst, ACL_MEM_MALLOC_HUGE_FIRST); + aclrtMalloc((void **)&src0Device, fileSizeConst, ACL_MEM_MALLOC_HUGE_FIRST); + aclrtMalloc((void **)&src1Device, fileSizeConst, ACL_MEM_MALLOC_HUGE_FIRST); + aclrtMalloc((void **)&dstDevice, fileSizeConst, ACL_MEM_MALLOC_HUGE_FIRST); + + size_t fileSize = fileSizeConst; + if (!ReadFile((caseDir + "/input1.bin").c_str(), fileSize, src0Host, fileSizeConst)) { + std::fprintf(stderr, "[ERROR] failed to read %s/input1.bin\n", caseDir.c_str()); + rc = 1; + } + fileSize = fileSizeConst; + if (rc == 0 && !ReadFile((caseDir + "/input2.bin").c_str(), fileSize, src1Host, fileSizeConst)) { + std::fprintf(stderr, "[ERROR] failed to read %s/input2.bin\n", caseDir.c_str()); + rc = 1; + } + size_t maskFileSize = maskFileSizeConst; + if (rc == 0 && !ReadFile((caseDir + "/input3.bin").c_str(), maskFileSize, maskHost, maskFileSizeConst)) { + std::fprintf(stderr, "[ERROR] failed to read %s/input3.bin\n", caseDir.c_str()); + rc = 1; + } + + if (rc == 0) { + aclrtMemcpy(maskDevice, maskFileSizeConst, maskHost, maskFileSizeConst, ACL_MEMCPY_HOST_TO_DEVICE); + aclrtMemcpy(src0Device, fileSizeConst, src0Host, fileSizeConst, ACL_MEMCPY_HOST_TO_DEVICE); + aclrtMemcpy(src1Device, fileSizeConst, src1Host, fileSizeConst, ACL_MEMCPY_HOST_TO_DEVICE); + + tc.launchI8(maskDevice, src0Device, src1Device, dstDevice, stream); + + 
aclrtSynchronizeStream(stream); + aclrtMemcpy(dstHost, fileSizeConst, dstDevice, fileSizeConst, ACL_MEMCPY_DEVICE_TO_HOST); + } + + if (rc == 0 && !WriteFile((caseDir + "/output.bin").c_str(), dstHost, fileSizeConst)) { + std::fprintf(stderr, "[ERROR] failed to write %s/output.bin\n", caseDir.c_str()); + rc = 1; + } + + if (maskDevice != nullptr) + aclrtFree(maskDevice); + if (src0Device != nullptr) + aclrtFree(src0Device); + if (src1Device != nullptr) + aclrtFree(src1Device); + if (dstDevice != nullptr) + aclrtFree(dstDevice); + if (maskHost != nullptr) + aclrtFreeHost(maskHost); + if (src0Host != nullptr) + aclrtFreeHost(src0Host); + if (src1Host != nullptr) + aclrtFreeHost(src1Host); + if (dstHost != nullptr) + aclrtFreeHost(dstHost); + } + + if (rc == 0) + std::printf("[INFO] case %s done\n", tc.name); + return rc; +} + +int main(int argc, char *argv[]) { + const char *caseFilter = (argc > 1) ? argv[1] : nullptr; + + int rc = 0; + int deviceId = 0; + aclrtStream stream = nullptr; + + aclInit(nullptr); + if (const char *envDevice = std::getenv("ACL_DEVICE_ID")) { + deviceId = std::atoi(envDevice); + } + aclrtSetDevice(deviceId); + aclrtCreateStream(&stream); + + for (size_t i = 0; i < kNumCases; ++i) { + if (caseFilter != nullptr && std::strcmp(kCases[i].name, caseFilter) != 0) { + continue; + } + int ret = RunCase(kCases[i], deviceId, stream); + if (ret != 0) { + std::fprintf(stderr, "[ERROR] case %s failed\n", kCases[i].name); + rc = 1; + break; + } + } + + if (stream != nullptr) + aclrtDestroyStream(stream); + aclrtResetDevice(deviceId); + aclFinalize(); + + return rc; +} diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tsel/tsel.pto b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tsel/tsel.pto new file mode 100644 index 000000000..e9e394491 --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tsel/tsel.pto @@ -0,0 +1,163 @@ +// Copyright (c) 2026 Huawei Technologies Co., Ltd. 
+// This program is free software, you can redistribute it and/or modify it under the terms and conditions of +// CANN Open Software License Agreement Version 2.0 (the "License"). +// Please refer to the License for details. You may not use this file except in compliance with the License. +// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +// See LICENSE in the root of the software repository for the full text of the License. + +// TileLang ST kernels for pto.tsel: packed mask tload + tload(src0) + tload(src1) + tsel(mask,src0,src1,tmp,dst) + tstore(dst). +// Multiple cases with different shapes in a single module. +// Compiled by ptoas --enable-insert-sync --enable-tile-op-expand --pto-backend=vpto +// to produce a fatobj object. + +module attributes {pto.target_arch = "a5", pto.kernel_kind = #pto.kernel_kind} { + // Case: f32 2x32 + func.func @TSEL_f32_2x32(%mask_ptr: !pto.ptr, %src0_ptr: !pto.ptr, %src1_ptr: !pto.ptr, %dst_ptr: !pto.ptr) attributes {pto.kernel} { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c2 = arith.constant 2 : index + %c4 = arith.constant 4 : index + %c8 = arith.constant 8 : index + %c32 = arith.constant 32 : index + %c64 = arith.constant 64 : index + + %mask_view = pto.make_tensor_view %mask_ptr, + shape = [%c1, %c1, %c1, %c2, %c4], + strides = [%c8, %c8, %c8, %c4, %c1] + : !pto.tensor_view<1x1x1x2x4xi8> + %src0_view = pto.make_tensor_view %src0_ptr, + shape = [%c1, %c1, %c1, %c2, %c32], + strides = [%c64, %c64, %c64, %c32, %c1] + : !pto.tensor_view<1x1x1x2x32xf32> + %src1_view = pto.make_tensor_view %src1_ptr, + shape = [%c1, %c1, %c1, %c2, %c32], + strides = [%c64, %c64, %c64, %c32, %c1] + : !pto.tensor_view<1x1x1x2x32xf32> + %dst_view = pto.make_tensor_view %dst_ptr, + shape = [%c1, %c1, %c1, %c2, %c32], + strides = [%c64, %c64, %c64, %c32, %c1] + :
!pto.tensor_view<1x1x1x2x32xf32> + + %mask_part = pto.partition_view %mask_view, + offsets = [%c0, %c0, %c0, %c0, %c0], + sizes = [%c1, %c1, %c1, %c2, %c4] + : !pto.tensor_view<1x1x1x2x4xi8> -> !pto.partition_tensor_view<1x1x1x2x4xi8> + %src0_part = pto.partition_view %src0_view, + offsets = [%c0, %c0, %c0, %c0, %c0], + sizes = [%c1, %c1, %c1, %c2, %c32] + : !pto.tensor_view<1x1x1x2x32xf32> -> !pto.partition_tensor_view<1x1x1x2x32xf32> + %src1_part = pto.partition_view %src1_view, + offsets = [%c0, %c0, %c0, %c0, %c0], + sizes = [%c1, %c1, %c1, %c2, %c32] + : !pto.tensor_view<1x1x1x2x32xf32> -> !pto.partition_tensor_view<1x1x1x2x32xf32> + %dst_part = pto.partition_view %dst_view, + offsets = [%c0, %c0, %c0, %c0, %c0], + sizes = [%c1, %c1, %c1, %c2, %c32] + : !pto.tensor_view<1x1x1x2x32xf32> -> !pto.partition_tensor_view<1x1x1x2x32xf32> + + %mask = pto.alloc_tile + : !pto.tile_buf + %src0 = pto.alloc_tile + : !pto.tile_buf + %src1 = pto.alloc_tile + : !pto.tile_buf + %tmp = pto.alloc_tile + : !pto.tile_buf + %dst = pto.alloc_tile + : !pto.tile_buf + + pto.tload ins(%mask_part : !pto.partition_tensor_view<1x1x1x2x4xi8>) + outs(%mask : !pto.tile_buf) + pto.tload ins(%src0_part : !pto.partition_tensor_view<1x1x1x2x32xf32>) + outs(%src0 : !pto.tile_buf) + pto.tload ins(%src1_part : !pto.partition_tensor_view<1x1x1x2x32xf32>) + outs(%src1 : !pto.tile_buf) + + pto.tsel ins(%mask, %src0, %src1, %tmp : !pto.tile_buf, + !pto.tile_buf, + !pto.tile_buf, + !pto.tile_buf) + outs(%dst : !pto.tile_buf) + + pto.tstore ins(%dst : !pto.tile_buf) + outs(%dst_part : !pto.partition_tensor_view<1x1x1x2x32xf32>) + return + } + + // Case: f16 2x32 + + func.func @TSEL_f16_2x32(%mask_ptr: !pto.ptr, %src0_ptr: !pto.ptr, %src1_ptr: !pto.ptr, %dst_ptr: !pto.ptr) attributes {pto.kernel} { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c2 = arith.constant 2 : index + %c4 = arith.constant 4 : index + %c8 = arith.constant 8 : index + %c32 = arith.constant 32 : index + %c64 =
arith.constant 64 : index + + %mask_view = pto.make_tensor_view %mask_ptr, + shape = [%c1, %c1, %c1, %c2, %c4], + strides = [%c8, %c8, %c8, %c4, %c1] + : !pto.tensor_view<1x1x1x2x4xi8> + %src0_view = pto.make_tensor_view %src0_ptr, + shape = [%c1, %c1, %c1, %c2, %c32], + strides = [%c64, %c64, %c64, %c32, %c1] + : !pto.tensor_view<1x1x1x2x32xf16> + %src1_view = pto.make_tensor_view %src1_ptr, + shape = [%c1, %c1, %c1, %c2, %c32], + strides = [%c64, %c64, %c64, %c32, %c1] + : !pto.tensor_view<1x1x1x2x32xf16> + %dst_view = pto.make_tensor_view %dst_ptr, + shape = [%c1, %c1, %c1, %c2, %c32], + strides = [%c64, %c64, %c64, %c32, %c1] + : !pto.tensor_view<1x1x1x2x32xf16> + + %mask_part = pto.partition_view %mask_view, + offsets = [%c0, %c0, %c0, %c0, %c0], + sizes = [%c1, %c1, %c1, %c2, %c4] + : !pto.tensor_view<1x1x1x2x4xi8> -> !pto.partition_tensor_view<1x1x1x2x4xi8> + %src0_part = pto.partition_view %src0_view, + offsets = [%c0, %c0, %c0, %c0, %c0], + sizes = [%c1, %c1, %c1, %c2, %c32] + : !pto.tensor_view<1x1x1x2x32xf16> -> !pto.partition_tensor_view<1x1x1x2x32xf16> + %src1_part = pto.partition_view %src1_view, + offsets = [%c0, %c0, %c0, %c0, %c0], + sizes = [%c1, %c1, %c1, %c2, %c32] + : !pto.tensor_view<1x1x1x2x32xf16> -> !pto.partition_tensor_view<1x1x1x2x32xf16> + %dst_part = pto.partition_view %dst_view, + offsets = [%c0, %c0, %c0, %c0, %c0], + sizes = [%c1, %c1, %c1, %c2, %c32] + : !pto.tensor_view<1x1x1x2x32xf16> -> !pto.partition_tensor_view<1x1x1x2x32xf16> + + %mask = pto.alloc_tile + : !pto.tile_buf + %src0 = pto.alloc_tile + : !pto.tile_buf + %src1 = pto.alloc_tile + : !pto.tile_buf + %tmp = pto.alloc_tile + : !pto.tile_buf + %dst = pto.alloc_tile + : !pto.tile_buf + + pto.tload ins(%mask_part : !pto.partition_tensor_view<1x1x1x2x4xi8>) + outs(%mask : !pto.tile_buf) + pto.tload ins(%src0_part : !pto.partition_tensor_view<1x1x1x2x32xf16>) + outs(%src0 : !pto.tile_buf) + pto.tload ins(%src1_part : !pto.partition_tensor_view<1x1x1x2x32xf16>) + outs(%src1 : 
!pto.tile_buf) + + pto.tsel ins(%mask, %src0, %src1, %tmp : !pto.tile_buf, + !pto.tile_buf, + !pto.tile_buf, + !pto.tile_buf) + outs(%dst : !pto.tile_buf) + + pto.tstore ins(%dst : !pto.tile_buf) + outs(%dst_part : !pto.partition_tensor_view<1x1x1x2x32xf16>) + return + } + + // No further cases: only the f32 2x32 and f16 2x32 kernels are defined in this module. +} diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tsels/CMakeLists.txt b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tsels/CMakeLists.txt new file mode 100644 index 000000000..d104d31ec --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tsels/CMakeLists.txt @@ -0,0 +1,9 @@ +# Copyright (c) 2026 Huawei Technologies Co., Ltd. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. + +pto_tilelang_vec_st(tsels) diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tsels/cases.py b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tsels/cases.py new file mode 100644 index 000000000..2550dae22 --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tsels/cases.py @@ -0,0 +1,57 @@ +#!/usr/bin/python3 +# Copyright (c) 2026 Huawei Technologies Co., Ltd. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License.
# Single source of truth for the tsels ST cases.
#
# Every case dictionary carries these keys, in this order:
#   name            - case identifier
#   dtype           - numpy dtype of the src/dst data
#   dtype_mask      - numpy dtype of the mask
#   shape           - same value as dst_shape for every case in this table
#   dst_shape       - (rows, cols) allocated for the dst tile
#   dst_valid_shape - same value as valid_shape for every case in this table
#   mask_shape      - (rows, cols) allocated for the mask tile
#   src_shape       - (rows, cols) allocated for the src tile
#   valid_shape     - (rows, cols) of the effective computation region
#   eps             - tolerance used when comparing against the golden data


def _case(name, dtype, dtype_mask, dst_shape, mask_shape, src_shape, valid_shape, eps):
    """Expand one compact table row into the full case dictionary."""
    return {
        "name": name,
        "dtype": dtype,
        "dtype_mask": dtype_mask,
        "shape": dst_shape,
        "dst_shape": dst_shape,
        "dst_valid_shape": valid_shape,
        "mask_shape": mask_shape,
        "src_shape": src_shape,
        "valid_shape": valid_shape,
        "eps": eps,
    }


# Row layout: name, dtype, dtype_mask, dst_shape, mask_shape, src_shape, valid_shape, eps
_CASE_ROWS = (
    ("uint8_uint8_2x32_2x32_2x32_2x32", np.uint8, np.uint8, (2, 32), (2, 32), (2, 32), (2, 32), 0),
    ("uint8_uint16_2x32_2x16_2x32_2x32", np.uint8, np.uint16, (2, 32), (2, 16), (2, 32), (2, 32), 0),
    ("uint8_uint32_2x32_2x8_2x32_2x32", np.uint8, np.uint32, (2, 32), (2, 8), (2, 32), (2, 32), 0),
    ("uint16_uint8_2x16_2x32_2x16_2x16", np.uint16, np.uint8, (2, 16), (2, 32), (2, 16), (2, 16), 0),
    ("uint16_uint16_2x16_2x16_2x16_2x16", np.uint16, np.uint16, (2, 16), (2, 16), (2, 16), (2, 16), 0),
    ("uint16_uint32_2x16_2x8_2x16_2x16", np.uint16, np.uint32, (2, 16), (2, 8), (2, 16), (2, 16), 0),
    ("uint32_uint8_2x8_2x32_2x8_2x8", np.uint32, np.uint8, (2, 8), (2, 32), (2, 8), (2, 8), 0),
    ("uint32_uint16_2x8_2x16_2x8_2x8", np.uint32, np.uint16, (2, 8), (2, 16), (2, 8), (2, 8), 0),
    ("uint32_uint32_2x8_2x8_2x8_2x8", np.uint32, np.uint32, (2, 8), (2, 8), (2, 8), (2, 8), 0),
    ("f16_uint8_2x16_2x32_2x16_2x16", np.float16, np.uint8, (2, 16), (2, 32), (2, 16), (2, 16), 1e-3),
    ("f16_uint16_2x16_2x16_2x16_2x16", np.float16, np.uint16, (2, 16), (2, 16), (2, 16), (2, 16), 1e-3),
    ("f16_uint32_2x16_2x8_2x16_2x16", np.float16, np.uint32, (2, 16), (2, 8), (2, 16), (2, 16), 1e-3),
    ("f32_uint8_2x8_2x32_2x8_2x8", np.float32, np.uint8, (2, 8), (2, 32), (2, 8), (2, 8), 1e-6),
    ("f32_uint16_2x8_2x16_2x8_2x8", np.float32, np.uint16, (2, 8), (2, 16), (2, 8), (2, 8), 1e-6),
    ("f32_uint32_2x8_2x8_2x8_2x8", np.float32, np.uint32, (2, 8), (2, 8), (2, 8), (2, 8), 1e-6),
    ("uint8_uint8_2x32_2x64_2x128_2x31", np.uint8, np.uint8, (2, 32), (2, 64), (2, 128), (2, 31), 0),
    ("uint16_uint8_2x32_2x64_2x128_2x31", np.uint16, np.uint8, (2, 32), (2, 64), (2, 128), (2, 31), 0),
    ("f32_uint8_2x32_2x64_2x128_2x31", np.float32, np.uint8, (2, 32), (2, 64), (2, 128), (2, 31), 1e-6),
    ("uint8_uint8_32x672_32x96_32x672_32x666", np.uint8, np.uint8, (32, 672), (32, 96), (32, 672), (32, 666), 0),
    ("f16_uint8_32x672_32x96_32x672_32x666", np.float16, np.uint8, (32, 672), (32, 96), (32, 672), (32, 666), 1e-3),
    ("f32_uint8_32x672_32x96_32x672_32x666", np.float32, np.uint8, (32, 672), (32, 96), (32, 672), (32, 666), 1e-6),
    ("f32_uint8_1x8192_1x4096_1x8192_1x8192", np.float32, np.uint8, (1, 8192), (1, 4096), (1, 8192), (1, 8192), 1e-6),
)

CASES = [_case(*row) for row in _CASE_ROWS]

# The smoke suite keeps only the cases named below; naming a case that is not
# in the table is treated as a configuration error.
_SMOKE_CASE_NAMES = ['uint32_uint16_2x8_2x16_2x8_2x8', 'uint8_uint8_2x32_2x64_2x128_2x31']
_SMOKE_CASE_NAME_SET = set(_SMOKE_CASE_NAMES)
_known_case_names = {case["name"] for case in CASES}
_missing = [name for name in _SMOKE_CASE_NAMES if name not in _known_case_names]
if _missing:
    raise RuntimeError("unknown smoke case(s): " + ", ".join(_missing))
CASES = [case for case in CASES if case["name"] in _SMOKE_CASE_NAME_SET]
def main():
    """Compare each tsels case's output.bin against its golden.bin.

    An optional first command-line argument restricts the run to the case
    with that exact name. Both files are read from the directory named after
    the case, reshaped to the case's ``dst_shape``, and only the
    ``valid_shape`` region is compared. Exits with status 2 when any compared
    case fails.
    """
    validate_cases(CASES)
    wanted = None if len(sys.argv) <= 1 else sys.argv[1]

    failed = False
    for entry in CASES:
        name = entry["name"]
        if wanted is not None and name != wanted:
            continue

        full_shape = entry["dst_shape"]
        rows, cols = entry["valid_shape"]

        # golden.bin is read before output.bin, matching the order they are reported in.
        golden, output = (
            np.fromfile(os.path.join(name, filename), dtype=entry["dtype"]).reshape(full_shape)
            for filename in ("golden.bin", "output.bin")
        )

        if result_cmp(golden[:rows, :cols], output[:rows, :cols], entry["eps"]):
            print(style_pass(f"[INFO] {name}: compare passed"))
            continue

        print(style_fail(f"[ERROR] {name}: compare failed"))
        failed = True

    if failed:
        sys.exit(2)
    print(style_pass("[INFO] all cases passed"))
# Generate mask / input1 / input2 / golden data for every tsels case.
validate_cases(CASES)

# Data dtypes that are sampled with randint; anything else is sampled with uniform.
_INTEGER_DTYPES = (np.int8, np.uint8, np.int16, np.uint16, np.int32, np.uint32)

for case in CASES:
    setup_case_rng(case)

    dtype, dtype_mask = case["dtype"], case["dtype_mask"]
    dst_shape = case["dst_shape"]
    mask_shape = case["mask_shape"]
    src_shape = case["src_shape"]
    valid_shape = case["valid_shape"]
    height, width = valid_shape

    # Draw order is input1, input2, mask; keep it so the random stream is consumed identically.
    if dtype in _INTEGER_DTYPES:
        limits = np.iinfo(dtype)
        input1 = np.random.randint(limits.min, limits.max, size=src_shape).astype(dtype)
        input2 = np.random.randint(limits.min, limits.max, size=[1]).astype(dtype)
    else:
        limits = np.finfo(dtype)
        input1 = np.random.uniform(low=limits.min, high=limits.max, size=src_shape).astype(dtype)
        input2 = np.random.uniform(low=limits.min, high=limits.max, size=[1]).astype(dtype)

    mask_limits = np.iinfo(dtype_mask)
    mask = np.random.randint(mask_limits.min, mask_limits.max, size=mask_shape).astype(dtype_mask)
    # Reinterpret each mask row as raw bytes so individual bits can be addressed.
    mask_bytes = mask.view(np.uint8).reshape(mask_shape[0], -1)
    golden = np.zeros(dst_shape, dtype=dtype)

    # Element (row, col) is selected by bit (col % 8) of byte (col // 8) in that mask row:
    # a set bit takes the src element, a clear bit takes the scalar.
    for row, col in np.ndindex(height, width):
        byte_index, bit_index = divmod(col, 8)
        if (mask_bytes[row, byte_index] >> bit_index) & 1:
            golden[row, col] = input1[row, col]
        else:
            golden[row, col] = input2[0]

    save_case_data(case["name"], {"mask": mask, "input1": input1, "input2": input2, "golden": golden})
    print(f"[INFO] gen_data: {case['name']} dst={dst_shape} mask={mask_shape} src={src_shape} valid={valid_shape} dtype={dtype.__name__} mask_dtype={dtype_mask.__name__}")
// Fallback definition of the AICORE qualifier when the toolchain has not provided one.
#ifndef AICORE
#define AICORE [aicore]
#endif

// Device kernel entry points, declared here and defined elsewhere.
// Each takes the mask, src and dst buffers plus the scalar operand by value.
extern "C" __global__ AICORE void TSELS_uint32_uint16_2x8_2x16_2x8_2x8(__gm__ uint16_t *mask, __gm__ uint32_t *src, __gm__ uint32_t *dst, uint32_t scalar);
extern "C" __global__ AICORE void TSELS_uint8_uint8_2x32_2x64_2x128_2x31(__gm__ uint8_t *mask, __gm__ uint8_t *src, __gm__ uint8_t *dst, uint8_t scalar);

// Host-side launcher for the uint32 data / uint16 mask case.
// scalar_ptr points at a buffer holding one uint32_t; it is copied out with
// memcpy and passed to the kernel by value.
void LaunchTSELS_uint32_uint16_2x8_2x16_2x8_2x8(uint16_t *mask, uint32_t *src, uint32_t *dst, void *scalar_ptr, void *stream) {
    uint32_t scalar;
    std::memcpy(&scalar, scalar_ptr, sizeof(uint32_t));
    // NOTE(review): the first launch argument (1) is presumably the block count - confirm.
    TSELS_uint32_uint16_2x8_2x16_2x8_2x8<<<1, nullptr, stream>>>((__gm__ uint16_t *)mask, (__gm__ uint32_t *)src, (__gm__ uint32_t *)dst, scalar);
}



// Host-side launcher for the uint8 data / uint8 mask case.
// scalar_ptr points at a buffer holding one uint8_t; it is copied out with
// memcpy and passed to the kernel by value.
void LaunchTSELS_uint8_uint8_2x32_2x64_2x128_2x31(uint8_t *mask, uint8_t *src, uint8_t *dst, void *scalar_ptr, void *stream) {
    uint8_t scalar;
    std::memcpy(&scalar, scalar_ptr, sizeof(uint8_t));
    TSELS_uint8_uint8_2x32_2x64_2x128_2x31<<<1, nullptr, stream>>>((__gm__ uint8_t *)mask, (__gm__ uint8_t *)src, (__gm__ uint8_t *)dst, scalar);
}
Technologies Co., Ltd. +// This program is free software, you can redistribute it and/or modify it under the terms and conditions of +// CANN Open Software License Agreement Version 2.0 (the "License"). +// Please refer to the License for details. You may not use this file except in compliance with the License. +// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +// See LICENSE in the root of the software repository for the full text of the License. + +#include "acl/acl.h" +#include "test_common.h" +#include +#include +#include +#include +#include +#include + +using namespace PtoTestCommon; + +void LaunchTSELS_uint8_uint16_2x32_2x16_2x32_2x32(uint16_t *mask, uint8_t *src, uint8_t *dst, void *scalar_ptr, void *stream); +void LaunchTSELS_uint16_uint8_2x16_2x32_2x16_2x16(uint8_t *mask, uint16_t *src, uint16_t *dst, void *scalar_ptr, void *stream); +void LaunchTSELS_uint16_uint32_2x16_2x8_2x16_2x16(uint32_t *mask, uint16_t *src, uint16_t *dst, void *scalar_ptr, void *stream); +void LaunchTSELS_uint32_uint16_2x8_2x16_2x8_2x8(uint16_t *mask, uint32_t *src, uint32_t *dst, void *scalar_ptr, void *stream); +void LaunchTSELS_f16_uint8_2x16_2x32_2x16_2x16(uint8_t *mask, uint16_t *src, uint16_t *dst, void *scalar_ptr, void *stream); +void LaunchTSELS_f16_uint32_2x16_2x8_2x16_2x16(uint32_t *mask, uint16_t *src, uint16_t *dst, void *scalar_ptr, void *stream); +void LaunchTSELS_f32_uint16_2x8_2x16_2x8_2x8(uint16_t *mask, float *src, float *dst, void *scalar_ptr, void *stream); +void LaunchTSELS_uint8_uint8_2x32_2x64_2x128_2x31(uint8_t *mask, uint8_t *src, uint8_t *dst, void *scalar_ptr, void *stream); +void LaunchTSELS_f32_uint8_2x32_2x64_2x128_2x31(uint8_t *mask, float *src, float *dst, void *scalar_ptr, void *stream); +void LaunchTSELS_f16_uint8_32x672_32x96_32x672_32x666(uint8_t *mask, uint16_t *src, uint16_t *dst, 
// Runs one tsels case end to end: allocates host/device buffers, loads
// mask.bin / input1.bin / input2.bin from ./<case name>/, launches the kernel,
// and writes the dst buffer to ./<case name>/output.bin.
// Returns 0 on success and 1 when a file could not be read or written.
// deviceId is accepted but not used inside this function.
// NOTE(review): none of the acl* return codes are checked here, so an
// allocation, copy or synchronization failure is not reflected in rc.
static int RunCase(const TestCase &tc, int deviceId, aclrtStream stream) {
    int rc = 0;
    // Buffer sizes in bytes; the scalar operand is one dst-typed element.
    size_t dstFileSize = tc.dstRows * tc.dstCols * tc.dstElemSize;
    size_t maskFileSize = tc.maskRows * tc.maskCols * tc.maskElemSize;
    size_t srcFileSize = tc.srcRows * tc.srcCols * tc.srcElemSize;
    size_t scalarFileSize = tc.dstElemSize;

    std::printf("[INFO] === case: %s (dst=%zux%zu, mask=%zux%zu, src=%zux%zu, valid=%zux%zu) ===\n",
                tc.name, tc.dstRows, tc.dstCols, tc.maskRows, tc.maskCols, tc.srcRows, tc.srcCols, tc.validRows, tc.validCols);

    std::string caseDir = std::string("./") + tc.name;
    // Separate copies of the sizes, passed as the last ReadFile argument.
    // NOTE(review): presumably ReadFile may update its size argument, hence the copies - confirm.
    const size_t maskFileSizeBuf = maskFileSize;
    const size_t srcFileSizeBuf = srcFileSize;
    const size_t scalarFileSizeBuf = scalarFileSize;

    void *maskHost = nullptr, *srcHost = nullptr, *dstHost = nullptr, *scalarHost = nullptr;
    void *maskDevice = nullptr, *srcDevice = nullptr, *dstDevice = nullptr;

    aclrtMallocHost(&maskHost, maskFileSize);
    aclrtMallocHost(&srcHost, srcFileSize);
    aclrtMallocHost(&dstHost, dstFileSize);
    aclrtMallocHost(&scalarHost, scalarFileSize);

    aclrtMalloc(&maskDevice, maskFileSize, ACL_MEM_MALLOC_HUGE_FIRST);
    aclrtMalloc(&srcDevice, srcFileSize, ACL_MEM_MALLOC_HUGE_FIRST);
    aclrtMalloc(&dstDevice, dstFileSize, ACL_MEM_MALLOC_HUGE_FIRST);

    // NOTE(review): dstHost is dereferenced here without checking that the allocation succeeded.
    memset(dstHost, 0, dstFileSize);

    // Load inputs; the first failure sets rc and the remaining reads are skipped.
    if (!ReadFile(caseDir + "/mask.bin", maskFileSize, maskHost, maskFileSizeBuf)) {
        std::fprintf(stderr, "[ERROR] failed to read %s/mask.bin\n", caseDir.c_str());
        rc = 1;
    }
    if (rc == 0 && !ReadFile(caseDir + "/input1.bin", srcFileSize, srcHost, srcFileSizeBuf)) {
        std::fprintf(stderr, "[ERROR] failed to read %s/input1.bin\n", caseDir.c_str());
        rc = 1;
    }
    if (rc == 0 && !ReadFile(caseDir + "/input2.bin", scalarFileSize, scalarHost, scalarFileSizeBuf)) {
        std::fprintf(stderr, "[ERROR] failed to read %s/input2.bin\n", caseDir.c_str());
        rc = 1;
    }

    if (rc == 0) {
        aclrtMemcpy(maskDevice, maskFileSize, maskHost, maskFileSize, ACL_MEMCPY_HOST_TO_DEVICE);
        aclrtMemcpy(srcDevice, srcFileSize, srcHost, srcFileSize, ACL_MEMCPY_HOST_TO_DEVICE);

        // The scalar stays in host memory; the launcher receives the host pointer.
        tc.launch(maskDevice, srcDevice, dstDevice, scalarHost, stream);

        // Wait for the kernel before copying the result back to the host.
        aclrtSynchronizeStream(stream);
        aclrtMemcpy(dstHost, dstFileSize, dstDevice, dstFileSize, ACL_MEMCPY_DEVICE_TO_HOST);
    }

    if (rc == 0 && !WriteFile((caseDir + "/output.bin").c_str(), dstHost, dstFileSize)) {
        std::fprintf(stderr, "[ERROR] failed to write %s/output.bin\n", caseDir.c_str());
        rc = 1;
    }

    // Release every buffer on both the success and the failure path.
    if (maskDevice != nullptr) aclrtFree(maskDevice);
    if (srcDevice != nullptr) aclrtFree(srcDevice);
    if (dstDevice != nullptr) aclrtFree(dstDevice);
    if (maskHost != nullptr) aclrtFreeHost(maskHost);
    if (srcHost != nullptr) aclrtFreeHost(srcHost);
    if (dstHost != nullptr) aclrtFreeHost(dstHost);
    if (scalarHost != nullptr) aclrtFreeHost(scalarHost);

    if (rc == 0) std::printf("[INFO] case %s done\n", tc.name);
    return rc;
}

// Entry point: ./tsels [case_name]
// Without an argument every entry of kCases runs; with one, only the case
// whose name matches exactly. Stops at the first failing case and returns 1,
// otherwise returns 0. A filter that matches no case also returns 0.
int main(int argc, char *argv[]) {
    const char *caseFilter = (argc > 1) ? argv[1] : nullptr;

    int rc = 0;
    int deviceId = 0;
    aclrtStream stream = nullptr;

    aclInit(nullptr);
    // The device defaults to 0 and can be overridden through ACL_DEVICE_ID.
    if (const char *envDevice = std::getenv("ACL_DEVICE_ID")) {
        deviceId = std::atoi(envDevice);
    }
    aclrtSetDevice(deviceId);
    aclrtCreateStream(&stream);

    for (size_t i = 0; i < kNumCases; ++i) {
        if (caseFilter != nullptr && std::strcmp(kCases[i].name, caseFilter) != 0) {
            continue;
        }
        int ret = RunCase(kCases[i], deviceId, stream);
        if (ret != 0) {
            std::fprintf(stderr, "[ERROR] case %s failed\n", kCases[i].name);
            rc = 1;
            break;
        }
    }

    if (stream != nullptr) aclrtDestroyStream(stream);
    aclrtResetDevice(deviceId);
    aclFinalize();

    return rc;
}
// Kernel module for the tsels smoke cases. Each kernel follows the same
// sequence: build tensor views over the mask/src/dst pointers, take partition
// views, allocate tiles, tload mask and src, run pto.tsels with the scalar,
// then tstore the dst tile.
module attributes {pto.target_arch = "a5", pto.kernel_kind = #pto.kernel_kind} {

  // uint32 data with a uint16 mask: 2x16 mask, 2x8 src, 2x8 dst.
  func.func @TSELS_uint32_uint16_2x8_2x16_2x8_2x8(%mask_ptr: !pto.ptr, %src_ptr: !pto.ptr, %dst_ptr: !pto.ptr, %scalar: i32) attributes {pto.kernel} {
    %c0 = arith.constant 0 : index
    %c1 = arith.constant 1 : index
    %c2 = arith.constant 2 : index
    %c8 = arith.constant 8 : index
    %c16 = arith.constant 16 : index
    %c32 = arith.constant 32 : index

    // 5-D views with three leading unit dimensions; the last two dimensions are rows x cols.
    %mask_view = pto.make_tensor_view %mask_ptr, shape = [%c1, %c1, %c1, %c2, %c16], strides = [%c32, %c32, %c32, %c16, %c1] : !pto.tensor_view<1x1x1x2x16xi16>
    %src_view = pto.make_tensor_view %src_ptr, shape = [%c1, %c1, %c1, %c2, %c8], strides = [%c16, %c16, %c16, %c8, %c1] : !pto.tensor_view<1x1x1x2x8xi32>
    %dst_view = pto.make_tensor_view %dst_ptr, shape = [%c1, %c1, %c1, %c2, %c8], strides = [%c16, %c16, %c16, %c8, %c1] : !pto.tensor_view<1x1x1x2x8xi32>

    // Each partition starts at offset zero and spans the whole view.
    %mask_part = pto.partition_view %mask_view, offsets = [%c0, %c0, %c0, %c0, %c0], sizes = [%c1, %c1, %c1, %c2, %c16] : !pto.tensor_view<1x1x1x2x16xi16> -> !pto.partition_tensor_view<1x1x1x2x16xi16>
    %src_part = pto.partition_view %src_view, offsets = [%c0, %c0, %c0, %c0, %c0], sizes = [%c1, %c1, %c1, %c2, %c8] : !pto.tensor_view<1x1x1x2x8xi32> -> !pto.partition_tensor_view<1x1x1x2x8xi32>
    %dst_part = pto.partition_view %dst_view, offsets = [%c0, %c0, %c0, %c0, %c0], sizes = [%c1, %c1, %c1, %c2, %c8] : !pto.tensor_view<1x1x1x2x8xi32> -> !pto.partition_tensor_view<1x1x1x2x8xi32>

    %mask_tile = pto.alloc_tile : !pto.tile_buf
    %src_tile = pto.alloc_tile : !pto.tile_buf
    %tmp_tile = pto.alloc_tile : !pto.tile_buf
    %dst_tile = pto.alloc_tile : !pto.tile_buf

    pto.tload ins(%mask_part : !pto.partition_tensor_view<1x1x1x2x16xi16>) outs(%mask_tile : !pto.tile_buf)
    pto.tload ins(%src_part : !pto.partition_tensor_view<1x1x1x2x8xi32>) outs(%src_tile : !pto.tile_buf)
    // The i32 scalar argument is passed to tsels unchanged.
    pto.tsels ins(%mask_tile, %src_tile, %tmp_tile, %scalar : !pto.tile_buf, !pto.tile_buf, !pto.tile_buf, i32) outs(%dst_tile : !pto.tile_buf)
    pto.tstore ins(%dst_tile : !pto.tile_buf) outs(%dst_part : !pto.partition_tensor_view<1x1x1x2x8xi32>)
    return
  }


  // uint8 data with a uint8 mask: 2x64 mask, 2x128 src, 2x32 dst of which 2x31 is stored.
  func.func @TSELS_uint8_uint8_2x32_2x64_2x128_2x31(%mask_ptr: !pto.ptr, %src_ptr: !pto.ptr, %dst_ptr: !pto.ptr, %scalar: i32) attributes {pto.kernel} {
    %c0 = arith.constant 0 : index
    %c1 = arith.constant 1 : index
    %c2 = arith.constant 2 : index
    %c31 = arith.constant 31 : index
    %c32 = arith.constant 32 : index
    %c64 = arith.constant 64 : index
    %c128 = arith.constant 128 : index

    %mask_view = pto.make_tensor_view %mask_ptr, shape = [%c1, %c1, %c1, %c2, %c64], strides = [%c64, %c64, %c64, %c64, %c1] : !pto.tensor_view<1x1x1x2x64xi8>
    %src_view = pto.make_tensor_view %src_ptr, shape = [%c1, %c1, %c1, %c2, %c128], strides = [%c128, %c128, %c128, %c128, %c1] : !pto.tensor_view<1x1x1x2x128xi8>
    %dst_view = pto.make_tensor_view %dst_ptr, shape = [%c1, %c1, %c1, %c2, %c32], strides = [%c32, %c32, %c32, %c32, %c1] : !pto.tensor_view<1x1x1x2x32xi8>

    %mask_part = pto.partition_view %mask_view, offsets = [%c0, %c0, %c0, %c0, %c0], sizes = [%c1, %c1, %c1, %c2, %c64] : !pto.tensor_view<1x1x1x2x64xi8> -> !pto.partition_tensor_view<1x1x1x2x64xi8>
    %src_part = pto.partition_view %src_view, offsets = [%c0, %c0, %c0, %c0, %c0], sizes = [%c1, %c1, %c1, %c2, %c128] : !pto.tensor_view<1x1x1x2x128xi8> -> !pto.partition_tensor_view<1x1x1x2x128xi8>
    // Only 31 of the 32 dst columns are covered by the dst partition.
    %dst_part = pto.partition_view %dst_view, offsets = [%c0, %c0, %c0, %c0, %c0], sizes = [%c1, %c1, %c1, %c2, %c31] : !pto.tensor_view<1x1x1x2x32xi8> -> !pto.partition_tensor_view<1x1x1x2x31xi8>

    %mask_tile = pto.alloc_tile : !pto.tile_buf
    %src_tile = pto.alloc_tile : !pto.tile_buf
    %tmp_tile = pto.alloc_tile : !pto.tile_buf
    %dst_tile = pto.alloc_tile : !pto.tile_buf

    pto.tload ins(%mask_part : !pto.partition_tensor_view<1x1x1x2x64xi8>) outs(%mask_tile : !pto.tile_buf)
    pto.tload ins(%src_part : !pto.partition_tensor_view<1x1x1x2x128xi8>) outs(%src_tile : !pto.tile_buf)
    // The scalar arrives as i32 and is truncated to i8 to match the data type.
    %scalar_i8 = arith.trunci %scalar : i32 to i8
    pto.tsels ins(%mask_tile, %src_tile, %tmp_tile, %scalar_i8 : !pto.tile_buf, !pto.tile_buf, !pto.tile_buf, i8) outs(%dst_tile : !pto.tile_buf)
    pto.tstore ins(%dst_tile : !pto.tile_buf) outs(%dst_part : !pto.partition_tensor_view<1x1x1x2x31xi8>)
    return
  }
}
# Single source of truth for the tshl ST cases; gen_data.py and compare.py
# both import CASES from here.
#
# Keys of every case, in order:
#   name        - case identifier, also used as the per-case subdirectory name
#   dtype       - numpy dtype of the data
#   shape       - (rows, cols) allocated tile dimensions
#   valid_shape - (valid_rows, valid_cols) effective computation region
#   eps         - tolerance used when comparing against the golden data
CASES = [
    dict(name="i32_16x64", dtype=np.int32, shape=(16, 64), valid_shape=(16, 64), eps=0),
    dict(name="i32_32x32", dtype=np.int32, shape=(32, 32), valid_shape=(32, 32), eps=0),
]

# The smoke suite keeps only the cases named below; naming a case that is not
# defined above is treated as a configuration error.
_SMOKE_CASE_NAMES = ['i32_16x64', 'i32_32x32']
_SMOKE_CASE_NAME_SET = set(_SMOKE_CASE_NAMES)
_known_case_names = {case["name"] for case in CASES}
_missing = [name for name in _SMOKE_CASE_NAMES if name not in _known_case_names]
if _missing:
    raise RuntimeError("unknown smoke case(s): " + ", ".join(_missing))
CASES = [case for case in CASES if case["name"] in _SMOKE_CASE_NAME_SET]
def main():
    """Compare each tshl case's output.bin against its golden.bin.

    An optional first command-line argument restricts the run to the case
    with that exact name. Both files are read from the directory named after
    the case, reshaped to the case's ``shape``, and only the ``valid_shape``
    region is compared. Exits with status 2 when any compared case fails.
    """
    validate_cases(CASES)
    wanted = None if len(sys.argv) <= 1 else sys.argv[1]

    failed = False
    for entry in CASES:
        name = entry["name"]
        if wanted is not None and name != wanted:
            continue

        full_shape = entry["shape"]
        rows, cols = entry["valid_shape"]

        # golden.bin is read before output.bin, matching the order they are reported in.
        golden, output = (
            np.fromfile(os.path.join(name, filename), dtype=entry["dtype"]).reshape(full_shape)
            for filename in ("golden.bin", "output.bin")
        )

        if result_cmp(golden[:rows, :cols], output[:rows, :cols], entry["eps"]):
            print(style_pass(f"[INFO] {name}: compare passed"))
            continue

        print(style_fail(f"[ERROR] {name}: compare failed"))
        failed = True

    if failed:
        sys.exit(2)
    print(style_pass("[INFO] all cases passed"))
# Generate input1 / input2 / golden data for every tshl case.
validate_cases(CASES)

for case in CASES:
    setup_case_rng(case)

    dtype = case["dtype"]
    shape = case["shape"]
    valid_shape = case["valid_shape"]

    # Values to shift are drawn from [1, 100) and shift amounts from [0, 8).
    input1 = np.random.randint(1, 100, size=shape).astype(dtype)
    input2 = np.random.randint(0, 8, size=shape).astype(dtype)

    # Only the valid region holds shifted values; the rest of golden stays zero.
    valid_rows, valid_cols = valid_shape
    region = np.s_[:valid_rows, :valid_cols]
    golden = np.zeros(shape, dtype=dtype)
    golden[region] = np.left_shift(input1[region], input2[region]).astype(dtype, copy=False)

    save_case_data(case["name"], {"input1": input1, "input2": input2, "golden": golden})
    print(f"[INFO] gen_data: {case['name']} shape={shape} valid_shape={valid_shape} dtype={dtype.__name__}")
// Fallback definition of the AICORE qualifier when the toolchain has not provided one.
#ifndef AICORE
#define AICORE [aicore]
#endif

// Device kernel entry points for both tshl cases (i32 16x64 and i32 32x32),
// declared here and defined elsewhere. Each takes two input buffers and one output buffer.

extern "C" __global__ AICORE void TSHL_i32_16x64(__gm__ int32_t *a, __gm__ int32_t *b, __gm__ int32_t *c);
extern "C" __global__ AICORE void TSHL_i32_32x32(__gm__ int32_t *a, __gm__ int32_t *b, __gm__ int32_t *c);

// Host-side launcher for the i32 32x32 case: forwards the three buffers to the kernel on the given stream.
// NOTE(review): the first launch argument (1) is presumably the block count - confirm.
void LaunchTSHL_i32_32x32(int32_t *a, int32_t *b, int32_t *c, void *stream) {
    TSHL_i32_32x32<<<1, nullptr, stream>>>((__gm__ int32_t *)a, (__gm__ int32_t *)b, (__gm__ int32_t *)c);
}

// Host-side launcher for the i32 16x64 case: forwards the three buffers to the kernel on the given stream.
void LaunchTSHL_i32_16x64(int32_t *a, int32_t *b, int32_t *c, void *stream) {
    TSHL_i32_16x64<<<1, nullptr, stream>>>((__gm__ int32_t *)a, (__gm__ int32_t *)b, (__gm__ int32_t *)c);
}
+ +#include "acl/acl.h" +#include "test_common.h" +#include +#include +#include +#include +#include +#include + +using namespace PtoTestCommon; + +// Kernel launch wrappers (defined in launch.cpp) +void LaunchTSHL_i32_16x64(int32_t *a, int32_t *b, int32_t *c, void *stream); +void LaunchTSHL_i32_32x32(int32_t *a, int32_t *b, int32_t *c, void *stream); + +using LaunchFn = void (*)(int32_t *, int32_t *, int32_t *, void *); + +struct TestCase { + const char *name; + LaunchFn launch; + size_t rows; // allocated tile rows + size_t cols; // allocated tile cols + size_t validRows; // effective computation rows (<= rows) + size_t validCols; // effective computation cols (<= cols) + size_t elemSize; // bytes per element +}; + +static const TestCase kCases[] = { +{"i32_16x64", LaunchTSHL_i32_16x64, 16, 64, 16, 64, sizeof(int32_t)}, +{"i32_32x32", LaunchTSHL_i32_32x32, 32, 32, 32, 32, sizeof(int32_t)}, +}; +static constexpr size_t kNumCases = sizeof(kCases) / sizeof(kCases[0]); + +static int RunCase(const TestCase &tc, int deviceId, aclrtStream stream) { + int rc = 0; + const size_t elemCount = tc.rows * tc.cols; + const size_t fileSize = elemCount * tc.elemSize; + + std::printf("[INFO] === case: %s (shape=%zux%zu, valid=%zux%zu) ===\n", + tc.name, tc.rows, tc.cols, tc.validRows, tc.validCols); + + // Per-case data directory + std::string caseDir = std::string("./") + tc.name; + size_t src0FileSize = fileSize; + size_t src1FileSize = fileSize; + + int32_t *src0Host = nullptr, *src1Host = nullptr, *dstHost = nullptr; + int32_t *src0Device = nullptr, *src1Device = nullptr, *dstDevice = nullptr; + + aclrtMallocHost((void **)(&src0Host), fileSize); + aclrtMallocHost((void **)(&src1Host), fileSize); + aclrtMallocHost((void **)(&dstHost), fileSize); + + aclrtMalloc((void **)&src0Device, fileSize, ACL_MEM_MALLOC_HUGE_FIRST); + aclrtMalloc((void **)&src1Device, fileSize, ACL_MEM_MALLOC_HUGE_FIRST); + aclrtMalloc((void **)&dstDevice, fileSize, ACL_MEM_MALLOC_HUGE_FIRST); + + if 
(!ReadFile((caseDir + "/input1.bin").c_str(), src0FileSize, src0Host, fileSize)) { + std::fprintf(stderr, "[ERROR] failed to read %s/input1.bin\n", caseDir.c_str()); + rc = 1; + } + if (rc == 0 && !ReadFile((caseDir + "/input2.bin").c_str(), src1FileSize, src1Host, fileSize)) { + std::fprintf(stderr, "[ERROR] failed to read %s/input2.bin\n", caseDir.c_str()); + rc = 1; + } + + if (rc == 0) { + aclrtMemcpy(src0Device, fileSize, src0Host, fileSize, ACL_MEMCPY_HOST_TO_DEVICE); + aclrtMemcpy(src1Device, fileSize, src1Host, fileSize, ACL_MEMCPY_HOST_TO_DEVICE); + + tc.launch(src0Device, src1Device, dstDevice, stream); + + aclrtSynchronizeStream(stream); + aclrtMemcpy(dstHost, fileSize, dstDevice, fileSize, ACL_MEMCPY_DEVICE_TO_HOST); + } + + if (rc == 0 && !WriteFile((caseDir + "/output.bin").c_str(), dstHost, fileSize)) { + std::fprintf(stderr, "[ERROR] failed to write %s/output.bin\n", caseDir.c_str()); + rc = 1; + } + + if (src0Device != nullptr) + aclrtFree(src0Device); + if (src1Device != nullptr) + aclrtFree(src1Device); + if (dstDevice != nullptr) + aclrtFree(dstDevice); + if (src0Host != nullptr) + aclrtFreeHost(src0Host); + if (src1Host != nullptr) + aclrtFreeHost(src1Host); + if (dstHost != nullptr) + aclrtFreeHost(dstHost); + + if (rc == 0) + std::printf("[INFO] case %s done\n", tc.name); + return rc; +} + +int main(int argc, char *argv[]) { + // Optional case filter: ./tshl [case_name] + const char *caseFilter = (argc > 1) ? 
argv[1] : nullptr; + + int rc = 0; + int deviceId = 0; + aclrtStream stream = nullptr; + + aclInit(nullptr); + if (const char *envDevice = std::getenv("ACL_DEVICE_ID")) { + deviceId = std::atoi(envDevice); + } + aclrtSetDevice(deviceId); + aclrtCreateStream(&stream); + + for (size_t i = 0; i < kNumCases; ++i) { + if (caseFilter != nullptr && std::strcmp(kCases[i].name, caseFilter) != 0) { + continue; + } + int ret = RunCase(kCases[i], deviceId, stream); + if (ret != 0) { + std::fprintf(stderr, "[ERROR] case %s failed\n", kCases[i].name); + rc = 1; + break; + } + } + + if (stream != nullptr) + aclrtDestroyStream(stream); + aclrtResetDevice(deviceId); + aclFinalize(); + + return rc; +} diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tshl/tshl.pto b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tshl/tshl.pto new file mode 100644 index 000000000..9077a8bb3 --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tshl/tshl.pto @@ -0,0 +1,124 @@ +// Copyright (c) 2026 Huawei Technologies Co., Ltd. +// This program is free software, you can redistribute it and/or modify it under the terms and conditions of +// CANN Open Software License Agreement Version 2.0 (the "License"). +// Please refer to the License for details. You may not use this file except in compliance with the License. +// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +// See LICENSE in the root of the software repository for the full text of the License. + +// TileLang ST kernels for pto.tshl: tload(a) + tload(b) + tshl(a,b)->c + tstore(c). +// Multiple cases with different shapes in a single module. +// Compiled by ptoas --enable-insert-sync --enable-tile-op-expand --pto-backend=vpto +// to produce a fatobj object. 
+ +module attributes {pto.target_arch = "a5", pto.kernel_kind = #pto.kernel_kind} { + // Case 0: i32 16x64 (1024 elements) + func.func @TSHL_i32_16x64(%a_ptr: !pto.ptr, %b_ptr: !pto.ptr, %c_ptr: !pto.ptr) attributes {pto.kernel} { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c16 = arith.constant 16 : index + %c64 = arith.constant 64 : index + %c1024 = arith.constant 1024 : index + + %a_view = pto.make_tensor_view %a_ptr, + shape = [%c1, %c1, %c1, %c16, %c64], + strides = [%c1024, %c1024, %c1024, %c64, %c1] + : !pto.tensor_view<1x1x1x16x64xi32> + %b_view = pto.make_tensor_view %b_ptr, + shape = [%c1, %c1, %c1, %c16, %c64], + strides = [%c1024, %c1024, %c1024, %c64, %c1] + : !pto.tensor_view<1x1x1x16x64xi32> + %c_view = pto.make_tensor_view %c_ptr, + shape = [%c1, %c1, %c1, %c16, %c64], + strides = [%c1024, %c1024, %c1024, %c64, %c1] + : !pto.tensor_view<1x1x1x16x64xi32> + + %a_part = pto.partition_view %a_view, + offsets = [%c0, %c0, %c0, %c0, %c0], + sizes = [%c1, %c1, %c1, %c16, %c64] + : !pto.tensor_view<1x1x1x16x64xi32> -> !pto.partition_tensor_view<1x1x1x16x64xi32> + %b_part = pto.partition_view %b_view, + offsets = [%c0, %c0, %c0, %c0, %c0], + sizes = [%c1, %c1, %c1, %c16, %c64] + : !pto.tensor_view<1x1x1x16x64xi32> -> !pto.partition_tensor_view<1x1x1x16x64xi32> + %c_part = pto.partition_view %c_view, + offsets = [%c0, %c0, %c0, %c0, %c0], + sizes = [%c1, %c1, %c1, %c16, %c64] + : !pto.tensor_view<1x1x1x16x64xi32> -> !pto.partition_tensor_view<1x1x1x16x64xi32> + + %a = pto.alloc_tile + : !pto.tile_buf + %b = pto.alloc_tile + : !pto.tile_buf + %c = pto.alloc_tile + : !pto.tile_buf + + pto.tload ins(%a_part : !pto.partition_tensor_view<1x1x1x16x64xi32>) + outs(%a : !pto.tile_buf) + pto.tload ins(%b_part : !pto.partition_tensor_view<1x1x1x16x64xi32>) + outs(%b : !pto.tile_buf) + + pto.tshl ins(%a, %b : !pto.tile_buf, + !pto.tile_buf) + outs(%c : !pto.tile_buf) + + pto.tstore ins(%c : !pto.tile_buf) + outs(%c_part : 
!pto.partition_tensor_view<1x1x1x16x64xi32>) + return + } + + // Case 1: i32 32x32 (1024 elements) + + func.func @TSHL_i32_32x32(%a_ptr: !pto.ptr, %b_ptr: !pto.ptr, %c_ptr: !pto.ptr) attributes {pto.kernel} { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c32 = arith.constant 32 : index + %c1024 = arith.constant 1024 : index + + %a_view = pto.make_tensor_view %a_ptr, + shape = [%c1, %c1, %c1, %c32, %c32], + strides = [%c1024, %c1024, %c1024, %c32, %c1] + : !pto.tensor_view<1x1x1x32x32xi32> + %b_view = pto.make_tensor_view %b_ptr, + shape = [%c1, %c1, %c1, %c32, %c32], + strides = [%c1024, %c1024, %c1024, %c32, %c1] + : !pto.tensor_view<1x1x1x32x32xi32> + %c_view = pto.make_tensor_view %c_ptr, + shape = [%c1, %c1, %c1, %c32, %c32], + strides = [%c1024, %c1024, %c1024, %c32, %c1] + : !pto.tensor_view<1x1x1x32x32xi32> + + %a_part = pto.partition_view %a_view, + offsets = [%c0, %c0, %c0, %c0, %c0], + sizes = [%c1, %c1, %c1, %c32, %c32] + : !pto.tensor_view<1x1x1x32x32xi32> -> !pto.partition_tensor_view<1x1x1x32x32xi32> + %b_part = pto.partition_view %b_view, + offsets = [%c0, %c0, %c0, %c0, %c0], + sizes = [%c1, %c1, %c1, %c32, %c32] + : !pto.tensor_view<1x1x1x32x32xi32> -> !pto.partition_tensor_view<1x1x1x32x32xi32> + %c_part = pto.partition_view %c_view, + offsets = [%c0, %c0, %c0, %c0, %c0], + sizes = [%c1, %c1, %c1, %c32, %c32] + : !pto.tensor_view<1x1x1x32x32xi32> -> !pto.partition_tensor_view<1x1x1x32x32xi32> + + %a = pto.alloc_tile + : !pto.tile_buf + %b = pto.alloc_tile + : !pto.tile_buf + %c = pto.alloc_tile + : !pto.tile_buf + + pto.tload ins(%a_part : !pto.partition_tensor_view<1x1x1x32x32xi32>) + outs(%a : !pto.tile_buf) + pto.tload ins(%b_part : !pto.partition_tensor_view<1x1x1x32x32xi32>) + outs(%b : !pto.tile_buf) + + pto.tshl ins(%a, %b : !pto.tile_buf, + !pto.tile_buf) + outs(%c : !pto.tile_buf) + + pto.tstore ins(%c : !pto.tile_buf) + outs(%c_part : !pto.partition_tensor_view<1x1x1x32x32xi32>) + return + } +} diff --git 
a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tshls/CMakeLists.txt b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tshls/CMakeLists.txt new file mode 100644 index 000000000..ae8289e40 --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tshls/CMakeLists.txt @@ -0,0 +1,9 @@ +# Copyright (c) 2026 Huawei Technologies Co., Ltd. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. + +pto_tilelang_vec_st(tshls) diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tshls/cases.py b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tshls/cases.py new file mode 100644 index 000000000..30222ef97 --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tshls/cases.py @@ -0,0 +1,49 @@ +# Copyright (c) 2026 Huawei Technologies Co., Ltd. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. 
+ +# coding=utf-8 + +import numpy as np + +CASES = [ + { + "name": "i32_32x64", + "dtype": np.int32, + "shape": (32, 64), + "valid_shape": (32, 64), + "eps": 0, + }, + { + "name": "i16_63x64", + "dtype": np.int16, + "shape": (63, 64), + "valid_shape": (63, 64), + "eps": 0, + }, + { + "name": "i32_31x128", + "dtype": np.int32, + "shape": (31, 128), + "valid_shape": (31, 128), + "eps": 0, + }, + { + "name": "i16_15x192", + "dtype": np.int16, + "shape": (15, 192), + "valid_shape": (15, 192), + "eps": 0, + }, +] + +_SMOKE_CASE_NAMES = ['i32_32x64', 'i16_15x192'] +_SMOKE_CASE_NAME_SET = set(_SMOKE_CASE_NAMES) +_missing = [name for name in _SMOKE_CASE_NAMES if name not in {case["name"] for case in CASES}] +if _missing: + raise RuntimeError("unknown smoke case(s): " + ", ".join(_missing)) +CASES = [case for case in CASES if case["name"] in _SMOKE_CASE_NAME_SET] diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tshls/compare.py b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tshls/compare.py new file mode 100644 index 000000000..50186777e --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tshls/compare.py @@ -0,0 +1,46 @@ +# Copyright (c) 2026 Huawei Technologies Co., Ltd. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. 
+ +# coding=utf-8 + +import os +import sys +import numpy as np + +from cases import CASES +from st_common import result_cmp, style_fail, style_pass, validate_cases + +def main(): + validate_cases(CASES) + case_filter = sys.argv[1] if len(sys.argv) > 1 else None + + all_passed = True + for case in CASES: + if case_filter is not None and case["name"] != case_filter: + continue + + case_dir = case["name"] + shape = case["shape"] + vr, vc = case["valid_shape"] + + golden = np.fromfile(os.path.join(case_dir, "golden.bin"), dtype=case["dtype"]).reshape(shape) + output = np.fromfile(os.path.join(case_dir, "output.bin"), dtype=case["dtype"]).reshape(shape) + + ok = result_cmp(golden[:vr, :vc], output[:vr, :vc], case["eps"]) + if ok: + print(style_pass(f"[INFO] {case['name']}: compare passed")) + else: + print(style_fail(f"[ERROR] {case['name']}: compare failed")) + all_passed = False + + if not all_passed: + sys.exit(2) + print(style_pass("[INFO] all cases passed")) + +if __name__ == "__main__": + main() diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tshls/gen_data.py b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tshls/gen_data.py new file mode 100644 index 000000000..9b4624bfc --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tshls/gen_data.py @@ -0,0 +1,34 @@ +# Copyright (c) 2026 Huawei Technologies Co., Ltd. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. 
+ +# coding=utf-8 + +import numpy as np +from cases import CASES +from st_common import validate_cases, setup_case_rng, save_case_data + +# Scalar value for left shift (must match launch.cpp) +SCALAR = 2 + +validate_cases(CASES) + +for case in CASES: + setup_case_rng(case) + + dtype = case["dtype"] + shape = case["shape"] + valid_shape = case["valid_shape"] + + input1 = np.random.randint(1, 10, size=shape).astype(dtype) + + golden = np.zeros(shape, dtype=dtype) + vr, vc = valid_shape + golden[:vr, :vc] = (input1[:vr, :vc] << SCALAR).astype(dtype, copy=False) + + save_case_data(case["name"], {"input1": input1, "golden": golden}) + print(f"[INFO] gen_data: {case['name']} shape={shape} valid_shape={valid_shape} dtype={dtype.__name__} scalar={SCALAR}") diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tshls/launch.cpp b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tshls/launch.cpp new file mode 100644 index 000000000..dbd920193 --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tshls/launch.cpp @@ -0,0 +1,30 @@ +// Copyright (c) 2026 Huawei Technologies Co., Ltd. +// This program is free software, you can redistribute it and/or modify it under the terms and conditions of +// CANN Open Software License Agreement Version 2.0 (the "License"). +// Please refer to the License for details. You may not use this file except in compliance with the License. +// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +// See LICENSE in the root of the software repository for the full text of the License. 
+ +#include + +#ifndef AICORE +#define AICORE [aicore] +#endif + +// Scalar value for left shift (must match gen_data.py SCALAR) +static constexpr int16_t TSHLS_SCALAR = 2; + +// Case 0: i32 32x64 + +extern "C" __global__ AICORE void TSHLS_i32_32x64(__gm__ int32_t *src, __gm__ int32_t *dst, int16_t scalar); +extern "C" __global__ AICORE void TSHLS_i16_15x192(__gm__ int16_t *src, __gm__ int16_t *dst, int16_t scalar); + +void LaunchTSHLS_i16_15x192(int16_t *src, int16_t *dst, void *stream) { + TSHLS_i16_15x192<<<1, nullptr, stream>>>((__gm__ int16_t *)src, (__gm__ int16_t *)dst, TSHLS_SCALAR); +} + + +void LaunchTSHLS_i32_32x64(int32_t *src, int32_t *dst, void *stream) { + TSHLS_i32_32x64<<<1, nullptr, stream>>>((__gm__ int32_t *)src, (__gm__ int32_t *)dst, TSHLS_SCALAR); +} diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tshls/main.cpp b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tshls/main.cpp new file mode 100644 index 000000000..6e0944bb5 --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tshls/main.cpp @@ -0,0 +1,132 @@ +// Copyright (c) 2026 Huawei Technologies Co., Ltd. +// This program is free software, you can redistribute it and/or modify it under the terms and conditions of +// CANN Open Software License Agreement Version 2.0 (the "License"). +// Please refer to the License for details. You may not use this file except in compliance with the License. +// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +// See LICENSE in the root of the software repository for the full text of the License. + +// Host driver for TileLang tshls ST — case-table driven. +// tshls: dst = src << scalar (single input + scalar, left shift). +// Numerical comparison is done externally by compare.py. 
+ +#include "acl/acl.h" +#include "test_common.h" +#include +#include +#include +#include +#include +#include + +using namespace PtoTestCommon; + +// Kernel launch wrappers (defined in launch.cpp) +void LaunchTSHLS_i32_32x64(int32_t *src, int32_t *dst, void *stream); +void LaunchTSHLS_i16_63x64(int16_t *src, int16_t *dst, void *stream); +void LaunchTSHLS_i16_15x192(int16_t *src, int16_t *dst, void *stream); + +struct TestCase { + const char *name; + void (*launch)(void *, void *, void *); // src, dst, stream + size_t rows; // allocated tile rows + size_t cols; // allocated tile cols + size_t validRows; // effective computation rows (<= rows) + size_t validCols; // effective computation cols (<= cols) + size_t elemSize; // bytes per element +}; + +static const TestCase kCases[] = { +{"i32_32x64", (void (*)(void*,void*,void*))LaunchTSHLS_i32_32x64, 32, 64, 32, 64, sizeof(int32_t)}, +{"i16_15x192", (void (*)(void*,void*,void*))LaunchTSHLS_i16_15x192, 15, 192, 15, 192, sizeof(int16_t)}, +}; +static constexpr size_t kNumCases = sizeof(kCases) / sizeof(kCases[0]); + +static int RunCase(const TestCase &tc, int deviceId, aclrtStream stream) { + int rc = 0; + const size_t elemCount = tc.rows * tc.cols; + const size_t fileSize = elemCount * tc.elemSize; + + std::printf("[INFO] === case: %s (shape=%zux%zu, valid=%zux%zu) ===\n", + tc.name, tc.rows, tc.cols, tc.validRows, tc.validCols); + + // Per-case data directory + std::string caseDir = std::string("./") + tc.name; + size_t srcFileSize = fileSize; + + void *srcHost = nullptr, *dstHost = nullptr; + void *srcDevice = nullptr, *dstDevice = nullptr; + + aclrtMallocHost(&srcHost, fileSize); + aclrtMallocHost(&dstHost, fileSize); + + aclrtMalloc(&srcDevice, fileSize, ACL_MEM_MALLOC_HUGE_FIRST); + aclrtMalloc(&dstDevice, fileSize, ACL_MEM_MALLOC_HUGE_FIRST); + + if (!ReadFile((caseDir + "/input1.bin").c_str(), srcFileSize, srcHost, fileSize)) { + std::fprintf(stderr, "[ERROR] failed to read %s/input1.bin\n", caseDir.c_str()); + 
rc = 1; + } + + if (rc == 0) { + aclrtMemcpy(srcDevice, fileSize, srcHost, fileSize, ACL_MEMCPY_HOST_TO_DEVICE); + + tc.launch(srcDevice, dstDevice, stream); + + aclrtSynchronizeStream(stream); + aclrtMemcpy(dstHost, fileSize, dstDevice, fileSize, ACL_MEMCPY_DEVICE_TO_HOST); + } + + if (rc == 0 && !WriteFile((caseDir + "/output.bin").c_str(), dstHost, fileSize)) { + std::fprintf(stderr, "[ERROR] failed to write %s/output.bin\n", caseDir.c_str()); + rc = 1; + } + + if (srcDevice != nullptr) + aclrtFree(srcDevice); + if (dstDevice != nullptr) + aclrtFree(dstDevice); + if (srcHost != nullptr) + aclrtFreeHost(srcHost); + if (dstHost != nullptr) + aclrtFreeHost(dstHost); + + if (rc == 0) + std::printf("[INFO] case %s done\n", tc.name); + return rc; +} + +int main(int argc, char *argv[]) { + // Optional case filter: ./tshls [case_name] + const char *caseFilter = (argc > 1) ? argv[1] : nullptr; + + int rc = 0; + int deviceId = 0; + aclrtStream stream = nullptr; + + aclInit(nullptr); + if (const char *envDevice = std::getenv("ACL_DEVICE_ID")) { + deviceId = std::atoi(envDevice); + } + aclrtSetDevice(deviceId); + aclrtCreateStream(&stream); + + for (size_t i = 0; i < kNumCases; ++i) { + if (caseFilter != nullptr && std::strcmp(kCases[i].name, caseFilter) != 0) { + continue; + } + int ret = RunCase(kCases[i], deviceId, stream); + if (ret != 0) { + std::fprintf(stderr, "[ERROR] case %s failed\n", kCases[i].name); + rc = 1; + break; + } + } + + if (stream != nullptr) + aclrtDestroyStream(stream); + aclrtResetDevice(deviceId); + aclFinalize(); + + return rc; +} diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tshls/tshls.pto b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tshls/tshls.pto new file mode 100644 index 000000000..34db08016 --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tshls/tshls.pto @@ -0,0 +1,96 @@ +// Copyright (c) 2026 Huawei Technologies Co., Ltd. 
+// This program is free software, you can redistribute it and/or modify it under the terms and conditions of +// CANN Open Software License Agreement Version 2.0 (the "License"). +// Please refer to the License for details. You may not use this file except in compliance with the License. +// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +// See LICENSE in the root of the software repository for the full text of the License. + +// TileLang ST kernels for pto.tshls: tload(src) + tshls(src, scalar)->dst + tstore(dst). +// Multiple cases with different shapes/dtypes in a single module. +// Compiled by ptoas --enable-insert-sync --enable-tile-op-expand --pto-backend=vpto +// to produce a fatobj object. + +module attributes {pto.target_arch = "a5", pto.kernel_kind = #pto.kernel_kind} { + + // Case 0: i32 32x64 (2048 elements) + func.func @TSHLS_i32_32x64(%src_ptr: !pto.ptr, %dst_ptr: !pto.ptr, %scalar: i16) attributes {pto.kernel} { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c32 = arith.constant 32 : index + %c64 = arith.constant 64 : index + %c2048 = arith.constant 2048 : index + + %src_view = pto.make_tensor_view %src_ptr, + shape = [%c1, %c1, %c1, %c32, %c64], + strides = [%c2048, %c2048, %c2048, %c64, %c1] + : !pto.tensor_view<1x1x1x32x64xi32> + %dst_view = pto.make_tensor_view %dst_ptr, + shape = [%c1, %c1, %c1, %c32, %c64], + strides = [%c2048, %c2048, %c2048, %c64, %c1] + : !pto.tensor_view<1x1x1x32x64xi32> + + %src_part = pto.partition_view %src_view, + offsets = [%c0, %c0, %c0, %c0, %c0], + sizes = [%c1, %c1, %c1, %c32, %c64] + : !pto.tensor_view<1x1x1x32x64xi32> -> !pto.partition_tensor_view<1x1x1x32x64xi32> + %dst_part = pto.partition_view %dst_view, + offsets = [%c0, %c0, %c0, %c0, %c0], + sizes = [%c1, %c1, %c1, %c32, %c64] + : !pto.tensor_view<1x1x1x32x64xi32> -> 
!pto.partition_tensor_view<1x1x1x32x64xi32> + + %src = pto.alloc_tile + : !pto.tile_buf + %dst = pto.alloc_tile + : !pto.tile_buf + + pto.tload ins(%src_part : !pto.partition_tensor_view<1x1x1x32x64xi32>) + outs(%src : !pto.tile_buf) + pto.tshls ins(%src, %scalar : !pto.tile_buf, i16) + outs(%dst : !pto.tile_buf) + pto.tstore ins(%dst : !pto.tile_buf) + outs(%dst_part : !pto.partition_tensor_view<1x1x1x32x64xi32>) + return + } + + // Case 1: i16 15x192 (2880 elements) + + func.func @TSHLS_i16_15x192(%src_ptr: !pto.ptr, %dst_ptr: !pto.ptr, %scalar: i16) attributes {pto.kernel} { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c15 = arith.constant 15 : index + %c192 = arith.constant 192 : index + %c2880 = arith.constant 2880 : index + + %src_view = pto.make_tensor_view %src_ptr, + shape = [%c1, %c1, %c1, %c15, %c192], + strides = [%c2880, %c2880, %c2880, %c192, %c1] + : !pto.tensor_view<1x1x1x15x192xi16> + %dst_view = pto.make_tensor_view %dst_ptr, + shape = [%c1, %c1, %c1, %c15, %c192], + strides = [%c2880, %c2880, %c2880, %c192, %c1] + : !pto.tensor_view<1x1x1x15x192xi16> + + %src_part = pto.partition_view %src_view, + offsets = [%c0, %c0, %c0, %c0, %c0], + sizes = [%c1, %c1, %c1, %c15, %c192] + : !pto.tensor_view<1x1x1x15x192xi16> -> !pto.partition_tensor_view<1x1x1x15x192xi16> + %dst_part = pto.partition_view %dst_view, + offsets = [%c0, %c0, %c0, %c0, %c0], + sizes = [%c1, %c1, %c1, %c15, %c192] + : !pto.tensor_view<1x1x1x15x192xi16> -> !pto.partition_tensor_view<1x1x1x15x192xi16> + + %src = pto.alloc_tile + : !pto.tile_buf + %dst = pto.alloc_tile + : !pto.tile_buf + + pto.tload ins(%src_part : !pto.partition_tensor_view<1x1x1x15x192xi16>) + outs(%src : !pto.tile_buf) + pto.tshls ins(%src, %scalar : !pto.tile_buf, i16) + outs(%dst : !pto.tile_buf) + pto.tstore ins(%dst : !pto.tile_buf) + outs(%dst_part : !pto.partition_tensor_view<1x1x1x15x192xi16>) + return + } +} diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tshr/CMakeLists.txt 
b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tshr/CMakeLists.txt new file mode 100644 index 000000000..b493ac4ae --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tshr/CMakeLists.txt @@ -0,0 +1,9 @@ +# Copyright (c) 2026 Huawei Technologies Co., Ltd. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. + +pto_tilelang_vec_st(tshr) diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tshr/cases.py b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tshr/cases.py new file mode 100644 index 000000000..9dfe0ce43 --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tshr/cases.py @@ -0,0 +1,47 @@ +# Copyright (c) 2026 Huawei Technologies Co., Ltd. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. + +# coding=utf-8 + +"""Single source of truth for tshr ST test cases. + +Each case defines: + - name: case identifier, used as subdirectory name and by main.cpp kCases[]. 
+ - dtype: numpy dtype (e.g. np.int32). + - shape: (rows, cols) — allocated tile dimensions. + - valid_shape: (valid_rows, valid_cols) — effective computation region. + - eps: tolerance for numpy.allclose (atol and rtol). + +gen_data.py and compare.py both import this list to avoid redundant definitions. +""" + +import numpy as np + +CASES = [ + { + "name": "i32_16x64", + "dtype": np.int32, + "shape": (16, 64), + "valid_shape": (16, 64), + "eps": 0, + }, + { + "name": "i32_32x32", + "dtype": np.int32, + "shape": (32, 32), + "valid_shape": (32, 32), + "eps": 0, + }, +] + +_SMOKE_CASE_NAMES = ['i32_16x64', 'i32_32x32'] +_SMOKE_CASE_NAME_SET = set(_SMOKE_CASE_NAMES) +_missing = [name for name in _SMOKE_CASE_NAMES if name not in {case["name"] for case in CASES}] +if _missing: + raise RuntimeError("unknown smoke case(s): " + ", ".join(_missing)) +CASES = [case for case in CASES if case["name"] in _SMOKE_CASE_NAME_SET] diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tshr/compare.py b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tshr/compare.py new file mode 100644 index 000000000..6a4d5d1aa --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tshr/compare.py @@ -0,0 +1,48 @@ +# Copyright (c) 2026 Huawei Technologies Co., Ltd. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. 
+ +# coding=utf-8 + +import os +import sys +import numpy as np + +from cases import CASES +from st_common import result_cmp, style_fail, style_pass, validate_cases + + +def main(): + validate_cases(CASES) + case_filter = sys.argv[1] if len(sys.argv) > 1 else None + + all_passed = True + for case in CASES: + if case_filter is not None and case["name"] != case_filter: + continue + + case_dir = case["name"] + shape = case["shape"] + vr, vc = case["valid_shape"] + + golden = np.fromfile(os.path.join(case_dir, "golden.bin"), dtype=case["dtype"]).reshape(shape) + output = np.fromfile(os.path.join(case_dir, "output.bin"), dtype=case["dtype"]).reshape(shape) + + ok = result_cmp(golden[:vr, :vc], output[:vr, :vc], case["eps"]) + if ok: + print(style_pass(f"[INFO] {case['name']}: compare passed")) + else: + print(style_fail(f"[ERROR] {case['name']}: compare failed")) + all_passed = False + + if not all_passed: + sys.exit(2) + print(style_pass("[INFO] all cases passed")) + + +if __name__ == "__main__": + main() diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tshr/gen_data.py b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tshr/gen_data.py new file mode 100644 index 000000000..61c25fc43 --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tshr/gen_data.py @@ -0,0 +1,32 @@ +# Copyright (c) 2026 Huawei Technologies Co., Ltd. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. 
+ +# coding=utf-8 + +import numpy as np +from cases import CASES +from st_common import validate_cases, setup_case_rng, save_case_data + +validate_cases(CASES) + +for case in CASES: + setup_case_rng(case) + + dtype = case["dtype"] + shape = case["shape"] + valid_shape = case["valid_shape"] + + input1 = np.random.randint(1, 100, size=shape).astype(dtype) + input2 = np.random.randint(0, 8, size=shape).astype(dtype) + + golden = np.zeros(shape, dtype=dtype) + vr, vc = valid_shape + golden[:vr, :vc] = (input1[:vr, :vc] >> input2[:vr, :vc]).astype(dtype, copy=False) + + save_case_data(case["name"], {"input1": input1, "input2": input2, "golden": golden}) + print(f"[INFO] gen_data: {case['name']} shape={shape} valid_shape={valid_shape} dtype={dtype.__name__}") diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tshr/launch.cpp b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tshr/launch.cpp new file mode 100644 index 000000000..32d5a4289 --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tshr/launch.cpp @@ -0,0 +1,28 @@ +// Copyright (c) 2026 Huawei Technologies Co., Ltd. +// This program is free software, you can redistribute it and/or modify it under the terms and conditions of +// CANN Open Software License Agreement Version 2.0 (the "License"). +// Please refer to the License for details. You may not use this file except in compliance with the License. +// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +// See LICENSE in the root of the software repository for the full text of the License. 
+ +#include + +#ifndef AICORE +#define AICORE [aicore] +#endif + +// Case 0: i32 16x64 + +extern "C" __global__ AICORE void TSHR_i32_16x64(__gm__ int32_t *a, __gm__ int32_t *b, __gm__ int32_t *c); +extern "C" __global__ AICORE void TSHR_i32_32x32(__gm__ int32_t *a, __gm__ int32_t *b, __gm__ int32_t *c); + +void LaunchTSHR_i32_16x64(int32_t *a, int32_t *b, int32_t *c, void *stream) { + TSHR_i32_16x64<<<1, nullptr, stream>>>((__gm__ int32_t *)a, (__gm__ int32_t *)b, (__gm__ int32_t *)c); +} + + + +void LaunchTSHR_i32_32x32(int32_t *a, int32_t *b, int32_t *c, void *stream) { + TSHR_i32_32x32<<<1, nullptr, stream>>>((__gm__ int32_t *)a, (__gm__ int32_t *)b, (__gm__ int32_t *)c); +} diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tshr/main.cpp b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tshr/main.cpp new file mode 100644 index 000000000..929215f05 --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tshr/main.cpp @@ -0,0 +1,145 @@ +// Copyright (c) 2026 Huawei Technologies Co., Ltd. +// This program is free software, you can redistribute it and/or modify it under the terms and conditions of +// CANN Open Software License Agreement Version 2.0 (the "License"). +// Please refer to the License for details. You may not use this file except in compliance with the License. +// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +// See LICENSE in the root of the software repository for the full text of the License. + +// Host driver for TileLang tshr ST — case-table driven. +// Each case launches a different kernel variant, reads/writes from per-case subdirectory. +// Numerical comparison is done externally by compare.py. 
+ +#include "acl/acl.h" +#include "test_common.h" +#include +#include +#include +#include +#include +#include + +using namespace PtoTestCommon; + +// Kernel launch wrappers (defined in launch.cpp) +void LaunchTSHR_i32_16x64(int32_t *a, int32_t *b, int32_t *c, void *stream); +void LaunchTSHR_i32_32x32(int32_t *a, int32_t *b, int32_t *c, void *stream); + +using LaunchFn = void (*)(int32_t *, int32_t *, int32_t *, void *); + +struct TestCase { + const char *name; + LaunchFn launch; + size_t rows; // allocated tile rows + size_t cols; // allocated tile cols + size_t validRows; // effective computation rows (<= rows) + size_t validCols; // effective computation cols (<= cols) + size_t elemSize; // bytes per element +}; + +static const TestCase kCases[] = { +{"i32_16x64", LaunchTSHR_i32_16x64, 16, 64, 16, 64, sizeof(int32_t)}, +{"i32_32x32", LaunchTSHR_i32_32x32, 32, 32, 32, 32, sizeof(int32_t)}, +}; +static constexpr size_t kNumCases = sizeof(kCases) / sizeof(kCases[0]); + +static int RunCase(const TestCase &tc, int deviceId, aclrtStream stream) { + int rc = 0; + const size_t elemCount = tc.rows * tc.cols; + const size_t fileSize = elemCount * tc.elemSize; + + std::printf("[INFO] === case: %s (shape=%zux%zu, valid=%zux%zu) ===\n", + tc.name, tc.rows, tc.cols, tc.validRows, tc.validCols); + + // Per-case data directory + std::string caseDir = std::string("./") + tc.name; + size_t src0FileSize = fileSize; + size_t src1FileSize = fileSize; + + int32_t *src0Host = nullptr, *src1Host = nullptr, *dstHost = nullptr; + int32_t *src0Device = nullptr, *src1Device = nullptr, *dstDevice = nullptr; + + aclrtMallocHost((void **)(&src0Host), fileSize); + aclrtMallocHost((void **)(&src1Host), fileSize); + aclrtMallocHost((void **)(&dstHost), fileSize); + + aclrtMalloc((void **)&src0Device, fileSize, ACL_MEM_MALLOC_HUGE_FIRST); + aclrtMalloc((void **)&src1Device, fileSize, ACL_MEM_MALLOC_HUGE_FIRST); + aclrtMalloc((void **)&dstDevice, fileSize, ACL_MEM_MALLOC_HUGE_FIRST); + + if 
(!ReadFile((caseDir + "/input1.bin").c_str(), src0FileSize, src0Host, fileSize)) { + std::fprintf(stderr, "[ERROR] failed to read %s/input1.bin\n", caseDir.c_str()); + rc = 1; + } + if (rc == 0 && !ReadFile((caseDir + "/input2.bin").c_str(), src1FileSize, src1Host, fileSize)) { + std::fprintf(stderr, "[ERROR] failed to read %s/input2.bin\n", caseDir.c_str()); + rc = 1; + } + + if (rc == 0) { + aclrtMemcpy(src0Device, fileSize, src0Host, fileSize, ACL_MEMCPY_HOST_TO_DEVICE); + aclrtMemcpy(src1Device, fileSize, src1Host, fileSize, ACL_MEMCPY_HOST_TO_DEVICE); + + tc.launch(src0Device, src1Device, dstDevice, stream); + + aclrtSynchronizeStream(stream); + aclrtMemcpy(dstHost, fileSize, dstDevice, fileSize, ACL_MEMCPY_DEVICE_TO_HOST); + } + + if (rc == 0 && !WriteFile((caseDir + "/output.bin").c_str(), dstHost, fileSize)) { + std::fprintf(stderr, "[ERROR] failed to write %s/output.bin\n", caseDir.c_str()); + rc = 1; + } + + if (src0Device != nullptr) + aclrtFree(src0Device); + if (src1Device != nullptr) + aclrtFree(src1Device); + if (dstDevice != nullptr) + aclrtFree(dstDevice); + if (src0Host != nullptr) + aclrtFreeHost(src0Host); + if (src1Host != nullptr) + aclrtFreeHost(src1Host); + if (dstHost != nullptr) + aclrtFreeHost(dstHost); + + if (rc == 0) + std::printf("[INFO] case %s done\n", tc.name); + return rc; +} + +int main(int argc, char *argv[]) { + // Optional case filter: ./tshr [case_name] + const char *caseFilter = (argc > 1) ? 
argv[1] : nullptr; + + int rc = 0; + int deviceId = 0; + aclrtStream stream = nullptr; + + aclInit(nullptr); + if (const char *envDevice = std::getenv("ACL_DEVICE_ID")) { + deviceId = std::atoi(envDevice); + } + aclrtSetDevice(deviceId); + aclrtCreateStream(&stream); + + for (size_t i = 0; i < kNumCases; ++i) { + if (caseFilter != nullptr && std::strcmp(kCases[i].name, caseFilter) != 0) { + continue; + } + int ret = RunCase(kCases[i], deviceId, stream); + if (ret != 0) { + std::fprintf(stderr, "[ERROR] case %s failed\n", kCases[i].name); + rc = 1; + break; + } + } + + if (stream != nullptr) + aclrtDestroyStream(stream); + aclrtResetDevice(deviceId); + aclFinalize(); + + return rc; +} diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tshr/tshr.pto b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tshr/tshr.pto new file mode 100644 index 000000000..fb0d2207a --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tshr/tshr.pto @@ -0,0 +1,124 @@ +// Copyright (c) 2026 Huawei Technologies Co., Ltd. +// This program is free software, you can redistribute it and/or modify it under the terms and conditions of +// CANN Open Software License Agreement Version 2.0 (the "License"). +// Please refer to the License for details. You may not use this file except in compliance with the License. +// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +// See LICENSE in the root of the software repository for the full text of the License. + +// TileLang ST kernels for pto.tshr: tload(a) + tload(b) + tshr(a,b)->c + tstore(c). +// Multiple cases with different shapes in a single module. +// Compiled by ptoas --enable-insert-sync --enable-tile-op-expand --pto-backend=vpto +// to produce a fatobj object. 
+ +module attributes {pto.target_arch = "a5", pto.kernel_kind = #pto.kernel_kind} { + // Case 0: i32 16x64 (1024 elements) + func.func @TSHR_i32_16x64(%a_ptr: !pto.ptr, %b_ptr: !pto.ptr, %c_ptr: !pto.ptr) attributes {pto.kernel} { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c16 = arith.constant 16 : index + %c64 = arith.constant 64 : index + %c1024 = arith.constant 1024 : index + + %a_view = pto.make_tensor_view %a_ptr, + shape = [%c1, %c1, %c1, %c16, %c64], + strides = [%c1024, %c1024, %c1024, %c64, %c1] + : !pto.tensor_view<1x1x1x16x64xi32> + %b_view = pto.make_tensor_view %b_ptr, + shape = [%c1, %c1, %c1, %c16, %c64], + strides = [%c1024, %c1024, %c1024, %c64, %c1] + : !pto.tensor_view<1x1x1x16x64xi32> + %c_view = pto.make_tensor_view %c_ptr, + shape = [%c1, %c1, %c1, %c16, %c64], + strides = [%c1024, %c1024, %c1024, %c64, %c1] + : !pto.tensor_view<1x1x1x16x64xi32> + + %a_part = pto.partition_view %a_view, + offsets = [%c0, %c0, %c0, %c0, %c0], + sizes = [%c1, %c1, %c1, %c16, %c64] + : !pto.tensor_view<1x1x1x16x64xi32> -> !pto.partition_tensor_view<1x1x1x16x64xi32> + %b_part = pto.partition_view %b_view, + offsets = [%c0, %c0, %c0, %c0, %c0], + sizes = [%c1, %c1, %c1, %c16, %c64] + : !pto.tensor_view<1x1x1x16x64xi32> -> !pto.partition_tensor_view<1x1x1x16x64xi32> + %c_part = pto.partition_view %c_view, + offsets = [%c0, %c0, %c0, %c0, %c0], + sizes = [%c1, %c1, %c1, %c16, %c64] + : !pto.tensor_view<1x1x1x16x64xi32> -> !pto.partition_tensor_view<1x1x1x16x64xi32> + + %a = pto.alloc_tile + : !pto.tile_buf + %b = pto.alloc_tile + : !pto.tile_buf + %c = pto.alloc_tile + : !pto.tile_buf + + pto.tload ins(%a_part : !pto.partition_tensor_view<1x1x1x16x64xi32>) + outs(%a : !pto.tile_buf) + pto.tload ins(%b_part : !pto.partition_tensor_view<1x1x1x16x64xi32>) + outs(%b : !pto.tile_buf) + + pto.tshr ins(%a, %b : !pto.tile_buf, + !pto.tile_buf) + outs(%c : !pto.tile_buf) + + pto.tstore ins(%c : !pto.tile_buf) + outs(%c_part : 
!pto.partition_tensor_view<1x1x1x16x64xi32>) + return + } + + // Case 1: i32 32x32 (1024 elements) + + func.func @TSHR_i32_32x32(%a_ptr: !pto.ptr, %b_ptr: !pto.ptr, %c_ptr: !pto.ptr) attributes {pto.kernel} { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c32 = arith.constant 32 : index + %c1024 = arith.constant 1024 : index + + %a_view = pto.make_tensor_view %a_ptr, + shape = [%c1, %c1, %c1, %c32, %c32], + strides = [%c1024, %c1024, %c1024, %c32, %c1] + : !pto.tensor_view<1x1x1x32x32xi32> + %b_view = pto.make_tensor_view %b_ptr, + shape = [%c1, %c1, %c1, %c32, %c32], + strides = [%c1024, %c1024, %c1024, %c32, %c1] + : !pto.tensor_view<1x1x1x32x32xi32> + %c_view = pto.make_tensor_view %c_ptr, + shape = [%c1, %c1, %c1, %c32, %c32], + strides = [%c1024, %c1024, %c1024, %c32, %c1] + : !pto.tensor_view<1x1x1x32x32xi32> + + %a_part = pto.partition_view %a_view, + offsets = [%c0, %c0, %c0, %c0, %c0], + sizes = [%c1, %c1, %c1, %c32, %c32] + : !pto.tensor_view<1x1x1x32x32xi32> -> !pto.partition_tensor_view<1x1x1x32x32xi32> + %b_part = pto.partition_view %b_view, + offsets = [%c0, %c0, %c0, %c0, %c0], + sizes = [%c1, %c1, %c1, %c32, %c32] + : !pto.tensor_view<1x1x1x32x32xi32> -> !pto.partition_tensor_view<1x1x1x32x32xi32> + %c_part = pto.partition_view %c_view, + offsets = [%c0, %c0, %c0, %c0, %c0], + sizes = [%c1, %c1, %c1, %c32, %c32] + : !pto.tensor_view<1x1x1x32x32xi32> -> !pto.partition_tensor_view<1x1x1x32x32xi32> + + %a = pto.alloc_tile + : !pto.tile_buf + %b = pto.alloc_tile + : !pto.tile_buf + %c = pto.alloc_tile + : !pto.tile_buf + + pto.tload ins(%a_part : !pto.partition_tensor_view<1x1x1x32x32xi32>) + outs(%a : !pto.tile_buf) + pto.tload ins(%b_part : !pto.partition_tensor_view<1x1x1x32x32xi32>) + outs(%b : !pto.tile_buf) + + pto.tshr ins(%a, %b : !pto.tile_buf, + !pto.tile_buf) + outs(%c : !pto.tile_buf) + + pto.tstore ins(%c : !pto.tile_buf) + outs(%c_part : !pto.partition_tensor_view<1x1x1x32x32xi32>) + return + } +} diff --git 
a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tshrs/CMakeLists.txt b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tshrs/CMakeLists.txt new file mode 100644 index 000000000..c8e37c793 --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tshrs/CMakeLists.txt @@ -0,0 +1,9 @@ +# Copyright (c) 2026 Huawei Technologies Co., Ltd. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. + +pto_tilelang_vec_st(tshrs) diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tshrs/cases.py b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tshrs/cases.py new file mode 100644 index 000000000..30222ef97 --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tshrs/cases.py @@ -0,0 +1,49 @@ +# Copyright (c) 2026 Huawei Technologies Co., Ltd. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. 
+ +# coding=utf-8 + +import numpy as np + +CASES = [ + { + "name": "i32_32x64", + "dtype": np.int32, + "shape": (32, 64), + "valid_shape": (32, 64), + "eps": 0, + }, + { + "name": "i16_63x64", + "dtype": np.int16, + "shape": (63, 64), + "valid_shape": (63, 64), + "eps": 0, + }, + { + "name": "i32_31x128", + "dtype": np.int32, + "shape": (31, 128), + "valid_shape": (31, 128), + "eps": 0, + }, + { + "name": "i16_15x192", + "dtype": np.int16, + "shape": (15, 192), + "valid_shape": (15, 192), + "eps": 0, + }, +] + +_SMOKE_CASE_NAMES = ['i32_32x64', 'i16_15x192'] +_SMOKE_CASE_NAME_SET = set(_SMOKE_CASE_NAMES) +_missing = [name for name in _SMOKE_CASE_NAMES if name not in {case["name"] for case in CASES}] +if _missing: + raise RuntimeError("unknown smoke case(s): " + ", ".join(_missing)) +CASES = [case for case in CASES if case["name"] in _SMOKE_CASE_NAME_SET] diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tshrs/compare.py b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tshrs/compare.py new file mode 100644 index 000000000..50186777e --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tshrs/compare.py @@ -0,0 +1,46 @@ +# Copyright (c) 2026 Huawei Technologies Co., Ltd. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. 
+ +# coding=utf-8 + +import os +import sys +import numpy as np + +from cases import CASES +from st_common import result_cmp, style_fail, style_pass, validate_cases + +def main(): + validate_cases(CASES) + case_filter = sys.argv[1] if len(sys.argv) > 1 else None + + all_passed = True + for case in CASES: + if case_filter is not None and case["name"] != case_filter: + continue + + case_dir = case["name"] + shape = case["shape"] + vr, vc = case["valid_shape"] + + golden = np.fromfile(os.path.join(case_dir, "golden.bin"), dtype=case["dtype"]).reshape(shape) + output = np.fromfile(os.path.join(case_dir, "output.bin"), dtype=case["dtype"]).reshape(shape) + + ok = result_cmp(golden[:vr, :vc], output[:vr, :vc], case["eps"]) + if ok: + print(style_pass(f"[INFO] {case['name']}: compare passed")) + else: + print(style_fail(f"[ERROR] {case['name']}: compare failed")) + all_passed = False + + if not all_passed: + sys.exit(2) + print(style_pass("[INFO] all cases passed")) + +if __name__ == "__main__": + main() diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tshrs/gen_data.py b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tshrs/gen_data.py new file mode 100644 index 000000000..6f269f96a --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tshrs/gen_data.py @@ -0,0 +1,34 @@ +# Copyright (c) 2026 Huawei Technologies Co., Ltd. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. 
+ +# coding=utf-8 + +import numpy as np +from cases import CASES +from st_common import validate_cases, setup_case_rng, save_case_data + +# Scalar value for right shift (must match launch.cpp) +SCALAR = 2 + +validate_cases(CASES) + +for case in CASES: + setup_case_rng(case) + + dtype = case["dtype"] + shape = case["shape"] + valid_shape = case["valid_shape"] + + input1 = np.random.randint(1, 10, size=shape).astype(dtype) + + golden = np.zeros(shape, dtype=dtype) + vr, vc = valid_shape + golden[:vr, :vc] = (input1[:vr, :vc] >> SCALAR).astype(dtype, copy=False) + + save_case_data(case["name"], {"input1": input1, "golden": golden}) + print(f"[INFO] gen_data: {case['name']} shape={shape} valid_shape={valid_shape} dtype={dtype.__name__} scalar={SCALAR}") diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tshrs/launch.cpp b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tshrs/launch.cpp new file mode 100644 index 000000000..50d619419 --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tshrs/launch.cpp @@ -0,0 +1,30 @@ +// Copyright (c) 2026 Huawei Technologies Co., Ltd. +// This program is free software, you can redistribute it and/or modify it under the terms and conditions of +// CANN Open Software License Agreement Version 2.0 (the "License"). +// Please refer to the License for details. You may not use this file except in compliance with the License. +// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +// See LICENSE in the root of the software repository for the full text of the License. 
+ +#include + +#ifndef AICORE +#define AICORE [aicore] +#endif + +// Scalar value for right shift (must match gen_data.py SCALAR) +static constexpr int16_t TSHRS_SCALAR = 2; + +// Case 0: i32 32x64 + +extern "C" __global__ AICORE void TSHRS_i32_32x64(__gm__ int32_t *src, __gm__ int32_t *dst, int16_t scalar); +extern "C" __global__ AICORE void TSHRS_i16_15x192(__gm__ int16_t *src, __gm__ int16_t *dst, int16_t scalar); + +void LaunchTSHRS_i16_15x192(int16_t *src, int16_t *dst, void *stream) { + TSHRS_i16_15x192<<<1, nullptr, stream>>>((__gm__ int16_t *)src, (__gm__ int16_t *)dst, TSHRS_SCALAR); +} + + +void LaunchTSHRS_i32_32x64(int32_t *src, int32_t *dst, void *stream) { + TSHRS_i32_32x64<<<1, nullptr, stream>>>((__gm__ int32_t *)src, (__gm__ int32_t *)dst, TSHRS_SCALAR); +} diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tshrs/main.cpp b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tshrs/main.cpp new file mode 100644 index 000000000..23e738f04 --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tshrs/main.cpp @@ -0,0 +1,132 @@ +// Copyright (c) 2026 Huawei Technologies Co., Ltd. +// This program is free software, you can redistribute it and/or modify it under the terms and conditions of +// CANN Open Software License Agreement Version 2.0 (the "License"). +// Please refer to the License for details. You may not use this file except in compliance with the License. +// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +// See LICENSE in the root of the software repository for the full text of the License. + +// Host driver for TileLang tshrs ST — case-table driven. +// tshrs: dst = src >> scalar (single input + scalar, right shift). +// Numerical comparison is done externally by compare.py. 
+ +#include "acl/acl.h" +#include "test_common.h" +#include +#include +#include +#include +#include +#include + +using namespace PtoTestCommon; + +// Kernel launch wrappers (defined in launch.cpp) +void LaunchTSHRS_i32_32x64(int32_t *src, int32_t *dst, void *stream); +void LaunchTSHRS_i16_63x64(int16_t *src, int16_t *dst, void *stream); +void LaunchTSHRS_i16_15x192(int16_t *src, int16_t *dst, void *stream); + +struct TestCase { + const char *name; + void (*launch)(void *, void *, void *); // src, dst, stream + size_t rows; // allocated tile rows + size_t cols; // allocated tile cols + size_t validRows; // effective computation rows (<= rows) + size_t validCols; // effective computation cols (<= cols) + size_t elemSize; // bytes per element +}; + +static const TestCase kCases[] = { +{"i32_32x64", (void (*)(void*,void*,void*))LaunchTSHRS_i32_32x64, 32, 64, 32, 64, sizeof(int32_t)}, +{"i16_15x192", (void (*)(void*,void*,void*))LaunchTSHRS_i16_15x192, 15, 192, 15, 192, sizeof(int16_t)}, +}; +static constexpr size_t kNumCases = sizeof(kCases) / sizeof(kCases[0]); + +static int RunCase(const TestCase &tc, int deviceId, aclrtStream stream) { + int rc = 0; + const size_t elemCount = tc.rows * tc.cols; + const size_t fileSize = elemCount * tc.elemSize; + + std::printf("[INFO] === case: %s (shape=%zux%zu, valid=%zux%zu) ===\n", + tc.name, tc.rows, tc.cols, tc.validRows, tc.validCols); + + // Per-case data directory + std::string caseDir = std::string("./") + tc.name; + size_t srcFileSize = fileSize; + + void *srcHost = nullptr, *dstHost = nullptr; + void *srcDevice = nullptr, *dstDevice = nullptr; + + aclrtMallocHost(&srcHost, fileSize); + aclrtMallocHost(&dstHost, fileSize); + + aclrtMalloc(&srcDevice, fileSize, ACL_MEM_MALLOC_HUGE_FIRST); + aclrtMalloc(&dstDevice, fileSize, ACL_MEM_MALLOC_HUGE_FIRST); + + if (!ReadFile((caseDir + "/input1.bin").c_str(), srcFileSize, srcHost, fileSize)) { + std::fprintf(stderr, "[ERROR] failed to read %s/input1.bin\n", caseDir.c_str()); + 
rc = 1; + } + + if (rc == 0) { + aclrtMemcpy(srcDevice, fileSize, srcHost, fileSize, ACL_MEMCPY_HOST_TO_DEVICE); + + tc.launch(srcDevice, dstDevice, stream); + + aclrtSynchronizeStream(stream); + aclrtMemcpy(dstHost, fileSize, dstDevice, fileSize, ACL_MEMCPY_DEVICE_TO_HOST); + } + + if (rc == 0 && !WriteFile((caseDir + "/output.bin").c_str(), dstHost, fileSize)) { + std::fprintf(stderr, "[ERROR] failed to write %s/output.bin\n", caseDir.c_str()); + rc = 1; + } + + if (srcDevice != nullptr) + aclrtFree(srcDevice); + if (dstDevice != nullptr) + aclrtFree(dstDevice); + if (srcHost != nullptr) + aclrtFreeHost(srcHost); + if (dstHost != nullptr) + aclrtFreeHost(dstHost); + + if (rc == 0) + std::printf("[INFO] case %s done\n", tc.name); + return rc; +} + +int main(int argc, char *argv[]) { + // Optional case filter: ./tshrs [case_name] + const char *caseFilter = (argc > 1) ? argv[1] : nullptr; + + int rc = 0; + int deviceId = 0; + aclrtStream stream = nullptr; + + aclInit(nullptr); + if (const char *envDevice = std::getenv("ACL_DEVICE_ID")) { + deviceId = std::atoi(envDevice); + } + aclrtSetDevice(deviceId); + aclrtCreateStream(&stream); + + for (size_t i = 0; i < kNumCases; ++i) { + if (caseFilter != nullptr && std::strcmp(kCases[i].name, caseFilter) != 0) { + continue; + } + int ret = RunCase(kCases[i], deviceId, stream); + if (ret != 0) { + std::fprintf(stderr, "[ERROR] case %s failed\n", kCases[i].name); + rc = 1; + break; + } + } + + if (stream != nullptr) + aclrtDestroyStream(stream); + aclrtResetDevice(deviceId); + aclFinalize(); + + return rc; +} diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tshrs/tshrs.pto b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tshrs/tshrs.pto new file mode 100644 index 000000000..015c7b18d --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tshrs/tshrs.pto @@ -0,0 +1,96 @@ +// Copyright (c) 2026 Huawei Technologies Co., Ltd. 
+// This program is free software, you can redistribute it and/or modify it under the terms and conditions of +// CANN Open Software License Agreement Version 2.0 (the "License"). +// Please refer to the License for details. You may not use this file except in compliance with the License. +// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +// See LICENSE in the root of the software repository for the full text of the License. + +// TileLang ST kernels for pto.tshrs: tload(src) + tshrs(src, scalar)->dst + tstore(dst). +// Multiple cases with different shapes/dtypes in a single module. +// Compiled by ptoas --enable-insert-sync --enable-tile-op-expand --pto-backend=vpto +// to produce a fatobj object. + +module attributes {pto.target_arch = "a5", pto.kernel_kind = #pto.kernel_kind} { + + // Case 0: i32 32x64 (2048 elements) + func.func @TSHRS_i32_32x64(%src_ptr: !pto.ptr, %dst_ptr: !pto.ptr, %scalar: i16) attributes {pto.kernel} { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c32 = arith.constant 32 : index + %c64 = arith.constant 64 : index + %c2048 = arith.constant 2048 : index + + %src_view = pto.make_tensor_view %src_ptr, + shape = [%c1, %c1, %c1, %c32, %c64], + strides = [%c2048, %c2048, %c2048, %c64, %c1] + : !pto.tensor_view<1x1x1x32x64xi32> + %dst_view = pto.make_tensor_view %dst_ptr, + shape = [%c1, %c1, %c1, %c32, %c64], + strides = [%c2048, %c2048, %c2048, %c64, %c1] + : !pto.tensor_view<1x1x1x32x64xi32> + + %src_part = pto.partition_view %src_view, + offsets = [%c0, %c0, %c0, %c0, %c0], + sizes = [%c1, %c1, %c1, %c32, %c64] + : !pto.tensor_view<1x1x1x32x64xi32> -> !pto.partition_tensor_view<1x1x1x32x64xi32> + %dst_part = pto.partition_view %dst_view, + offsets = [%c0, %c0, %c0, %c0, %c0], + sizes = [%c1, %c1, %c1, %c32, %c64] + : !pto.tensor_view<1x1x1x32x64xi32> -> 
!pto.partition_tensor_view<1x1x1x32x64xi32> + + %src = pto.alloc_tile + : !pto.tile_buf + %dst = pto.alloc_tile + : !pto.tile_buf + + pto.tload ins(%src_part : !pto.partition_tensor_view<1x1x1x32x64xi32>) + outs(%src : !pto.tile_buf) + pto.tshrs ins(%src, %scalar : !pto.tile_buf, i16) + outs(%dst : !pto.tile_buf) + pto.tstore ins(%dst : !pto.tile_buf) + outs(%dst_part : !pto.partition_tensor_view<1x1x1x32x64xi32>) + return + } + + // Case 1: i16 15x192 (2880 elements) + + func.func @TSHRS_i16_15x192(%src_ptr: !pto.ptr, %dst_ptr: !pto.ptr, %scalar: i16) attributes {pto.kernel} { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c15 = arith.constant 15 : index + %c192 = arith.constant 192 : index + %c2880 = arith.constant 2880 : index + + %src_view = pto.make_tensor_view %src_ptr, + shape = [%c1, %c1, %c1, %c15, %c192], + strides = [%c2880, %c2880, %c2880, %c192, %c1] + : !pto.tensor_view<1x1x1x15x192xi16> + %dst_view = pto.make_tensor_view %dst_ptr, + shape = [%c1, %c1, %c1, %c15, %c192], + strides = [%c2880, %c2880, %c2880, %c192, %c1] + : !pto.tensor_view<1x1x1x15x192xi16> + + %src_part = pto.partition_view %src_view, + offsets = [%c0, %c0, %c0, %c0, %c0], + sizes = [%c1, %c1, %c1, %c15, %c192] + : !pto.tensor_view<1x1x1x15x192xi16> -> !pto.partition_tensor_view<1x1x1x15x192xi16> + %dst_part = pto.partition_view %dst_view, + offsets = [%c0, %c0, %c0, %c0, %c0], + sizes = [%c1, %c1, %c1, %c15, %c192] + : !pto.tensor_view<1x1x1x15x192xi16> -> !pto.partition_tensor_view<1x1x1x15x192xi16> + + %src = pto.alloc_tile + : !pto.tile_buf + %dst = pto.alloc_tile + : !pto.tile_buf + + pto.tload ins(%src_part : !pto.partition_tensor_view<1x1x1x15x192xi16>) + outs(%src : !pto.tile_buf) + pto.tshrs ins(%src, %scalar : !pto.tile_buf, i16) + outs(%dst : !pto.tile_buf) + pto.tstore ins(%dst : !pto.tile_buf) + outs(%dst_part : !pto.partition_tensor_view<1x1x1x15x192xi16>) + return + } +} diff --git 
a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tsort32/CMakeLists.txt b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tsort32/CMakeLists.txt new file mode 100644 index 000000000..ae38393ac --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tsort32/CMakeLists.txt @@ -0,0 +1,9 @@ +# Copyright (c) 2026 Huawei Technologies Co., Ltd. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. + +pto_tilelang_vec_st(tsort32) diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tsort32/cases.py b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tsort32/cases.py new file mode 100644 index 000000000..9029c2ec7 --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tsort32/cases.py @@ -0,0 +1,206 @@ +#!/usr/bin/python3 +# Copyright (c) 2026 Huawei Technologies Co., Ltd. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. + +# coding=utf-8 + +"""Single source of truth for tsort32 ST test cases. 
+ +Each case defines: + - name: case identifier, used as subdirectory name and by main.cpp kCases[]. + - dtype: numpy dtype (e.g. np.float32). + - src_shape: (rows, cols) — allocated source tile dimensions. + - idx_shape: (rows, cols) — allocated index tile dimensions (can be 1 x cols for shared idx). + - tmp_shape: (rows, cols) — allocated tmp tile dimensions (optional, only for unaligned cases). + None for aligned cases (valid_cols % 32 == 0). + For unaligned cases: tmp_rows = 1, tmp_cols = valid_cols rounded up to a 32-byte boundary (for f32: ceil(valid_cols * 4, 32) / 4, e.g. 13 -> 16). + - dst_shape: (rows, cols) — allocated destination tile dimensions. + For f32: dst_cols = src_cols * 4 (buffer allocation, but valid region is src_cols * 2). + For f16: dst_cols = src_cols * 2. + - valid_shape: (valid_rows, valid_cols) — effective computation region. + For aligned cases: valid_cols must be a multiple of 32 (BLOCK_SIZE). + For unaligned cases: valid_cols can be any value (requires tmp). + - idx_vshape: (idx_valid_rows, idx_valid_cols) — idx valid region. + If idx_valid_rows == 1, the same idx is used for all rows. + - dst_vshape: (dst_valid_rows, dst_valid_cols) — dst valid region. + For f32: dst_vcols = src_vcols * 2 (stride coef = 2, interleaved value+index). + - eps: tolerance for numpy.allclose (atol and rtol). + +tsort32 semantics: + - Sorts data in 32-element blocks using vbitsort. + - Output format: interleaved (sorted_value, original_index) pairs with stride coef = 2. + - For each 32-element block, the output contains sorted values and their original indices. + - Each pair occupies 2 element positions: [value0, idx0, value1, idx1, ...] + +gen_data.py and compare.py both import this list to avoid redundant definitions. 
+""" + +import numpy as np + +CASES = [ + # f32 cases - basic shapes (aligned, no tmp needed) + { + "name": "f32_1x32", + "dtype": np.float32, + "src_shape": (1, 32), + "idx_shape": (1, 32), + "tmp_shape": None, # aligned: valid_cols % 32 == 0, no tmp + "dst_shape": (1, 128), # buffer allocation (src_cols * 4) + "valid_shape": (1, 32), + "idx_vshape": (1, 32), + "dst_vshape": (1, 64), # actual valid output: src_cols * stride_coef = 32 * 2 + "eps": 1e-6, + }, + { + "name": "f32_1x64", + "dtype": np.float32, + "src_shape": (1, 64), + "idx_shape": (1, 64), + "tmp_shape": None, # aligned: valid_cols % 32 == 0, no tmp + "dst_shape": (1, 256), # buffer allocation (src_cols * 4) + "valid_shape": (1, 64), + "idx_vshape": (1, 64), + "dst_vshape": (1, 128), # actual valid output: src_cols * stride_coef = 64 * 2 + "eps": 1e-6, + }, + # f32 cases - multiple rows (aligned, no tmp needed) + { + "name": "f32_2x32", + "dtype": np.float32, + "src_shape": (2, 32), + "idx_shape": (2, 32), + "tmp_shape": None, # aligned: valid_cols % 32 == 0, no tmp + "dst_shape": (2, 128), # buffer allocation (src_cols * 4) + "valid_shape": (2, 32), + "idx_vshape": (2, 32), + "dst_vshape": (2, 64), # actual valid output: src_cols * stride_coef = 32 * 2 + "eps": 1e-6, + }, + { + "name": "f32_16x32", + "dtype": np.float32, + "src_shape": (16, 32), + "idx_shape": (16, 32), + "tmp_shape": None, # aligned: valid_cols % 32 == 0, no tmp + "dst_shape": (16, 128), # buffer allocation (src_cols * 4) + "valid_shape": (16, 32), + "idx_vshape": (16, 32), + "dst_vshape": (16, 64), # actual valid output: src_cols * stride_coef = 32 * 2 + "eps": 1e-6, + }, + # f32 cases - shared idx (aligned, no tmp needed) + { + "name": "f32_2x64_shared_idx", + "dtype": np.float32, + "src_shape": (2, 64), + "idx_shape": (1, 64), # shared idx for all rows + "tmp_shape": None, # aligned: valid_cols % 32 == 0, no tmp + "dst_shape": (2, 256), # buffer allocation (src_cols * 4) + "valid_shape": (2, 64), + "idx_vshape": (1, 64), # 
idx_valid_rows = 1 means shared idx + "dst_vshape": (2, 128), # actual valid output: src_cols * stride_coef = 64 * 2 + "eps": 1e-6, + }, + { + "name": "f32_16x64_shared_idx", + "dtype": np.float32, + "src_shape": (16, 64), + "idx_shape": (1, 64), # shared idx for all rows + "tmp_shape": None, # aligned: valid_cols % 32 == 0, no tmp + "dst_shape": (16, 256), # buffer allocation (src_cols * 4) + "valid_shape": (16, 64), + "idx_vshape": (1, 64), # idx_valid_rows = 1 means shared idx + "dst_vshape": (16, 128), # actual valid output: src_cols * stride_coef = 64 * 2 + "eps": 1e-6, + }, + # f32 cases - large shape (multiple vbitsort calls, aligned, no tmp needed) + { + "name": "f32_1x8192", + "dtype": np.float32, + "src_shape": (1, 8192), # 256 * 32, requires loop_num > 1 + "idx_shape": (1, 8192), + "tmp_shape": None, # aligned: valid_cols % 32 == 0, no tmp + "dst_shape": (1, 32768), # buffer allocation (src_cols * 4) + "valid_shape": (1, 8192), + "idx_vshape": (1, 8192), + "dst_vshape": (1, 16384), # actual valid output: src_cols * stride_coef = 8192 * 2 + "eps": 1e-6, + }, + # f32 cases - non-32-aligned (requires tmp buffer for padding) + # Case 4 from C++: VALID_C=13, requires padding to 32-element block + { + "name": "f32_2x13", + "dtype": np.float32, + "src_shape": (2, 16), # ALIGN_C = ceil(13*4, 32) / 4 = 16 + "idx_shape": (2, 16), + "tmp_shape": (1, 16), # unaligned: tmp_cols = ceil(13, 32) = 16 + "dst_shape": (2, 64), # 4 * ALIGN_C = 64 + "valid_shape": (2, 13), # non-32-aligned + "idx_vshape": (2, 13), + "dst_vshape": (2, 26), # VALID_C * stride_coef = 13 * 2 + "eps": 1e-6, + }, + # Case 5 from C++: VALID_C=4164, large non-aligned shape + { + "name": "f32_1x4164", + "dtype": np.float32, + "src_shape": (1, 8192), # ALIGN_C = 8192 (from C++ hardcoded) + "idx_shape": (1, 8192), + "tmp_shape": (1, 4168), # unaligned: tmp_cols = ceil(4164, 32) = 4168 + "dst_shape": (1, 32768), # 4 * ALIGN_C = 32768 + "valid_shape": (1, 4164), # non-32-aligned + "idx_vshape": (1, 
4164), + "dst_vshape": (1, 8328), # VALID_C * stride_coef = 4164 * 2 + "eps": 1e-6, + }, + # Case 6 from C++: VALID_C=2084, multi-row non-aligned shape + { + "name": "f32_2x2084", + "dtype": np.float32, + "src_shape": (2, 3072), # ALIGN_C = 3072 (from C++ hardcoded) + "idx_shape": (2, 3072), + "tmp_shape": (1, 2088), # unaligned: tmp_cols = ceil(2084, 32) = 2088 + "dst_shape": (2, 12288), # 4 * ALIGN_C = 12288 + "valid_shape": (2, 2084), # non-32-aligned + "idx_vshape": (2, 2084), + "dst_vshape": (2, 4168), # VALID_C * stride_coef = 2084 * 2 + "eps": 1e-6, + }, + # f16 cases - basic shapes (aligned, no tmp needed) + { + "name": "f16_1x32", + "dtype": np.float16, + "src_shape": (1, 32), + "idx_shape": (1, 32), + "tmp_shape": None, # aligned: valid_cols % 32 == 0, no tmp + "dst_shape": (1, 128), # buffer allocation (src_cols * 4 for f16) + "valid_shape": (1, 32), + "idx_vshape": (1, 32), + "dst_vshape": (1, 128), # actual valid output: src_cols * stride_coef = 32 * 4 + "eps": 1e-3, + }, + { + "name": "f16_4x64", + "dtype": np.float16, + "src_shape": (4, 64), + "idx_shape": (4, 64), + "tmp_shape": None, # aligned: valid_cols % 32 == 0, no tmp + "dst_shape": (4, 256), # buffer allocation (src_cols * 4 for f16) + "valid_shape": (4, 64), + "idx_vshape": (4, 64), + "dst_vshape": (4, 256), # actual valid output: src_cols * stride_coef = 64 * 4 + "eps": 1e-3, + }, +] + +_SMOKE_CASE_NAMES = ['f32_2x13', 'f16_1x32'] +_SMOKE_CASE_NAME_SET = set(_SMOKE_CASE_NAMES) +_missing = [name for name in _SMOKE_CASE_NAMES if name not in {case["name"] for case in CASES}] +if _missing: + raise RuntimeError("unknown smoke case(s): " + ", ".join(_missing)) +CASES = [case for case in CASES if case["name"] in _SMOKE_CASE_NAME_SET] diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tsort32/compare.py b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tsort32/compare.py new file mode 100644 index 000000000..f28e65586 --- /dev/null +++ 
b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tsort32/compare.py @@ -0,0 +1,54 @@ +#!/usr/bin/python3 +# Copyright (c) 2026 Huawei Technologies Co., Ltd. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. + +# coding=utf-8 + +import os +import sys +import numpy as np + +# Add parent directory to path for st_common import +sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) +from st_common import result_cmp, style_fail, style_pass + +from cases import CASES + + +def main(): + case_filter = sys.argv[1] if len(sys.argv) > 1 else None + + all_passed = True + for case in CASES: + if case_filter is not None and case["name"] != case_filter: + continue + + case_dir = case["name"] + dtype = case["dtype"] + src_shape = case["src_shape"] + dst_shape = case["dst_shape"] + dst_vr, dst_vc = case["dst_vshape"] + + golden = np.fromfile(os.path.join(case_dir, "golden.bin"), dtype=dtype).reshape(dst_shape) + output = np.fromfile(os.path.join(case_dir, "output.bin"), dtype=dtype).reshape(dst_shape) + + # Compare only the dst valid region + ok = result_cmp(golden[:dst_vr, :dst_vc], output[:dst_vr, :dst_vc], case["eps"]) + if ok: + print(style_pass(f"[INFO] {case['name']}: compare passed")) + else: + print(style_fail(f"[ERROR] {case['name']}: compare failed")) + all_passed = False + + if not all_passed: + sys.exit(2) + print(style_pass("[INFO] all cases passed")) + + +if __name__ == "__main__": + main() diff --git 
a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tsort32/gen_data.py b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tsort32/gen_data.py new file mode 100644 index 000000000..1667ac098 --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tsort32/gen_data.py @@ -0,0 +1,140 @@ +#!/usr/bin/python3 +# Copyright (c) 2026 Huawei Technologies Co., Ltd. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. + +# coding=utf-8 + +import numpy as np +import os +import sys + +# Add parent directory to path for st_common import +sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) +from st_common import setup_case_rng, save_case_data + +from cases import CASES + +BLOCK_SIZE = 32 +FLOAT_DST_STRIDE_COEF = 2 # for f32 +HALF_DST_STRIDE_COEF = 4 # for f16 + + +def _to_tuple(shape): + """Convert shape to tuple if needed.""" + if isinstance(shape, tuple): + return shape + return tuple(shape) + + +def get_stride_coef(dtype): + """Get stride coefficient based on dtype.""" + if dtype == np.float16: + return HALF_DST_STRIDE_COEF + return FLOAT_DST_STRIDE_COEF + + +for case in CASES: + setup_case_rng(case) + + dtype = case["dtype"] + src_shape = _to_tuple(case["src_shape"]) + idx_shape = _to_tuple(case["idx_shape"]) + dst_shape = _to_tuple(case["dst_shape"]) + src_valid = _to_tuple(case["valid_shape"]) + idx_valid = _to_tuple(case["idx_vshape"]) + + src_rows, src_cols = src_shape + src_vr, src_vc = src_valid + idx_vr, idx_vc = 
idx_valid + + # Generate random input data + input_data = np.random.randint(1, 100, size=src_shape).astype(dtype) + + # Generate index data (0, 1, 2, ... for each row) + # If idx_valid_rows == 1, same index is used for all rows + if idx_vr == 1: + idx_data = np.arange(src_cols, dtype=np.int32).reshape(1, src_cols) + else: + idx_data = np.arange(src_cols, dtype=np.int32).reshape(1, src_cols) + idx_data = np.tile(idx_data, (src_rows, 1)) + + # Compute golden: for each 32-element block, sort and output interleaved (value, index) + # Output stride coef depends on dtype: + # - f32 uses stride_coef=2 (value+index pair occupies 2 f32 elements) + # - f16 uses stride_coef=4 (value occupies 1 f16, index stored as ui32 = 4 f16 positions) + stride_coef = get_stride_coef(dtype) + golden = np.zeros(dst_shape, dtype=dtype) + + for row in range(src_vr): + for block_start in range(0, src_vc, BLOCK_SIZE): + block_end = min(block_start + BLOCK_SIZE, src_vc) + block_size = block_end - block_start + + block_data = input_data[row, block_start:block_end].copy() + block_idx = idx_data[0 if idx_vr == 1 else row, block_start:block_end].astype(np.int32) + + # For partial blocks, pad with a sentinel value to make 32 elements + if block_size < BLOCK_SIZE: + # Use the same padding value as in tsort32_template.py + # NOTE(review): 0xFC00 / 0xFF80 / 0xFF800000 are the -inf bit patterns, but np.float16(0xFC00) etc. convert the integer value, not the bits - confirm intent + if dtype == np.float16: + pad_val = np.float16(0xFC00) # evaluates to 64512.0, not the 0xFC00 bit pattern + elif hasattr(np, 'bfloat16') and dtype == np.bfloat16: + pad_val = np.bfloat16(0xFF80) + else: + pad_val = np.float32(0xFF800000) # evaluates to 4286578688.0, not the 0xFF800000 bit pattern + + # Pad block to 32 elements with pad_val (NOTE(review): as written pad_val is a large positive number - confirm sort position) + padded_data = np.full(BLOCK_SIZE, pad_val, dtype=dtype) + padded_data[:block_size] = block_data + + # Pad indices to 32 elements (indices for padding elements don't matter) + padded_idx = np.zeros(BLOCK_SIZE, dtype=np.int32) + padded_idx[:block_size] = block_idx + + # Sort the padded 32-element block in descending 
order + # +inf values will be at the end after sorting + sorted_indices = np.argsort(-padded_data) + sorted_values = padded_data[sorted_indices] + sorted_original_idx = padded_idx[sorted_indices] + + # Output interleaved (value, index) pairs for the full 32-element block + # but only the first block_size elements are valid (padding elements at the end) + dst_offset = block_start * stride_coef + for i in range(BLOCK_SIZE): + golden[row, dst_offset + i * stride_coef] = sorted_values[i] + # Store index as int32 bit pattern + idx_u32 = np.array(sorted_original_idx[i], dtype=np.uint32) + if dtype == np.float16: + idx_bytes = idx_u32.tobytes() + golden[row, dst_offset + i * stride_coef + 1] = np.frombuffer(idx_bytes[:2], dtype=np.float16)[0] + golden[row, dst_offset + i * stride_coef + 2] = np.frombuffer(idx_bytes[2:], dtype=np.float16)[0] + else: + golden[row, dst_offset + i * stride_coef + 1] = idx_u32.view(np.float32) + else: + # Full 32-element block + # Sort by value in descending order (largest to smallest) + sorted_indices = np.argsort(-block_data) + sorted_values = block_data[sorted_indices] + sorted_original_idx = block_idx[sorted_indices] + + # Output interleaved (value, index) pairs with stride_coef + dst_offset = block_start * stride_coef + for i in range(BLOCK_SIZE): + golden[row, dst_offset + i * stride_coef] = sorted_values[i] + # Store index as int32 bit pattern + idx_u32 = np.array(sorted_original_idx[i], dtype=np.uint32) + if dtype == np.float16: + idx_bytes = idx_u32.tobytes() + golden[row, dst_offset + i * stride_coef + 1] = np.frombuffer(idx_bytes[:2], dtype=np.float16)[0] + golden[row, dst_offset + i * stride_coef + 2] = np.frombuffer(idx_bytes[2:], dtype=np.float16)[0] + else: + golden[row, dst_offset + i * stride_coef + 1] = idx_u32.view(np.float32) + + save_case_data(case["name"], {"input": input_data, "idx": idx_data.astype(np.uint32), "golden": golden}) + print(f"[INFO] gen_data: {case['name']} src_shape={src_shape} idx_shape={idx_shape} 
dst_shape={dst_shape} dtype={dtype.__name__}") diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tsort32/launch.cpp b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tsort32/launch.cpp new file mode 100644 index 000000000..21b4362d7 --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tsort32/launch.cpp @@ -0,0 +1,28 @@ +// Copyright (c) 2026 Huawei Technologies Co., Ltd. +// This program is free software, you can redistribute it and/or modify it under the terms and conditions of +// CANN Open Software License Agreement Version 2.0 (the "License"). +// Please refer to the License for details. You may not use this file except in compliance with the License. +// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +// See LICENSE in the root of the software repository for the full text of the License. + +#include + +#ifndef AICORE +#define AICORE [aicore] +#endif + +// Cases: f16 1x32 and f32 2x13 + +extern "C" __global__ AICORE void TSORT32_f16_1x32(__gm__ uint16_t *src, __gm__ uint32_t *idx, __gm__ uint16_t *dst); +extern "C" __global__ AICORE void TSORT32_f32_2x13(__gm__ float *src, __gm__ uint32_t *idx, __gm__ float *dst); + +void LaunchTSORT32_f32_2x13(float *src, uint32_t *idx, float *dst, void *stream) { + TSORT32_f32_2x13<<<1, nullptr, stream>>>((__gm__ float *)src, (__gm__ uint32_t *)idx, (__gm__ float *)dst); +} + + + +void LaunchTSORT32_f16_1x32(uint16_t *src, uint32_t *idx, uint16_t *dst, void *stream) { + TSORT32_f16_1x32<<<1, nullptr, stream>>>((__gm__ uint16_t *)src, (__gm__ uint32_t *)idx, (__gm__ uint16_t *)dst); +} diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tsort32/main.cpp b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tsort32/main.cpp new file mode 100644 index 000000000..c7335058c --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tsort32/main.cpp @@ 
-0,0 +1,150 @@ +// Copyright (c) 2026 Huawei Technologies Co., Ltd. +// This program is free software, you can redistribute it and/or modify it under the terms and conditions of +// CANN Open Software License Agreement Version 2.0 (the "License"). +// Please refer to the License for details. You may not use this file except in compliance with the License. +// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +// See LICENSE in the root of the software repository for the full text of the License. + +// Host driver for TileLang tsort32 ST — case-table driven. +// Each case launches a different kernel variant, reads/writes from per-case subdirectory. +// Numerical comparison is done externally by compare.py. + +#include "acl/acl.h" +#include "test_common.h" +#include +#include +#include +#include +#include +#include + +using namespace PtoTestCommon; + +// Kernel launch wrappers (defined in launch.cpp) +void LaunchTSORT32_f32_1x64(float *src, uint32_t *idx, float *dst, void *stream); +void LaunchTSORT32_f32_16x32(float *src, uint32_t *idx, float *dst, void *stream); +void LaunchTSORT32_f32_16x64_shared_idx(float *src, uint32_t *idx, float *dst, void *stream); +void LaunchTSORT32_f16_1x32(uint16_t *src, uint32_t *idx, uint16_t *dst, void *stream); +void LaunchTSORT32_f32_2x13(float *src, uint32_t *idx, float *dst, void *stream); +void LaunchTSORT32_f32_2x2084(float *src, uint32_t *idx, float *dst, void *stream); + +using LaunchFn = void (*)(void *, uint32_t *, void *, void *); + +struct TestCase { + const char *name; + LaunchFn launch; + size_t srcRows; + size_t srcCols; + size_t idxRows; + size_t idxCols; + size_t dstRows; + size_t dstCols; + size_t elemSize; // bytes per element +}; + +static const TestCase kCases[] = { +{"f32_2x13", reinterpret_cast(LaunchTSORT32_f32_2x13), 2, 16, 2, 16, 2, 64, 
sizeof(float)}, +{"f16_1x32", reinterpret_cast(LaunchTSORT32_f16_1x32), 1, 32, 1, 32, 1, 128, sizeof(uint16_t)}, +}; +static constexpr size_t kNumCases = sizeof(kCases) / sizeof(kCases[0]); + +static int RunCase(const TestCase &tc, aclrtStream stream) { + int rc = 0; + size_t srcFileSize = tc.srcRows * tc.srcCols * tc.elemSize; + size_t idxFileSize = tc.idxRows * tc.idxCols * sizeof(uint32_t); + size_t dstFileSize = tc.dstRows * tc.dstCols * tc.elemSize; + + std::printf("[INFO] === case: %s (src=%zux%zu, idx=%zux%zu, dst=%zux%zu) ===\n", + tc.name, tc.srcRows, tc.srcCols, tc.idxRows, tc.idxCols, tc.dstRows, tc.dstCols); + + std::string caseDir = std::string("./") + tc.name; + + void *srcHost = nullptr, *dstHost = nullptr; + uint32_t *idxHost = nullptr; + void *srcDevice = nullptr, *dstDevice = nullptr; + uint32_t *idxDevice = nullptr; + + aclrtMallocHost((void **)(&srcHost), srcFileSize); + aclrtMallocHost((void **)(&idxHost), idxFileSize); + aclrtMallocHost((void **)(&dstHost), dstFileSize); + + aclrtMalloc((void **)&srcDevice, srcFileSize, ACL_MEM_MALLOC_HUGE_FIRST); + aclrtMalloc((void **)&idxDevice, idxFileSize, ACL_MEM_MALLOC_HUGE_FIRST); + aclrtMalloc((void **)&dstDevice, dstFileSize, ACL_MEM_MALLOC_HUGE_FIRST); + + if (!ReadFile((caseDir + "/input.bin").c_str(), srcFileSize, srcHost, srcFileSize)) { + std::fprintf(stderr, "[ERROR] failed to read %s/input.bin\n", caseDir.c_str()); + rc = 1; + } + if (rc == 0 && !ReadFile((caseDir + "/idx.bin").c_str(), idxFileSize, idxHost, idxFileSize)) { + std::fprintf(stderr, "[ERROR] failed to read %s/idx.bin\n", caseDir.c_str()); + rc = 1; + } + + if (rc == 0) { + aclrtMemcpy(srcDevice, srcFileSize, srcHost, srcFileSize, ACL_MEMCPY_HOST_TO_DEVICE); + aclrtMemcpy(idxDevice, idxFileSize, idxHost, idxFileSize, ACL_MEMCPY_HOST_TO_DEVICE); + + tc.launch(srcDevice, idxDevice, dstDevice, stream); + + aclrtSynchronizeStream(stream); + aclrtMemcpy(dstHost, dstFileSize, dstDevice, dstFileSize, ACL_MEMCPY_DEVICE_TO_HOST); + } + + 
if (rc == 0 && !WriteFile((caseDir + "/output.bin").c_str(), dstHost, dstFileSize)) { + std::fprintf(stderr, "[ERROR] failed to write %s/output.bin\n", caseDir.c_str()); + rc = 1; + } + + if (srcDevice != nullptr) + aclrtFree(srcDevice); + if (idxDevice != nullptr) + aclrtFree(idxDevice); + if (dstDevice != nullptr) + aclrtFree(dstDevice); + if (srcHost != nullptr) + aclrtFreeHost(srcHost); + if (idxHost != nullptr) + aclrtFreeHost(idxHost); + if (dstHost != nullptr) + aclrtFreeHost(dstHost); + + if (rc == 0) + std::printf("[INFO] case %s done\n", tc.name); + return rc; +} + +int main(int argc, char *argv[]) { + const char *caseFilter = (argc > 1) ? argv[1] : nullptr; + + int rc = 0; + int deviceId = 0; + aclrtStream stream = nullptr; + + aclInit(nullptr); + if (const char *envDevice = std::getenv("ACL_DEVICE_ID")) { + deviceId = std::atoi(envDevice); + } + aclrtSetDevice(deviceId); + aclrtCreateStream(&stream); + + for (size_t i = 0; i < kNumCases; ++i) { + if (caseFilter != nullptr && std::strcmp(kCases[i].name, caseFilter) != 0) { + continue; + } + int ret = RunCase(kCases[i], stream); + if (ret != 0) { + std::fprintf(stderr, "[ERROR] case %s failed\n", kCases[i].name); + rc = 1; + break; + } + } + + if (stream != nullptr) + aclrtDestroyStream(stream); + aclrtResetDevice(deviceId); + aclFinalize(); + + return rc; +} diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tsort32/tsort32.pto b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tsort32/tsort32.pto new file mode 100644 index 000000000..5f2571ed7 --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tsort32/tsort32.pto @@ -0,0 +1,127 @@ +// Copyright (c) 2026 Huawei Technologies Co., Ltd. +// This program is free software, you can redistribute it and/or modify it under the terms and conditions of +// CANN Open Software License Agreement Version 2.0 (the "License"). +// Please refer to the License for details. You may not use this file except in compliance with the License. 
+// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +// See LICENSE in the root of the software repository for the full text of the License. + +// TileLang ST kernels for pto.tsort32: sort 32-element blocks with interleaved output. +// Multiple cases with different shapes and shared/broadcast index patterns. +// Compiled by ptoas --enable-insert-sync --enable-tile-op-expand --pto-backend=vpto +// to produce a fatobj object. + +module attributes {pto.target_arch = "a5", pto.kernel_kind = #pto.kernel_kind} { + // Case: f32 2x13 - two rows, 13 valid columns (non-32-aligned, uses a tmp tile) + func.func @TSORT32_f32_2x13(%src_ptr: !pto.ptr, %idx_ptr: !pto.ptr, %dst_ptr: !pto.ptr) attributes {pto.kernel} { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c2 = arith.constant 2 : index + %c13 = arith.constant 13 : index + %c16 = arith.constant 16 : index + %c26 = arith.constant 26 : index + %c32 = arith.constant 32 : index + %c64 = arith.constant 64 : index + %c128 = arith.constant 128 : index + + %src_view = pto.make_tensor_view %src_ptr, + shape = [%c1, %c1, %c1, %c2, %c16], + strides = [%c32, %c32, %c32, %c16, %c1] + : !pto.tensor_view<1x1x1x2x16xf32> + %idx_view = pto.make_tensor_view %idx_ptr, + shape = [%c1, %c1, %c1, %c2, %c16], + strides = [%c32, %c32, %c32, %c16, %c1] + : !pto.tensor_view<1x1x1x2x16xui32> + %dst_view = pto.make_tensor_view %dst_ptr, + shape = [%c1, %c1, %c1, %c2, %c64], + strides = [%c128, %c128, %c128, %c64, %c1] + : !pto.tensor_view<1x1x1x2x64xf32> + + %src_part = pto.partition_view %src_view, + offsets = [%c0, %c0, %c0, %c0, %c0], + sizes = [%c1, %c1, %c1, %c2, %c13] + : !pto.tensor_view<1x1x1x2x16xf32> -> !pto.partition_tensor_view<1x1x1x2x13xf32> + %idx_part = pto.partition_view %idx_view, + offsets = [%c0, %c0, %c0, %c0, %c0], + sizes = [%c1, %c1, %c1, %c2, %c13] + : 
!pto.tensor_view<1x1x1x2x16xui32> -> !pto.partition_tensor_view<1x1x1x2x13xui32> + %dst_part = pto.partition_view %dst_view, + offsets = [%c0, %c0, %c0, %c0, %c0], + sizes = [%c1, %c1, %c1, %c2, %c26] + : !pto.tensor_view<1x1x1x2x64xf32> -> !pto.partition_tensor_view<1x1x1x2x26xf32> + + %src_tile = pto.alloc_tile : !pto.tile_buf + %idx_tile = pto.alloc_tile : !pto.tile_buf + %dst_tile = pto.alloc_tile : !pto.tile_buf + %tmp_tile = pto.alloc_tile : !pto.tile_buf + + pto.tload ins(%src_part : !pto.partition_tensor_view<1x1x1x2x13xf32>) + outs(%src_tile : !pto.tile_buf) + pto.tload ins(%idx_part : !pto.partition_tensor_view<1x1x1x2x13xui32>) + outs(%idx_tile : !pto.tile_buf) + + pto.tsort32 ins(%src_tile, %idx_tile, %tmp_tile : !pto.tile_buf, + !pto.tile_buf, + !pto.tile_buf) + outs(%dst_tile : !pto.tile_buf) + + pto.tstore ins(%dst_tile : !pto.tile_buf) + outs(%dst_part : !pto.partition_tensor_view<1x1x1x2x26xf32>) + return + } + + // Case: f16 1x32 - f16 dtype, single row, one 32-element block + // (32-aligned, so tsort32 is called without a tmp tile) + + func.func @TSORT32_f16_1x32(%src_ptr: !pto.ptr, %idx_ptr: !pto.ptr, %dst_ptr: !pto.ptr) attributes {pto.kernel} { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c32 = arith.constant 32 : index + %c128 = arith.constant 128 : index + + %src_view = pto.make_tensor_view %src_ptr, + shape = [%c1, %c1, %c1, %c1, %c32], + strides = [%c32, %c32, %c32, %c32, %c1] + : !pto.tensor_view<1x1x1x1x32xf16> + %idx_view = pto.make_tensor_view %idx_ptr, + shape = [%c1, %c1, %c1, %c1, %c32], + strides = [%c32, %c32, %c32, %c32, %c1] + : !pto.tensor_view<1x1x1x1x32xui32> + %dst_view = pto.make_tensor_view %dst_ptr, + shape = [%c1, %c1, %c1, %c1, %c128], + strides = [%c128, %c128, %c128, %c128, %c1] + : !pto.tensor_view<1x1x1x1x128xf16> + + %src_part = pto.partition_view %src_view, + offsets = [%c0, %c0, %c0, %c0, %c0], + sizes = [%c1, %c1, %c1, %c1, %c32] + : !pto.tensor_view<1x1x1x1x32xf16> -> 
!pto.partition_tensor_view<1x1x1x1x32xf16> + %idx_part = pto.partition_view %idx_view, + offsets = [%c0, %c0, %c0, %c0, %c0], + sizes = [%c1, %c1, %c1, %c1, %c32] + : !pto.tensor_view<1x1x1x1x32xui32> -> !pto.partition_tensor_view<1x1x1x1x32xui32> + %dst_part = pto.partition_view %dst_view, + offsets = [%c0, %c0, %c0, %c0, %c0], + sizes = [%c1, %c1, %c1, %c1, %c128] + : !pto.tensor_view<1x1x1x1x128xf16> -> !pto.partition_tensor_view<1x1x1x1x128xf16> + + %src_tile = pto.alloc_tile : !pto.tile_buf + %idx_tile = pto.alloc_tile : !pto.tile_buf + %dst_tile = pto.alloc_tile : !pto.tile_buf + + pto.tload ins(%src_part : !pto.partition_tensor_view<1x1x1x1x32xf16>) + outs(%src_tile : !pto.tile_buf) + pto.tload ins(%idx_part : !pto.partition_tensor_view<1x1x1x1x32xui32>) + outs(%idx_tile : !pto.tile_buf) + + pto.tsort32 ins(%src_tile, %idx_tile : !pto.tile_buf, + !pto.tile_buf) + outs(%dst_tile : !pto.tile_buf) + + pto.tstore ins(%dst_tile : !pto.tile_buf) + outs(%dst_part : !pto.partition_tensor_view<1x1x1x1x128xf16>) + return + } + + // Case: f16 4x64 - f16 dtype, 4 rows, two 32-element blocks per row +} diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tsqrt/CMakeLists.txt b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tsqrt/CMakeLists.txt new file mode 100644 index 000000000..83de2cda8 --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tsqrt/CMakeLists.txt @@ -0,0 +1,9 @@ +# Copyright (c) 2026 Huawei Technologies Co., Ltd. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. 
+# See LICENSE in the root of the software repository for the full text of the License. + +pto_tilelang_vec_st(tsqrt) diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tsqrt/cases.py b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tsqrt/cases.py new file mode 100644 index 000000000..32fd053b3 --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tsqrt/cases.py @@ -0,0 +1,82 @@ +#!/usr/bin/python3 +# Copyright (c) 2026 Huawei Technologies Co., Ltd. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. + +# coding=utf-8 + +"""Single source of truth for tsqrt ST test cases. + +Each case defines: + - name: case identifier, used as subdirectory name and by main.cpp kCases[]. + - dtype: numpy dtype (e.g. np.float32). + - shape: (rows, cols) — allocated tile dimensions. + - valid_shape: (valid_rows, valid_cols) — effective computation region. + - eps: tolerance for numpy.allclose (atol and rtol). + +gen_data.py and compare.py both import this list to avoid redundant definitions. 
+""" + +import numpy as np + +CASES = [ + { + "name": "f32_16x64", + "dtype": np.float32, + "shape": (16, 64), + "valid_shape": (16, 64), + "eps": 1e-6, + "high_precision": False, + }, + { + "name": "f32_32x32", + "dtype": np.float32, + "shape": (32, 32), + "valid_shape": (32, 32), + "eps": 1e-6, + "high_precision": False, + }, + { + "name": "f16_16x64", + "dtype": np.float16, + "shape": (16, 64), + "valid_shape": (16, 64), + "eps": 1e-3, + "high_precision": False, + }, + { + "name": "f16_32x32", + "dtype": np.float16, + "shape": (32, 32), + "valid_shape": (32, 32), + "eps": 1e-3, + "high_precision": False, + }, + { + "name": "f32_64x64_hp1", + "dtype": np.float32, + "shape": (64, 64), + "valid_shape": (64, 64), + "eps": 1e-7, + "high_precision": True, + }, + { + "name": "f16_64x64_hp2", + "dtype": np.float16, + "shape": (64, 64), + "valid_shape": (64, 64), + "eps": 1e-7, + "high_precision": True, + }, +] + +_SMOKE_CASE_NAMES = ['f32_16x64', 'f16_16x64'] +_SMOKE_CASE_NAME_SET = set(_SMOKE_CASE_NAMES) +_missing = [name for name in _SMOKE_CASE_NAMES if name not in {case["name"] for case in CASES}] +if _missing: + raise RuntimeError("unknown smoke case(s): " + ", ".join(_missing)) +CASES = [case for case in CASES if case["name"] in _SMOKE_CASE_NAME_SET] diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tsqrt/compare.py b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tsqrt/compare.py new file mode 100644 index 000000000..428604929 --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tsqrt/compare.py @@ -0,0 +1,49 @@ +#!/usr/bin/python3 +# Copyright (c) 2026 Huawei Technologies Co., Ltd. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. 
+# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. + +# coding=utf-8 + +import os +import sys +import numpy as np + +from cases import CASES +from st_common import result_cmp, style_fail, style_pass, validate_cases + + +def main(): + validate_cases(CASES) + case_filter = sys.argv[1] if len(sys.argv) > 1 else None + + all_passed = True + for case in CASES: + if case_filter is not None and case["name"] != case_filter: + continue + + case_dir = case["name"] + shape = case["shape"] + vr, vc = case["valid_shape"] + + golden = np.fromfile(os.path.join(case_dir, "golden.bin"), dtype=case["dtype"]).reshape(shape) + output = np.fromfile(os.path.join(case_dir, "output.bin"), dtype=case["dtype"]).reshape(shape) + + ok = result_cmp(golden[:vr, :vc], output[:vr, :vc], case["eps"]) + if ok: + print(style_pass(f"[INFO] {case['name']}: compare passed")) + else: + print(style_fail(f"[ERROR] {case['name']}: compare failed")) + all_passed = False + + if not all_passed: + sys.exit(2) + print(style_pass("[INFO] all cases passed")) + + +if __name__ == "__main__": + main() diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tsqrt/gen_data.py b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tsqrt/gen_data.py new file mode 100644 index 000000000..aa3c0a036 --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tsqrt/gen_data.py @@ -0,0 +1,36 @@ +#!/usr/bin/python3 +# Copyright (c) 2026 Huawei Technologies Co., Ltd. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. 
# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
# See LICENSE in the root of the software repository for the full text of the License.

# coding=utf-8

# Generates the input tile and the golden sqrt result for every tsqrt case.
# Kept as a flat script: data generation happens when the file is executed.

import numpy as np
from cases import CASES
from st_common import validate_cases, setup_case_rng, save_case_data

validate_cases(CASES)

for case in CASES:
    # Per-case RNG setup (helper from st_common) runs before any draw.
    setup_case_rng(case)

    dtype = case["dtype"]
    shape = case["shape"]
    valid_shape = case["valid_shape"]
    high_precision = case.get("high_precision", False)

    # High-precision cases draw from (0.001, 1.0), the others from (0.1, 100.0);
    # both ranges are strictly positive, so sqrt is always defined.
    low, high = (0.001, 1.0) if high_precision else (0.1, 100.0)
    # Named "src" rather than "input" to avoid shadowing the builtin.
    src = np.random.uniform(low, high, size=shape).astype(dtype)

    # Golden is zero outside the valid region and sqrt(src) inside it.
    golden = np.zeros(shape, dtype=dtype)
    vr, vc = valid_shape
    golden[:vr, :vc] = np.sqrt(src[:vr, :vc]).astype(dtype, copy=False)

    save_case_data(case["name"], {"input": src, "golden": golden})
    print(f"[INFO] gen_data: {case['name']} shape={shape} valid_shape={valid_shape} dtype={dtype.__name__} high_precision={high_precision}")
+// See LICENSE in the root of the software repository for the full text of the License. + +#include + +#ifndef AICORE +#define AICORE [aicore] +#endif + +// Case 0: f32 16x64 + +extern "C" __global__ AICORE void TSQRT_f32_16x64(__gm__ float *a, __gm__ float *b); +extern "C" __global__ AICORE void TSQRT_f16_16x64(__gm__ uint16_t *a, __gm__ uint16_t *b); + +void LaunchTSQRT_f32_16x64(void *a, void *b, void *stream) { + TSQRT_f32_16x64<<<1, nullptr, stream>>>((__gm__ float *)a, (__gm__ float *)b); +} + + + +void LaunchTSQRT_f16_16x64(void *a, void *b, void *stream) { + TSQRT_f16_16x64<<<1, nullptr, stream>>>((__gm__ uint16_t *)a, (__gm__ uint16_t *)b); +} diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tsqrt/main.cpp b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tsqrt/main.cpp new file mode 100644 index 000000000..852ba7682 --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tsqrt/main.cpp @@ -0,0 +1,136 @@ +// Copyright (c) 2026 Huawei Technologies Co., Ltd. +// This program is free software, you can redistribute it and/or modify it under the terms and conditions of +// CANN Open Software License Agreement Version 2.0 (the "License"). +// Please refer to the License for details. You may not use this file except in compliance with the License. +// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +// See LICENSE in the root of the software repository for the full text of the License. + +// Host driver for TileLang tsqrt ST — case-table driven. +// Each case launches a different kernel variant, reads/writes from per-case subdirectory. +// Numerical comparison is done externally by compare.py. 
+ +#include "acl/acl.h" +#include "test_common.h" +#include +#include +#include +#include +#include +#include + +using namespace PtoTestCommon; + +// Kernel launch wrappers (defined in launch.cpp) +void LaunchTSQRT_f32_16x64(void *a, void *b, void *stream); +void LaunchTSQRT_f32_32x32(void *a, void *b, void *stream); +void LaunchTSQRT_f16_16x64(void *a, void *b, void *stream); +void LaunchTSQRT_f16_32x32(void *a, void *b, void *stream); +void LaunchTSQRT_f16_64x64_hp2(void *a, void *b, void *stream); + +using LaunchFn = void (*)(void *, void *, void *); + +struct TestCase { + const char *name; + LaunchFn launch; + size_t rows; // allocated tile rows + size_t cols; // allocated tile cols + size_t validRows; // effective computation rows (<= rows) + size_t validCols; // effective computation cols (<= cols) + size_t elemSize; // bytes per element +}; + +static const TestCase kCases[] = { +{"f32_16x64", LaunchTSQRT_f32_16x64, 16, 64, 16, 64, sizeof(float)}, +{"f16_16x64", LaunchTSQRT_f16_16x64, 16, 64, 16, 64, sizeof(uint16_t)}, +}; +static constexpr size_t kNumCases = sizeof(kCases) / sizeof(kCases[0]); + +static int RunCase(const TestCase &tc, int deviceId, aclrtStream stream) { + int rc = 0; + const size_t elemCount = tc.rows * tc.cols; + const size_t fileSize = elemCount * tc.elemSize; + + std::printf("[INFO] === case: %s (shape=%zux%zu, valid=%zux%zu) ===\n", + tc.name, tc.rows, tc.cols, tc.validRows, tc.validCols); + + // Per-case data directory + std::string caseDir = std::string("./") + tc.name; + size_t srcFileSize = fileSize; + + void *srcHost = nullptr, *dstHost = nullptr; + void *srcDevice = nullptr, *dstDevice = nullptr; + + aclrtMallocHost((void **)(&srcHost), fileSize); + aclrtMallocHost((void **)(&dstHost), fileSize); + + aclrtMalloc((void **)&srcDevice, fileSize, ACL_MEM_MALLOC_HUGE_FIRST); + aclrtMalloc((void **)&dstDevice, fileSize, ACL_MEM_MALLOC_HUGE_FIRST); + + if (!ReadFile((caseDir + "/input.bin").c_str(), srcFileSize, srcHost, fileSize)) { + 
std::fprintf(stderr, "[ERROR] failed to read %s/input.bin\n", caseDir.c_str()); + rc = 1; + } + + if (rc == 0) { + aclrtMemcpy(srcDevice, fileSize, srcHost, fileSize, ACL_MEMCPY_HOST_TO_DEVICE); + + tc.launch(srcDevice, dstDevice, stream); + + aclrtSynchronizeStream(stream); + aclrtMemcpy(dstHost, fileSize, dstDevice, fileSize, ACL_MEMCPY_DEVICE_TO_HOST); + } + + if (rc == 0 && !WriteFile((caseDir + "/output.bin").c_str(), dstHost, fileSize)) { + std::fprintf(stderr, "[ERROR] failed to write %s/output.bin\n", caseDir.c_str()); + rc = 1; + } + + if (srcDevice != nullptr) + aclrtFree(srcDevice); + if (dstDevice != nullptr) + aclrtFree(dstDevice); + if (srcHost != nullptr) + aclrtFreeHost(srcHost); + if (dstHost != nullptr) + aclrtFreeHost(dstHost); + + if (rc == 0) + std::printf("[INFO] case %s done\n", tc.name); + return rc; +} + +int main(int argc, char *argv[]) { + // Optional case filter: ./tsqrt [case_name] + const char *caseFilter = (argc > 1) ? argv[1] : nullptr; + + int rc = 0; + int deviceId = 0; + aclrtStream stream = nullptr; + + aclInit(nullptr); + if (const char *envDevice = std::getenv("ACL_DEVICE_ID")) { + deviceId = std::atoi(envDevice); + } + aclrtSetDevice(deviceId); + aclrtCreateStream(&stream); + + for (size_t i = 0; i < kNumCases; ++i) { + if (caseFilter != nullptr && std::strcmp(kCases[i].name, caseFilter) != 0) { + continue; + } + int ret = RunCase(kCases[i], deviceId, stream); + if (ret != 0) { + std::fprintf(stderr, "[ERROR] case %s failed\n", kCases[i].name); + rc = 1; + break; + } + } + + if (stream != nullptr) + aclrtDestroyStream(stream); + aclrtResetDevice(deviceId); + aclFinalize(); + + return rc; +} diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tsqrt/tsqrt.pto b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tsqrt/tsqrt.pto new file mode 100644 index 000000000..0f7c75b55 --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tsqrt/tsqrt.pto @@ -0,0 +1,101 @@ +// Copyright (c) 2026 Huawei Technologies Co., Ltd. 
+// This program is free software, you can redistribute it and/or modify it under the terms and conditions of +// CANN Open Software License Agreement Version 2.0 (the "License"). +// Please refer to the License for details. You may not use this file except in compliance with the License. +// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +// See LICENSE in the root of the software repository for the full text of the License. + +// TileLang ST kernels for pto.tsqrt: tload(a) + tsqrt(a)->b + tstore(b). +// Multiple cases with different shapes in a single module. +// Compiled by ptoas --enable-insert-sync --enable-tile-op-expand --pto-backend=vpto +// to produce a fatobj object. + +module attributes {pto.target_arch = "a5", pto.kernel_kind = #pto.kernel_kind} { + // Case 0: f32 16x64 (1024 elements) + func.func @TSQRT_f32_16x64(%a_ptr: !pto.ptr, %b_ptr: !pto.ptr) attributes {pto.kernel} { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c16 = arith.constant 16 : index + %c64 = arith.constant 64 : index + %c1024 = arith.constant 1024 : index + + %a_view = pto.make_tensor_view %a_ptr, + shape = [%c1, %c1, %c1, %c16, %c64], + strides = [%c1024, %c1024, %c1024, %c64, %c1] + : !pto.tensor_view<1x1x1x16x64xf32> + %b_view = pto.make_tensor_view %b_ptr, + shape = [%c1, %c1, %c1, %c16, %c64], + strides = [%c1024, %c1024, %c1024, %c64, %c1] + : !pto.tensor_view<1x1x1x16x64xf32> + + %a_part = pto.partition_view %a_view, + offsets = [%c0, %c0, %c0, %c0, %c0], + sizes = [%c1, %c1, %c1, %c16, %c64] + : !pto.tensor_view<1x1x1x16x64xf32> -> !pto.partition_tensor_view<1x1x1x16x64xf32> + %b_part = pto.partition_view %b_view, + offsets = [%c0, %c0, %c0, %c0, %c0], + sizes = [%c1, %c1, %c1, %c16, %c64] + : !pto.tensor_view<1x1x1x16x64xf32> -> !pto.partition_tensor_view<1x1x1x16x64xf32> + + %a = 
pto.alloc_tile + : !pto.tile_buf + %b = pto.alloc_tile + : !pto.tile_buf + + pto.tload ins(%a_part : !pto.partition_tensor_view<1x1x1x16x64xf32>) + outs(%a : !pto.tile_buf) + + pto.tsqrt ins(%a : !pto.tile_buf) + outs(%b : !pto.tile_buf) + + pto.tstore ins(%b : !pto.tile_buf) + outs(%b_part : !pto.partition_tensor_view<1x1x1x16x64xf32>) + return + } + + // Case 1: f16 16x64 (1024 elements) + + func.func @TSQRT_f16_16x64(%a_ptr: !pto.ptr, %b_ptr: !pto.ptr) attributes {pto.kernel} { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c16 = arith.constant 16 : index + %c64 = arith.constant 64 : index + %c1024 = arith.constant 1024 : index + + %a_view = pto.make_tensor_view %a_ptr, + shape = [%c1, %c1, %c1, %c16, %c64], + strides = [%c1024, %c1024, %c1024, %c64, %c1] + : !pto.tensor_view<1x1x1x16x64xf16> + %b_view = pto.make_tensor_view %b_ptr, + shape = [%c1, %c1, %c1, %c16, %c64], + strides = [%c1024, %c1024, %c1024, %c64, %c1] + : !pto.tensor_view<1x1x1x16x64xf16> + + %a_part = pto.partition_view %a_view, + offsets = [%c0, %c0, %c0, %c0, %c0], + sizes = [%c1, %c1, %c1, %c16, %c64] + : !pto.tensor_view<1x1x1x16x64xf16> -> !pto.partition_tensor_view<1x1x1x16x64xf16> + %b_part = pto.partition_view %b_view, + offsets = [%c0, %c0, %c0, %c0, %c0], + sizes = [%c1, %c1, %c1, %c16, %c64] + : !pto.tensor_view<1x1x1x16x64xf16> -> !pto.partition_tensor_view<1x1x1x16x64xf16> + + %a = pto.alloc_tile + : !pto.tile_buf + %b = pto.alloc_tile + : !pto.tile_buf + + pto.tload ins(%a_part : !pto.partition_tensor_view<1x1x1x16x64xf16>) + outs(%a : !pto.tile_buf) + + pto.tsqrt ins(%a : !pto.tile_buf) + outs(%b : !pto.tile_buf) + + pto.tstore ins(%b : !pto.tile_buf) + outs(%b_part : !pto.partition_tensor_view<1x1x1x16x64xf16>) + return + } + + // Note: the f16 32x32 case is not part of the smoke subset. +} diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tsub/CMakeLists.txt b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tsub/CMakeLists.txt new file mode 100644 index
000000000..635f174d7 --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tsub/CMakeLists.txt @@ -0,0 +1,9 @@ +# Copyright (c) 2026 Huawei Technologies Co., Ltd. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. + +pto_tilelang_vec_st(tsub) diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tsub/cases.py b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tsub/cases.py new file mode 100644 index 000000000..51b9c8f6a --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tsub/cases.py @@ -0,0 +1,47 @@ +# Copyright (c) 2026 Huawei Technologies Co., Ltd. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. + +# coding=utf-8 + +"""Single source of truth for tsub ST test cases. + +Each case defines: + - name: case identifier, used as subdirectory name and by main.cpp kCases[]. + - dtype: numpy dtype (e.g. np.float32). + - shape: (rows, cols) — allocated tile dimensions. 
import numpy as np

# Case fields (gen_data.py and compare.py both import this list, so the
# definitions live in one place):
#   name        - case identifier, used as subdirectory name and by main.cpp kCases[].
#   dtype       - numpy dtype (e.g. np.float32).
#   shape       - (rows, cols), allocated tile dimensions.
#   valid_shape - (valid_rows, valid_cols), effective computation region.
#   eps         - tolerance for numpy.allclose (atol and rtol).
CASES = [
    {
        "name": "f32_16x64",
        "dtype": np.float32,
        "shape": (16, 64),
        "valid_shape": (16, 64),
        "eps": 1e-6,
    },
    {
        "name": "f32_32x32",
        "dtype": np.float32,
        "shape": (32, 32),
        "valid_shape": (32, 32),
        "eps": 1e-6,
    },
]

# Smoke subset: only these cases survive the filter below.
_SMOKE_CASE_NAMES = ['f32_16x64', 'f32_32x32']
_SMOKE_CASE_NAME_SET = set(_SMOKE_CASE_NAMES)

# Build the set of known names once instead of once per smoke name.
_KNOWN_CASE_NAMES = {case["name"] for case in CASES}
_missing = [name for name in _SMOKE_CASE_NAMES if name not in _KNOWN_CASE_NAMES]
if _missing:
    # Fail at import time so a typo in the smoke list cannot silently drop a case.
    raise RuntimeError("unknown smoke case(s): " + ", ".join(_missing))
CASES = [case for case in CASES if case["name"] in _SMOKE_CASE_NAME_SET]
# coding=utf-8

import os
import sys
import numpy as np

from cases import CASES
from st_common import result_cmp, style_fail, style_pass, validate_cases


def _load_tile(case_dir, file_name, dtype, shape):
    """Read a raw ``.bin`` tile from *case_dir* and view it with *shape*.

    Raises:
        OSError: the file is missing or unreadable.
        ValueError: the element count in the file does not match *shape*.
    """
    return np.fromfile(os.path.join(case_dir, file_name), dtype=dtype).reshape(shape)


def main():
    """Compare ``output.bin`` against ``golden.bin`` for each selected case.

    An optional first command-line argument restricts the run to the case
    with that name. Only the ``valid_shape`` region of each tile is compared.
    The process exits with status 2 when a compared case mismatches, when a
    case's data cannot be loaded, or when the filter matches no case.
    """
    validate_cases(CASES)
    case_filter = sys.argv[1] if len(sys.argv) > 1 else None

    all_passed = True
    compared = 0
    for case in CASES:
        name = case["name"]
        if case_filter is not None and name != case_filter:
            continue
        compared += 1

        # The case name doubles as the per-case data directory.
        shape = case["shape"]
        vr, vc = case["valid_shape"]

        try:
            golden = _load_tile(name, "golden.bin", case["dtype"], shape)
            output = _load_tile(name, "output.bin", case["dtype"], shape)
        except (OSError, ValueError) as err:
            # Report the broken case and keep going so the other cases are
            # still checked instead of being hidden behind a traceback.
            print(style_fail(f"[ERROR] {name}: cannot load golden/output data: {err}"))
            all_passed = False
            continue

        if result_cmp(golden[:vr, :vc], output[:vr, :vc], case["eps"]):
            print(style_pass(f"[INFO] {name}: compare passed"))
        else:
            print(style_fail(f"[ERROR] {name}: compare failed"))
            all_passed = False

    if case_filter is not None and compared == 0:
        # Nothing was compared, so reporting success would be a false pass.
        print(style_fail(f"[ERROR] no case named '{case_filter}'"))
        sys.exit(2)

    if not all_passed:
        sys.exit(2)
    print(style_pass("[INFO] all cases passed"))


if __name__ == "__main__":
    main()
# coding=utf-8

# Generates both operand tiles and the golden difference for every tsub case.

import numpy as np
from cases import CASES
from st_common import validate_cases, setup_case_rng, save_case_data

validate_cases(CASES)

for case in CASES:
    # Per-case RNG setup (helper from st_common) runs before any draw.
    setup_case_rng(case)

    dtype, shape, valid_shape = case["dtype"], case["shape"], case["valid_shape"]

    # Operands are integers in [1, 10) converted to the case dtype.
    # The minuend is drawn first, then the subtrahend.
    minuend = np.random.randint(1, 10, size=shape).astype(dtype)
    subtrahend = np.random.randint(1, 10, size=shape).astype(dtype)

    # Golden stays zero outside the valid region and holds a - b inside it.
    golden = np.zeros(shape, dtype=dtype)
    rows, cols = valid_shape
    difference = minuend[:rows, :cols] - subtrahend[:rows, :cols]
    golden[:rows, :cols] = difference.astype(dtype, copy=False)

    payload = {"input1": minuend, "input2": subtrahend, "golden": golden}
    save_case_data(case["name"], payload)
    print(f"[INFO] gen_data: {case['name']} shape={shape} valid_shape={valid_shape} dtype={dtype.__name__}")
+ +#include + +#ifndef AICORE +#define AICORE [aicore] +#endif + +// Case 0: f32 16x64 + +extern "C" __global__ AICORE void TSUB_f32_16x64(__gm__ float *a, __gm__ float *b, __gm__ float *c); +extern "C" __global__ AICORE void TSUB_f32_32x32(__gm__ float *a, __gm__ float *b, __gm__ float *c); + +void LaunchTSUB_f32_16x64(float *a, float *b, float *c, void *stream) { + TSUB_f32_16x64<<<1, nullptr, stream>>>((__gm__ float *)a, (__gm__ float *)b, (__gm__ float *)c); +} + + + +void LaunchTSUB_f32_32x32(float *a, float *b, float *c, void *stream) { + TSUB_f32_32x32<<<1, nullptr, stream>>>((__gm__ float *)a, (__gm__ float *)b, (__gm__ float *)c); +} diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tsub/main.cpp b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tsub/main.cpp new file mode 100644 index 000000000..b27b14975 --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tsub/main.cpp @@ -0,0 +1,145 @@ +// Copyright (c) 2026 Huawei Technologies Co., Ltd. +// This program is free software, you can redistribute it and/or modify it under the terms and conditions of +// CANN Open Software License Agreement Version 2.0 (the "License"). +// Please refer to the License for details. You may not use this file except in compliance with the License. +// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +// See LICENSE in the root of the software repository for the full text of the License. + +// Host driver for TileLang tsub ST — case-table driven. +// Each case launches a different kernel variant, reads/writes from per-case subdirectory. +// Numerical comparison is done externally by compare.py. 
+ +#include "acl/acl.h" +#include "test_common.h" +#include +#include +#include +#include +#include +#include + +using namespace PtoTestCommon; + +// Kernel launch wrappers (defined in launch.cpp) +void LaunchTSUB_f32_16x64(float *a, float *b, float *c, void *stream); +void LaunchTSUB_f32_32x32(float *a, float *b, float *c, void *stream); + +using LaunchFn = void (*)(float *, float *, float *, void *); + +struct TestCase { + const char *name; + LaunchFn launch; + size_t rows; // allocated tile rows + size_t cols; // allocated tile cols + size_t validRows; // effective computation rows (<= rows) + size_t validCols; // effective computation cols (<= cols) + size_t elemSize; // bytes per element +}; + +static const TestCase kCases[] = { +{"f32_16x64", LaunchTSUB_f32_16x64, 16, 64, 16, 64, sizeof(float)}, +{"f32_32x32", LaunchTSUB_f32_32x32, 32, 32, 32, 32, sizeof(float)}, +}; +static constexpr size_t kNumCases = sizeof(kCases) / sizeof(kCases[0]); + +static int RunCase(const TestCase &tc, int deviceId, aclrtStream stream) { + int rc = 0; + const size_t elemCount = tc.rows * tc.cols; + const size_t fileSize = elemCount * tc.elemSize; + + std::printf("[INFO] === case: %s (shape=%zux%zu, valid=%zux%zu) ===\n", + tc.name, tc.rows, tc.cols, tc.validRows, tc.validCols); + + // Per-case data directory + std::string caseDir = std::string("./") + tc.name; + size_t src0FileSize = fileSize; + size_t src1FileSize = fileSize; + + float *src0Host = nullptr, *src1Host = nullptr, *dstHost = nullptr; + float *src0Device = nullptr, *src1Device = nullptr, *dstDevice = nullptr; + + aclrtMallocHost((void **)(&src0Host), fileSize); + aclrtMallocHost((void **)(&src1Host), fileSize); + aclrtMallocHost((void **)(&dstHost), fileSize); + + aclrtMalloc((void **)&src0Device, fileSize, ACL_MEM_MALLOC_HUGE_FIRST); + aclrtMalloc((void **)&src1Device, fileSize, ACL_MEM_MALLOC_HUGE_FIRST); + aclrtMalloc((void **)&dstDevice, fileSize, ACL_MEM_MALLOC_HUGE_FIRST); + + if (!ReadFile((caseDir + 
"/input1.bin").c_str(), src0FileSize, src0Host, fileSize)) { + std::fprintf(stderr, "[ERROR] failed to read %s/input1.bin\n", caseDir.c_str()); + rc = 1; + } + if (rc == 0 && !ReadFile((caseDir + "/input2.bin").c_str(), src1FileSize, src1Host, fileSize)) { + std::fprintf(stderr, "[ERROR] failed to read %s/input2.bin\n", caseDir.c_str()); + rc = 1; + } + + if (rc == 0) { + aclrtMemcpy(src0Device, fileSize, src0Host, fileSize, ACL_MEMCPY_HOST_TO_DEVICE); + aclrtMemcpy(src1Device, fileSize, src1Host, fileSize, ACL_MEMCPY_HOST_TO_DEVICE); + + tc.launch(src0Device, src1Device, dstDevice, stream); + + aclrtSynchronizeStream(stream); + aclrtMemcpy(dstHost, fileSize, dstDevice, fileSize, ACL_MEMCPY_DEVICE_TO_HOST); + } + + if (rc == 0 && !WriteFile((caseDir + "/output.bin").c_str(), dstHost, fileSize)) { + std::fprintf(stderr, "[ERROR] failed to write %s/output.bin\n", caseDir.c_str()); + rc = 1; + } + + if (src0Device != nullptr) + aclrtFree(src0Device); + if (src1Device != nullptr) + aclrtFree(src1Device); + if (dstDevice != nullptr) + aclrtFree(dstDevice); + if (src0Host != nullptr) + aclrtFreeHost(src0Host); + if (src1Host != nullptr) + aclrtFreeHost(src1Host); + if (dstHost != nullptr) + aclrtFreeHost(dstHost); + + if (rc == 0) + std::printf("[INFO] case %s done\n", tc.name); + return rc; +} + +int main(int argc, char *argv[]) { + // Optional case filter: ./tsub [case_name] + const char *caseFilter = (argc > 1) ? 
argv[1] : nullptr; + + int rc = 0; + int deviceId = 0; + aclrtStream stream = nullptr; + + aclInit(nullptr); + if (const char *envDevice = std::getenv("ACL_DEVICE_ID")) { + deviceId = std::atoi(envDevice); + } + aclrtSetDevice(deviceId); + aclrtCreateStream(&stream); + + for (size_t i = 0; i < kNumCases; ++i) { + if (caseFilter != nullptr && std::strcmp(kCases[i].name, caseFilter) != 0) { + continue; + } + int ret = RunCase(kCases[i], deviceId, stream); + if (ret != 0) { + std::fprintf(stderr, "[ERROR] case %s failed\n", kCases[i].name); + rc = 1; + break; + } + } + + if (stream != nullptr) + aclrtDestroyStream(stream); + aclrtResetDevice(deviceId); + aclFinalize(); + + return rc; +} diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tsub/tsub.pto b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tsub/tsub.pto new file mode 100644 index 000000000..d9d9b27c3 --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tsub/tsub.pto @@ -0,0 +1,124 @@ +// Copyright (c) 2026 Huawei Technologies Co., Ltd. +// This program is free software, you can redistribute it and/or modify it under the terms and conditions of +// CANN Open Software License Agreement Version 2.0 (the "License"). +// Please refer to the License for details. You may not use this file except in compliance with the License. +// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +// See LICENSE in the root of the software repository for the full text of the License. + +// TileLang ST kernels for pto.tsub: tload(a) + tload(b) + tsub(a,b)->c + tstore(c). +// Multiple cases with different shapes in a single module. +// Compiled by ptoas --enable-insert-sync --enable-tile-op-expand --pto-backend=vpto +// to produce a fatobj object. 
+ +module attributes {pto.target_arch = "a5", pto.kernel_kind = #pto.kernel_kind} { + // Case 0: f32 16x64 (1024 elements) + func.func @TSUB_f32_16x64(%a_ptr: !pto.ptr, %b_ptr: !pto.ptr, %c_ptr: !pto.ptr) attributes {pto.kernel} { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c16 = arith.constant 16 : index + %c64 = arith.constant 64 : index + %c1024 = arith.constant 1024 : index + + %a_view = pto.make_tensor_view %a_ptr, + shape = [%c1, %c1, %c1, %c16, %c64], + strides = [%c1024, %c1024, %c1024, %c64, %c1] + : !pto.tensor_view<1x1x1x16x64xf32> + %b_view = pto.make_tensor_view %b_ptr, + shape = [%c1, %c1, %c1, %c16, %c64], + strides = [%c1024, %c1024, %c1024, %c64, %c1] + : !pto.tensor_view<1x1x1x16x64xf32> + %c_view = pto.make_tensor_view %c_ptr, + shape = [%c1, %c1, %c1, %c16, %c64], + strides = [%c1024, %c1024, %c1024, %c64, %c1] + : !pto.tensor_view<1x1x1x16x64xf32> + + %a_part = pto.partition_view %a_view, + offsets = [%c0, %c0, %c0, %c0, %c0], + sizes = [%c1, %c1, %c1, %c16, %c64] + : !pto.tensor_view<1x1x1x16x64xf32> -> !pto.partition_tensor_view<1x1x1x16x64xf32> + %b_part = pto.partition_view %b_view, + offsets = [%c0, %c0, %c0, %c0, %c0], + sizes = [%c1, %c1, %c1, %c16, %c64] + : !pto.tensor_view<1x1x1x16x64xf32> -> !pto.partition_tensor_view<1x1x1x16x64xf32> + %c_part = pto.partition_view %c_view, + offsets = [%c0, %c0, %c0, %c0, %c0], + sizes = [%c1, %c1, %c1, %c16, %c64] + : !pto.tensor_view<1x1x1x16x64xf32> -> !pto.partition_tensor_view<1x1x1x16x64xf32> + + %a = pto.alloc_tile + : !pto.tile_buf + %b = pto.alloc_tile + : !pto.tile_buf + %c = pto.alloc_tile + : !pto.tile_buf + + pto.tload ins(%a_part : !pto.partition_tensor_view<1x1x1x16x64xf32>) + outs(%a : !pto.tile_buf) + pto.tload ins(%b_part : !pto.partition_tensor_view<1x1x1x16x64xf32>) + outs(%b : !pto.tile_buf) + + pto.tsub ins(%a, %b : !pto.tile_buf, + !pto.tile_buf) + outs(%c : !pto.tile_buf) + + pto.tstore ins(%c : !pto.tile_buf) + outs(%c_part : 
!pto.partition_tensor_view<1x1x1x16x64xf32>) + return + } + + // Case 1: f32 32x32 (1024 elements) + + func.func @TSUB_f32_32x32(%a_ptr: !pto.ptr, %b_ptr: !pto.ptr, %c_ptr: !pto.ptr) attributes {pto.kernel} { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c32 = arith.constant 32 : index + %c1024 = arith.constant 1024 : index + + %a_view = pto.make_tensor_view %a_ptr, + shape = [%c1, %c1, %c1, %c32, %c32], + strides = [%c1024, %c1024, %c1024, %c32, %c1] + : !pto.tensor_view<1x1x1x32x32xf32> + %b_view = pto.make_tensor_view %b_ptr, + shape = [%c1, %c1, %c1, %c32, %c32], + strides = [%c1024, %c1024, %c1024, %c32, %c1] + : !pto.tensor_view<1x1x1x32x32xf32> + %c_view = pto.make_tensor_view %c_ptr, + shape = [%c1, %c1, %c1, %c32, %c32], + strides = [%c1024, %c1024, %c1024, %c32, %c1] + : !pto.tensor_view<1x1x1x32x32xf32> + + %a_part = pto.partition_view %a_view, + offsets = [%c0, %c0, %c0, %c0, %c0], + sizes = [%c1, %c1, %c1, %c32, %c32] + : !pto.tensor_view<1x1x1x32x32xf32> -> !pto.partition_tensor_view<1x1x1x32x32xf32> + %b_part = pto.partition_view %b_view, + offsets = [%c0, %c0, %c0, %c0, %c0], + sizes = [%c1, %c1, %c1, %c32, %c32] + : !pto.tensor_view<1x1x1x32x32xf32> -> !pto.partition_tensor_view<1x1x1x32x32xf32> + %c_part = pto.partition_view %c_view, + offsets = [%c0, %c0, %c0, %c0, %c0], + sizes = [%c1, %c1, %c1, %c32, %c32] + : !pto.tensor_view<1x1x1x32x32xf32> -> !pto.partition_tensor_view<1x1x1x32x32xf32> + + %a = pto.alloc_tile + : !pto.tile_buf + %b = pto.alloc_tile + : !pto.tile_buf + %c = pto.alloc_tile + : !pto.tile_buf + + pto.tload ins(%a_part : !pto.partition_tensor_view<1x1x1x32x32xf32>) + outs(%a : !pto.tile_buf) + pto.tload ins(%b_part : !pto.partition_tensor_view<1x1x1x32x32xf32>) + outs(%b : !pto.tile_buf) + + pto.tsub ins(%a, %b : !pto.tile_buf, + !pto.tile_buf) + outs(%c : !pto.tile_buf) + + pto.tstore ins(%c : !pto.tile_buf) + outs(%c_part : !pto.partition_tensor_view<1x1x1x32x32xf32>) + return + } +} diff --git 
a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tsubs/CMakeLists.txt b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tsubs/CMakeLists.txt new file mode 100644 index 000000000..3ccdb0fb1 --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tsubs/CMakeLists.txt @@ -0,0 +1,9 @@ +# Copyright (c) 2026 Huawei Technologies Co., Ltd. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. + +pto_tilelang_vec_st(tsubs) diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tsubs/cases.py b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tsubs/cases.py new file mode 100644 index 000000000..93ff384cb --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tsubs/cases.py @@ -0,0 +1,29 @@ +# Copyright (c) 2026 Huawei Technologies Co., Ltd. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. 
+ +# coding=utf-8 + +"""Single source of truth for tsubs ST test cases.""" + +import numpy as np + +CASES = [ + {"name": "f32_32x64", "dtype": np.float32, "shape": (32, 64), "valid_shape": (32, 64), "eps": 1e-6}, + {"name": "f16_63x64", "dtype": np.float16, "shape": (63, 64), "valid_shape": (63, 64), "eps": 1e-3}, + {"name": "i32_31x128", "dtype": np.int32, "shape": (31, 128), "valid_shape": (31, 128), "eps": 0}, + {"name": "i16_15x192", "dtype": np.int16, "shape": (15, 192), "valid_shape": (15, 192), "eps": 0}, + {"name": "f32_7x448", "dtype": np.float32, "shape": (7, 448), "valid_shape": (7, 448), "eps": 1e-6}, + {"name": "f32_256x16", "dtype": np.float32, "shape": (256, 16), "valid_shape": (256, 16), "eps": 1e-6}, +] + +_SMOKE_CASE_NAMES = ['f32_32x64', 'i16_15x192'] +_SMOKE_CASE_NAME_SET = set(_SMOKE_CASE_NAMES) +_missing = [name for name in _SMOKE_CASE_NAMES if name not in {case["name"] for case in CASES}] +if _missing: + raise RuntimeError("unknown smoke case(s): " + ", ".join(_missing)) +CASES = [case for case in CASES if case["name"] in _SMOKE_CASE_NAME_SET] diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tsubs/compare.py b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tsubs/compare.py new file mode 100644 index 000000000..50186777e --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tsubs/compare.py @@ -0,0 +1,46 @@ +# Copyright (c) 2026 Huawei Technologies Co., Ltd. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. 
+# See LICENSE in the root of the software repository for the full text of the License. + +# coding=utf-8 + +import os +import sys +import numpy as np + +from cases import CASES +from st_common import result_cmp, style_fail, style_pass, validate_cases + +def main(): + validate_cases(CASES) + case_filter = sys.argv[1] if len(sys.argv) > 1 else None + + all_passed = True + for case in CASES: + if case_filter is not None and case["name"] != case_filter: + continue + + case_dir = case["name"] + shape = case["shape"] + vr, vc = case["valid_shape"] + + golden = np.fromfile(os.path.join(case_dir, "golden.bin"), dtype=case["dtype"]).reshape(shape) + output = np.fromfile(os.path.join(case_dir, "output.bin"), dtype=case["dtype"]).reshape(shape) + + ok = result_cmp(golden[:vr, :vc], output[:vr, :vc], case["eps"]) + if ok: + print(style_pass(f"[INFO] {case['name']}: compare passed")) + else: + print(style_fail(f"[ERROR] {case['name']}: compare failed")) + all_passed = False + + if not all_passed: + sys.exit(2) + print(style_pass("[INFO] all cases passed")) + +if __name__ == "__main__": + main() diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tsubs/gen_data.py b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tsubs/gen_data.py new file mode 100644 index 000000000..20d55e1d6 --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tsubs/gen_data.py @@ -0,0 +1,35 @@ +# Copyright (c) 2026 Huawei Technologies Co., Ltd. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. 
+# See LICENSE in the root of the software repository for the full text of the License. + +# coding=utf-8 + +import numpy as np +from cases import CASES +from st_common import validate_cases, setup_case_rng, save_case_data + +# Scalar value subtracted from every element (matches the scalar passed in launch.cpp) +SCALAR = 3.0 + +validate_cases(CASES) + +for case in CASES: + setup_case_rng(case) + + dtype = case["dtype"] + shape = case["shape"] + valid_shape = case["valid_shape"] + + input1 = np.random.randint(1, 10, size=shape).astype(dtype) + + golden = np.zeros(shape, dtype=dtype) + vr, vc = valid_shape + scalar_val = dtype(SCALAR) + golden[:vr, :vc] = (input1[:vr, :vc] - scalar_val).astype(dtype, copy=False) + + save_case_data(case["name"], {"input1": input1, "golden": golden}) + print(f"[INFO] gen_data: {case['name']} shape={shape} valid_shape={valid_shape} dtype={dtype.__name__} scalar={SCALAR}") diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tsubs/launch.cpp b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tsubs/launch.cpp new file mode 100644 index 000000000..1e78c7d4b --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tsubs/launch.cpp @@ -0,0 +1,31 @@ +// Copyright (c) 2026 Huawei Technologies Co., Ltd. +// This program is free software, you can redistribute it and/or modify it under the terms and conditions of +// CANN Open Software License Agreement Version 2.0 (the "License"). +// Please refer to the License for details. You may not use this file except in compliance with the License. +// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +// See LICENSE in the root of the software repository for the full text of the License. 
+ +#include + +#ifndef AICORE +#define AICORE [aicore] +#endif + +// Scalar value subtracted from every element (must match gen_data.py SCALAR) +static constexpr float TSUBS_SCALAR_F32 = 3.0f; + +// Case 0: f32 32x64 + +extern "C" __global__ AICORE void TSUBS_f32_32x64(__gm__ float *src, __gm__ float *dst, float scalar); +extern "C" __global__ AICORE void TSUBS_i16_15x192(__gm__ int16_t *src, __gm__ int16_t *dst, int16_t scalar); + +void LaunchTSUBS_i16_15x192(int16_t *src, int16_t *dst, void *stream) { + TSUBS_i16_15x192<<<1, nullptr, stream>>>((__gm__ int16_t *)src, (__gm__ int16_t *)dst, (int16_t)3); +} + + + +void LaunchTSUBS_f32_32x64(float *src, float *dst, void *stream) { + TSUBS_f32_32x64<<<1, nullptr, stream>>>((__gm__ float *)src, (__gm__ float *)dst, TSUBS_SCALAR_F32); +} diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tsubs/main.cpp b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tsubs/main.cpp new file mode 100644 index 000000000..c68c15131 --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tsubs/main.cpp @@ -0,0 +1,133 @@ +// Copyright (c) 2026 Huawei Technologies Co., Ltd. +// This program is free software, you can redistribute it and/or modify it under the terms and conditions of +// CANN Open Software License Agreement Version 2.0 (the "License"). +// Please refer to the License for details. You may not use this file except in compliance with the License. +// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +// See LICENSE in the root of the software repository for the full text of the License. + +// Host driver for TileLang tsubs ST — case-table driven. +// tsubs: dst = src - scalar (single input + scalar). +// Numerical comparison is done externally by compare.py. 
+ +#include "acl/acl.h" +#include "test_common.h" +#include +#include +#include +#include +#include +#include + +using namespace PtoTestCommon; + +// Kernel launch wrappers (defined in launch.cpp) +void LaunchTSUBS_f32_32x64(float *src, float *dst, void *stream); +void LaunchTSUBS_f16_63x64(uint16_t *src, uint16_t *dst, void *stream); +void LaunchTSUBS_i16_15x192(int16_t *src, int16_t *dst, void *stream); +void LaunchTSUBS_f32_256x16(float *src, float *dst, void *stream); + +struct TestCase { + const char *name; + void (*launch)(void *, void *, void *); // src, dst, stream + size_t rows; // allocated tile rows + size_t cols; // allocated tile cols + size_t validRows; // effective computation rows (<= rows) + size_t validCols; // effective computation cols (<= cols) + size_t elemSize; // bytes per element +}; + +static const TestCase kCases[] = { +{"f32_32x64", (void (*)(void*,void*,void*))LaunchTSUBS_f32_32x64, 32, 64, 32, 64, sizeof(float)}, +{"i16_15x192", (void (*)(void*,void*,void*))LaunchTSUBS_i16_15x192, 15, 192, 15, 192, sizeof(int16_t)}, +}; +static constexpr size_t kNumCases = sizeof(kCases) / sizeof(kCases[0]); + +static int RunCase(const TestCase &tc, int deviceId, aclrtStream stream) { + int rc = 0; + const size_t elemCount = tc.rows * tc.cols; + const size_t fileSize = elemCount * tc.elemSize; + + std::printf("[INFO] === case: %s (shape=%zux%zu, valid=%zux%zu) ===\n", + tc.name, tc.rows, tc.cols, tc.validRows, tc.validCols); + + // Per-case data directory + std::string caseDir = std::string("./") + tc.name; + size_t srcFileSize = fileSize; + + void *srcHost = nullptr, *dstHost = nullptr; + void *srcDevice = nullptr, *dstDevice = nullptr; + + aclrtMallocHost(&srcHost, fileSize); + aclrtMallocHost(&dstHost, fileSize); + + aclrtMalloc(&srcDevice, fileSize, ACL_MEM_MALLOC_HUGE_FIRST); + aclrtMalloc(&dstDevice, fileSize, ACL_MEM_MALLOC_HUGE_FIRST); + + if (!ReadFile((caseDir + "/input1.bin").c_str(), srcFileSize, srcHost, fileSize)) { + std::fprintf(stderr, 
"[ERROR] failed to read %s/input1.bin\n", caseDir.c_str()); + rc = 1; + } + + if (rc == 0) { + aclrtMemcpy(srcDevice, fileSize, srcHost, fileSize, ACL_MEMCPY_HOST_TO_DEVICE); + + tc.launch(srcDevice, dstDevice, stream); + + aclrtSynchronizeStream(stream); + aclrtMemcpy(dstHost, fileSize, dstDevice, fileSize, ACL_MEMCPY_DEVICE_TO_HOST); + } + + if (rc == 0 && !WriteFile((caseDir + "/output.bin").c_str(), dstHost, fileSize)) { + std::fprintf(stderr, "[ERROR] failed to write %s/output.bin\n", caseDir.c_str()); + rc = 1; + } + + if (srcDevice != nullptr) + aclrtFree(srcDevice); + if (dstDevice != nullptr) + aclrtFree(dstDevice); + if (srcHost != nullptr) + aclrtFreeHost(srcHost); + if (dstHost != nullptr) + aclrtFreeHost(dstHost); + + if (rc == 0) + std::printf("[INFO] case %s done\n", tc.name); + return rc; +} + +int main(int argc, char *argv[]) { + // Optional case filter: ./tsubs [case_name] + const char *caseFilter = (argc > 1) ? argv[1] : nullptr; + + int rc = 0; + int deviceId = 0; + aclrtStream stream = nullptr; + + aclInit(nullptr); + if (const char *envDevice = std::getenv("ACL_DEVICE_ID")) { + deviceId = std::atoi(envDevice); + } + aclrtSetDevice(deviceId); + aclrtCreateStream(&stream); + + for (size_t i = 0; i < kNumCases; ++i) { + if (caseFilter != nullptr && std::strcmp(kCases[i].name, caseFilter) != 0) { + continue; + } + int ret = RunCase(kCases[i], deviceId, stream); + if (ret != 0) { + std::fprintf(stderr, "[ERROR] case %s failed\n", kCases[i].name); + rc = 1; + break; + } + } + + if (stream != nullptr) + aclrtDestroyStream(stream); + aclrtResetDevice(deviceId); + aclFinalize(); + + return rc; +} diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/tsubs/tsubs.pto b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tsubs/tsubs.pto new file mode 100644 index 000000000..657f4b1cd --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/tsubs/tsubs.pto @@ -0,0 +1,98 @@ +// Copyright (c) 2026 Huawei Technologies Co., Ltd. 
+// This program is free software, you can redistribute it and/or modify it under the terms and conditions of +// CANN Open Software License Agreement Version 2.0 (the "License"). +// Please refer to the License for details. You may not use this file except in compliance with the License. +// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +// See LICENSE in the root of the software repository for the full text of the License. + +// TileLang ST kernels for pto.tsubs: tload(src) + tsubs(src, scalar)->dst + tstore(dst). +// Multiple cases with different shapes/dtypes in a single module. +// Compiled by ptoas --enable-insert-sync --enable-tile-op-expand --pto-backend=vpto +// to produce a fatobj object. + +module attributes {pto.target_arch = "a5", pto.kernel_kind = #pto.kernel_kind} { + + // Case 0: f32 32x64 (2048 elements) + func.func @TSUBS_f32_32x64(%src_ptr: !pto.ptr, %dst_ptr: !pto.ptr, %scalar: f32) attributes {pto.kernel} { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c32 = arith.constant 32 : index + %c64 = arith.constant 64 : index + %c2048 = arith.constant 2048 : index + + %src_view = pto.make_tensor_view %src_ptr, + shape = [%c1, %c1, %c1, %c32, %c64], + strides = [%c2048, %c2048, %c2048, %c64, %c1] + : !pto.tensor_view<1x1x1x32x64xf32> + %dst_view = pto.make_tensor_view %dst_ptr, + shape = [%c1, %c1, %c1, %c32, %c64], + strides = [%c2048, %c2048, %c2048, %c64, %c1] + : !pto.tensor_view<1x1x1x32x64xf32> + + %src_part = pto.partition_view %src_view, + offsets = [%c0, %c0, %c0, %c0, %c0], + sizes = [%c1, %c1, %c1, %c32, %c64] + : !pto.tensor_view<1x1x1x32x64xf32> + %dst_part = pto.partition_view %dst_view, + offsets = [%c0, %c0, %c0, %c0, %c0], + sizes = [%c1, %c1, %c1, %c32, %c64] + : !pto.tensor_view<1x1x1x32x64xf32> ->
!pto.partition_tensor_view<1x1x1x32x64xf32> + + %src = pto.alloc_tile + : !pto.tile_buf + %dst = pto.alloc_tile + : !pto.tile_buf + + pto.tload ins(%src_part : !pto.partition_tensor_view<1x1x1x32x64xf32>) + outs(%src : !pto.tile_buf) + pto.tsubs ins(%src, %scalar : !pto.tile_buf, f32) + outs(%dst : !pto.tile_buf) + pto.tstore ins(%dst : !pto.tile_buf) + outs(%dst_part : !pto.partition_tensor_view<1x1x1x32x64xf32>) + return + } + + // Case 1: i16 15x192 (2880 elements) + + func.func @TSUBS_i16_15x192(%src_ptr: !pto.ptr, %dst_ptr: !pto.ptr, %scalar: i16) attributes {pto.kernel} { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c15 = arith.constant 15 : index + %c192 = arith.constant 192 : index + %c2880 = arith.constant 2880 : index + + %src_view = pto.make_tensor_view %src_ptr, + shape = [%c1, %c1, %c1, %c15, %c192], + strides = [%c2880, %c2880, %c2880, %c192, %c1] + : !pto.tensor_view<1x1x1x15x192xi16> + %dst_view = pto.make_tensor_view %dst_ptr, + shape = [%c1, %c1, %c1, %c15, %c192], + strides = [%c2880, %c2880, %c2880, %c192, %c1] + : !pto.tensor_view<1x1x1x15x192xi16> + + %src_part = pto.partition_view %src_view, + offsets = [%c0, %c0, %c0, %c0, %c0], + sizes = [%c1, %c1, %c1, %c15, %c192] + : !pto.tensor_view<1x1x1x15x192xi16> -> !pto.partition_tensor_view<1x1x1x15x192xi16> + %dst_part = pto.partition_view %dst_view, + offsets = [%c0, %c0, %c0, %c0, %c0], + sizes = [%c1, %c1, %c1, %c15, %c192] + : !pto.tensor_view<1x1x1x15x192xi16> -> !pto.partition_tensor_view<1x1x1x15x192xi16> + + %src = pto.alloc_tile + : !pto.tile_buf + %dst = pto.alloc_tile + : !pto.tile_buf + + pto.tload ins(%src_part : !pto.partition_tensor_view<1x1x1x15x192xi16>) + outs(%src : !pto.tile_buf) + pto.tsubs ins(%src, %scalar : !pto.tile_buf, i16) + outs(%dst : !pto.tile_buf) + pto.tstore ins(%dst : !pto.tile_buf) + outs(%dst_part : !pto.partition_tensor_view<1x1x1x15x192xi16>) + return + } + + // Case 4: f32 7x448 (3136 elements) +} diff --git
a/test/tilelang_st/npu/a5/src/st/smoke/testcase/txor/CMakeLists.txt b/test/tilelang_st/npu/a5/src/st/smoke/testcase/txor/CMakeLists.txt new file mode 100644 index 000000000..1aa07dbd3 --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/txor/CMakeLists.txt @@ -0,0 +1,9 @@ +# Copyright (c) 2026 Huawei Technologies Co., Ltd. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. + +pto_tilelang_vec_st(txor) diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/txor/cases.py b/test/tilelang_st/npu/a5/src/st/smoke/testcase/txor/cases.py new file mode 100644 index 000000000..ae36d63bd --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/txor/cases.py @@ -0,0 +1,47 @@ +# Copyright (c) 2026 Huawei Technologies Co., Ltd. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. + +# coding=utf-8 + +"""Single source of truth for txor ST test cases. 
+ +Each case defines: + - name: case identifier, used as subdirectory name and by main.cpp kCases[]. + - dtype: numpy dtype (e.g. np.int32). + - shape: (rows, cols) — allocated tile dimensions. + - valid_shape: (valid_rows, valid_cols) — effective computation region. + - eps: tolerance for numpy.allclose (atol and rtol). + +gen_data.py and compare.py both import this list to avoid redundant definitions. +""" + +import numpy as np + +CASES = [ + { + "name": "i32_16x64", + "dtype": np.int32, + "shape": (16, 64), + "valid_shape": (16, 64), + "eps": 0, + }, + { + "name": "i32_32x32", + "dtype": np.int32, + "shape": (32, 32), + "valid_shape": (32, 32), + "eps": 0, + }, +] + +_SMOKE_CASE_NAMES = ['i32_16x64', 'i32_32x32'] +_SMOKE_CASE_NAME_SET = set(_SMOKE_CASE_NAMES) +_missing = [name for name in _SMOKE_CASE_NAMES if name not in {case["name"] for case in CASES}] +if _missing: + raise RuntimeError("unknown smoke case(s): " + ", ".join(_missing)) +CASES = [case for case in CASES if case["name"] in _SMOKE_CASE_NAME_SET] diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/txor/compare.py b/test/tilelang_st/npu/a5/src/st/smoke/testcase/txor/compare.py new file mode 100644 index 000000000..6a4d5d1aa --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/txor/compare.py @@ -0,0 +1,48 @@ +# Copyright (c) 2026 Huawei Technologies Co., Ltd. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. 
+ +# coding=utf-8 + +import os +import sys +import numpy as np + +from cases import CASES +from st_common import result_cmp, style_fail, style_pass, validate_cases + + +def main(): + validate_cases(CASES) + case_filter = sys.argv[1] if len(sys.argv) > 1 else None + + all_passed = True + for case in CASES: + if case_filter is not None and case["name"] != case_filter: + continue + + case_dir = case["name"] + shape = case["shape"] + vr, vc = case["valid_shape"] + + golden = np.fromfile(os.path.join(case_dir, "golden.bin"), dtype=case["dtype"]).reshape(shape) + output = np.fromfile(os.path.join(case_dir, "output.bin"), dtype=case["dtype"]).reshape(shape) + + ok = result_cmp(golden[:vr, :vc], output[:vr, :vc], case["eps"]) + if ok: + print(style_pass(f"[INFO] {case['name']}: compare passed")) + else: + print(style_fail(f"[ERROR] {case['name']}: compare failed")) + all_passed = False + + if not all_passed: + sys.exit(2) + print(style_pass("[INFO] all cases passed")) + + +if __name__ == "__main__": + main() diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/txor/gen_data.py b/test/tilelang_st/npu/a5/src/st/smoke/testcase/txor/gen_data.py new file mode 100644 index 000000000..4b912635e --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/txor/gen_data.py @@ -0,0 +1,32 @@ +# Copyright (c) 2026 Huawei Technologies Co., Ltd. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. 
+ +# coding=utf-8 + +import numpy as np +from cases import CASES +from st_common import validate_cases, setup_case_rng, save_case_data + +validate_cases(CASES) + +for case in CASES: + setup_case_rng(case) + + dtype = case["dtype"] + shape = case["shape"] + valid_shape = case["valid_shape"] + + input1 = np.random.randint(0, 100, size=shape).astype(dtype) + input2 = np.random.randint(0, 100, size=shape).astype(dtype) + + golden = np.zeros(shape, dtype=dtype) + vr, vc = valid_shape + golden[:vr, :vc] = (input1[:vr, :vc] ^ input2[:vr, :vc]).astype(dtype, copy=False) + + save_case_data(case["name"], {"input1": input1, "input2": input2, "golden": golden}) + print(f"[INFO] gen_data: {case['name']} shape={shape} valid_shape={valid_shape} dtype={dtype.__name__}") diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/txor/launch.cpp b/test/tilelang_st/npu/a5/src/st/smoke/testcase/txor/launch.cpp new file mode 100644 index 000000000..eb8a6ebd3 --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/txor/launch.cpp @@ -0,0 +1,26 @@ +// Copyright (c) 2026 Huawei Technologies Co., Ltd. +// This program is free software, you can redistribute it and/or modify it under the terms and conditions of +// CANN Open Software License Agreement Version 2.0 (the "License"). +// Please refer to the License for details. You may not use this file except in compliance with the License. +// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +// See LICENSE in the root of the software repository for the full text of the License. 
+ +#include + +#ifndef AICORE +#define AICORE [aicore] +#endif + +// Case 0: i32 16x64 + +extern "C" __global__ AICORE void TXOR_i32_16x64(__gm__ int32_t *a, __gm__ int32_t *b, __gm__ int32_t *c); +extern "C" __global__ AICORE void TXOR_i32_32x32(__gm__ int32_t *a, __gm__ int32_t *b, __gm__ int32_t *c); + +void LaunchTXOR_i32_32x32(int32_t *a, int32_t *b, int32_t *c, void *stream) { + TXOR_i32_32x32<<<1, nullptr, stream>>>((__gm__ int32_t *)a, (__gm__ int32_t *)b, (__gm__ int32_t *)c); +} + +void LaunchTXOR_i32_16x64(int32_t *a, int32_t *b, int32_t *c, void *stream) { + TXOR_i32_16x64<<<1, nullptr, stream>>>((__gm__ int32_t *)a, (__gm__ int32_t *)b, (__gm__ int32_t *)c); +} diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/txor/main.cpp b/test/tilelang_st/npu/a5/src/st/smoke/testcase/txor/main.cpp new file mode 100644 index 000000000..9f8729c80 --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/txor/main.cpp @@ -0,0 +1,145 @@ +// Copyright (c) 2026 Huawei Technologies Co., Ltd. +// This program is free software, you can redistribute it and/or modify it under the terms and conditions of +// CANN Open Software License Agreement Version 2.0 (the "License"). +// Please refer to the License for details. You may not use this file except in compliance with the License. +// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +// See LICENSE in the root of the software repository for the full text of the License. + +// Host driver for TileLang txor ST — case-table driven. +// Each case launches a different kernel variant, reads/writes from per-case subdirectory. +// Numerical comparison is done externally by compare.py. 
+ +#include "acl/acl.h" +#include "test_common.h" +#include +#include +#include +#include +#include +#include + +using namespace PtoTestCommon; + +// Kernel launch wrappers (defined in launch.cpp) +void LaunchTXOR_i32_16x64(int32_t *a, int32_t *b, int32_t *c, void *stream); +void LaunchTXOR_i32_32x32(int32_t *a, int32_t *b, int32_t *c, void *stream); + +using LaunchFn = void (*)(int32_t *, int32_t *, int32_t *, void *); + +struct TestCase { + const char *name; + LaunchFn launch; + size_t rows; // allocated tile rows + size_t cols; // allocated tile cols + size_t validRows; // effective computation rows (<= rows) + size_t validCols; // effective computation cols (<= cols) + size_t elemSize; // bytes per element +}; + +static const TestCase kCases[] = { +{"i32_16x64", LaunchTXOR_i32_16x64, 16, 64, 16, 64, sizeof(int32_t)}, +{"i32_32x32", LaunchTXOR_i32_32x32, 32, 32, 32, 32, sizeof(int32_t)}, +}; +static constexpr size_t kNumCases = sizeof(kCases) / sizeof(kCases[0]); + +static int RunCase(const TestCase &tc, int deviceId, aclrtStream stream) { + int rc = 0; + const size_t elemCount = tc.rows * tc.cols; + const size_t fileSize = elemCount * tc.elemSize; + + std::printf("[INFO] === case: %s (shape=%zux%zu, valid=%zux%zu) ===\n", + tc.name, tc.rows, tc.cols, tc.validRows, tc.validCols); + + // Per-case data directory + std::string caseDir = std::string("./") + tc.name; + size_t src0FileSize = fileSize; + size_t src1FileSize = fileSize; + + int32_t *src0Host = nullptr, *src1Host = nullptr, *dstHost = nullptr; + int32_t *src0Device = nullptr, *src1Device = nullptr, *dstDevice = nullptr; + + aclrtMallocHost((void **)(&src0Host), fileSize); + aclrtMallocHost((void **)(&src1Host), fileSize); + aclrtMallocHost((void **)(&dstHost), fileSize); + + aclrtMalloc((void **)&src0Device, fileSize, ACL_MEM_MALLOC_HUGE_FIRST); + aclrtMalloc((void **)&src1Device, fileSize, ACL_MEM_MALLOC_HUGE_FIRST); + aclrtMalloc((void **)&dstDevice, fileSize, ACL_MEM_MALLOC_HUGE_FIRST); + + if 
(!ReadFile((caseDir + "/input1.bin").c_str(), src0FileSize, src0Host, fileSize)) { + std::fprintf(stderr, "[ERROR] failed to read %s/input1.bin\n", caseDir.c_str()); + rc = 1; + } + if (rc == 0 && !ReadFile((caseDir + "/input2.bin").c_str(), src1FileSize, src1Host, fileSize)) { + std::fprintf(stderr, "[ERROR] failed to read %s/input2.bin\n", caseDir.c_str()); + rc = 1; + } + + if (rc == 0) { + aclrtMemcpy(src0Device, fileSize, src0Host, fileSize, ACL_MEMCPY_HOST_TO_DEVICE); + aclrtMemcpy(src1Device, fileSize, src1Host, fileSize, ACL_MEMCPY_HOST_TO_DEVICE); + + tc.launch(src0Device, src1Device, dstDevice, stream); + + aclrtSynchronizeStream(stream); + aclrtMemcpy(dstHost, fileSize, dstDevice, fileSize, ACL_MEMCPY_DEVICE_TO_HOST); + } + + if (rc == 0 && !WriteFile((caseDir + "/output.bin").c_str(), dstHost, fileSize)) { + std::fprintf(stderr, "[ERROR] failed to write %s/output.bin\n", caseDir.c_str()); + rc = 1; + } + + if (src0Device != nullptr) + aclrtFree(src0Device); + if (src1Device != nullptr) + aclrtFree(src1Device); + if (dstDevice != nullptr) + aclrtFree(dstDevice); + if (src0Host != nullptr) + aclrtFreeHost(src0Host); + if (src1Host != nullptr) + aclrtFreeHost(src1Host); + if (dstHost != nullptr) + aclrtFreeHost(dstHost); + + if (rc == 0) + std::printf("[INFO] case %s done\n", tc.name); + return rc; +} + +int main(int argc, char *argv[]) { + // Optional case filter: ./txor [case_name] + const char *caseFilter = (argc > 1) ? 
argv[1] : nullptr; + + int rc = 0; + int deviceId = 0; + aclrtStream stream = nullptr; + + aclInit(nullptr); + if (const char *envDevice = std::getenv("ACL_DEVICE_ID")) { + deviceId = std::atoi(envDevice); + } + aclrtSetDevice(deviceId); + aclrtCreateStream(&stream); + + for (size_t i = 0; i < kNumCases; ++i) { + if (caseFilter != nullptr && std::strcmp(kCases[i].name, caseFilter) != 0) { + continue; + } + int ret = RunCase(kCases[i], deviceId, stream); + if (ret != 0) { + std::fprintf(stderr, "[ERROR] case %s failed\n", kCases[i].name); + rc = 1; + break; + } + } + + if (stream != nullptr) + aclrtDestroyStream(stream); + aclrtResetDevice(deviceId); + aclFinalize(); + + return rc; +} diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/txor/txor.pto b/test/tilelang_st/npu/a5/src/st/smoke/testcase/txor/txor.pto new file mode 100644 index 000000000..d3445f0d4 --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/txor/txor.pto @@ -0,0 +1,126 @@ +// Copyright (c) 2026 Huawei Technologies Co., Ltd. +// This program is free software, you can redistribute it and/or modify it under the terms and conditions of +// CANN Open Software License Agreement Version 2.0 (the "License"). +// Please refer to the License for details. You may not use this file except in compliance with the License. +// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +// See LICENSE in the root of the software repository for the full text of the License. + +// TileLang ST kernels for pto.txor: tload(a) + tload(b) + txor(a,b)->c + tstore(c). +// Multiple cases with different shapes in a single module. +// Compiled by ptoas --enable-insert-sync --enable-tile-op-expand --pto-backend=vpto +// to produce a fatobj object. 
+ +module attributes {pto.target_arch = "a5", pto.kernel_kind = #pto.kernel_kind} { + // Case 0: i32 16x64 (1024 elements) + func.func @TXOR_i32_16x64(%a_ptr: !pto.ptr, %b_ptr: !pto.ptr, %c_ptr: !pto.ptr) attributes {pto.kernel} { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c16 = arith.constant 16 : index + %c64 = arith.constant 64 : index + %c1024 = arith.constant 1024 : index + + %a_view = pto.make_tensor_view %a_ptr, + shape = [%c1, %c1, %c1, %c16, %c64], + strides = [%c1024, %c1024, %c1024, %c64, %c1] + : !pto.tensor_view<1x1x1x16x64xi32> + %b_view = pto.make_tensor_view %b_ptr, + shape = [%c1, %c1, %c1, %c16, %c64], + strides = [%c1024, %c1024, %c1024, %c64, %c1] + : !pto.tensor_view<1x1x1x16x64xi32> + %c_view = pto.make_tensor_view %c_ptr, + shape = [%c1, %c1, %c1, %c16, %c64], + strides = [%c1024, %c1024, %c1024, %c64, %c1] + : !pto.tensor_view<1x1x1x16x64xi32> + + %a_part = pto.partition_view %a_view, + offsets = [%c0, %c0, %c0, %c0, %c0], + sizes = [%c1, %c1, %c1, %c16, %c64] + : !pto.tensor_view<1x1x1x16x64xi32> -> !pto.partition_tensor_view<1x1x1x16x64xi32> + %b_part = pto.partition_view %b_view, + offsets = [%c0, %c0, %c0, %c0, %c0], + sizes = [%c1, %c1, %c1, %c16, %c64] + : !pto.tensor_view<1x1x1x16x64xi32> -> !pto.partition_tensor_view<1x1x1x16x64xi32> + %c_part = pto.partition_view %c_view, + offsets = [%c0, %c0, %c0, %c0, %c0], + sizes = [%c1, %c1, %c1, %c16, %c64] + : !pto.tensor_view<1x1x1x16x64xi32> -> !pto.partition_tensor_view<1x1x1x16x64xi32> + + %a = pto.alloc_tile + : !pto.tile_buf + %b = pto.alloc_tile + : !pto.tile_buf + %c = pto.alloc_tile + : !pto.tile_buf + + pto.tload ins(%a_part : !pto.partition_tensor_view<1x1x1x16x64xi32>) + outs(%a : !pto.tile_buf) + pto.tload ins(%b_part : !pto.partition_tensor_view<1x1x1x16x64xi32>) + outs(%b : !pto.tile_buf) + + pto.txor ins(%a, %b, %c : !pto.tile_buf, + !pto.tile_buf, + !pto.tile_buf) + outs(%c : !pto.tile_buf) + + pto.tstore ins(%c : !pto.tile_buf) + outs(%c_part : 
!pto.partition_tensor_view<1x1x1x16x64xi32>) + return + } + + // Case 1: i32 32x32 (1024 elements) + + func.func @TXOR_i32_32x32(%a_ptr: !pto.ptr, %b_ptr: !pto.ptr, %c_ptr: !pto.ptr) attributes {pto.kernel} { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c32 = arith.constant 32 : index + %c1024 = arith.constant 1024 : index + + %a_view = pto.make_tensor_view %a_ptr, + shape = [%c1, %c1, %c1, %c32, %c32], + strides = [%c1024, %c1024, %c1024, %c32, %c1] + : !pto.tensor_view<1x1x1x32x32xi32> + %b_view = pto.make_tensor_view %b_ptr, + shape = [%c1, %c1, %c1, %c32, %c32], + strides = [%c1024, %c1024, %c1024, %c32, %c1] + : !pto.tensor_view<1x1x1x32x32xi32> + %c_view = pto.make_tensor_view %c_ptr, + shape = [%c1, %c1, %c1, %c32, %c32], + strides = [%c1024, %c1024, %c1024, %c32, %c1] + : !pto.tensor_view<1x1x1x32x32xi32> + + %a_part = pto.partition_view %a_view, + offsets = [%c0, %c0, %c0, %c0, %c0], + sizes = [%c1, %c1, %c1, %c32, %c32] + : !pto.tensor_view<1x1x1x32x32xi32> -> !pto.partition_tensor_view<1x1x1x32x32xi32> + %b_part = pto.partition_view %b_view, + offsets = [%c0, %c0, %c0, %c0, %c0], + sizes = [%c1, %c1, %c1, %c32, %c32] + : !pto.tensor_view<1x1x1x32x32xi32> -> !pto.partition_tensor_view<1x1x1x32x32xi32> + %c_part = pto.partition_view %c_view, + offsets = [%c0, %c0, %c0, %c0, %c0], + sizes = [%c1, %c1, %c1, %c32, %c32] + : !pto.tensor_view<1x1x1x32x32xi32> -> !pto.partition_tensor_view<1x1x1x32x32xi32> + + %a = pto.alloc_tile + : !pto.tile_buf + %b = pto.alloc_tile + : !pto.tile_buf + %c = pto.alloc_tile + : !pto.tile_buf + + pto.tload ins(%a_part : !pto.partition_tensor_view<1x1x1x32x32xi32>) + outs(%a : !pto.tile_buf) + pto.tload ins(%b_part : !pto.partition_tensor_view<1x1x1x32x32xi32>) + outs(%b : !pto.tile_buf) + + pto.txor ins(%a, %b, %c : !pto.tile_buf, + !pto.tile_buf, + !pto.tile_buf) + outs(%c : !pto.tile_buf) + + pto.tstore ins(%c : !pto.tile_buf) + outs(%c_part : !pto.partition_tensor_view<1x1x1x32x32xi32>) + return + } +} 
diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/txors/CMakeLists.txt b/test/tilelang_st/npu/a5/src/st/smoke/testcase/txors/CMakeLists.txt new file mode 100644 index 000000000..1bcd9e681 --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/txors/CMakeLists.txt @@ -0,0 +1,9 @@ +# Copyright (c) 2026 Huawei Technologies Co., Ltd. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. + +pto_tilelang_vec_st(txors) diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/txors/cases.py b/test/tilelang_st/npu/a5/src/st/smoke/testcase/txors/cases.py new file mode 100644 index 000000000..4eb1f9b8b --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/txors/cases.py @@ -0,0 +1,55 @@ +# Copyright (c) 2026 Huawei Technologies Co., Ltd. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. + +# coding=utf-8 + +"""Single source of truth for txors ST test cases. 
+ +txors: bitwise XOR with scalar, dst = src ^ scalar. +Integer only: i32, i16. +""" + +import numpy as np + +CASES = [ + { + "name": "i32_32x64", + "dtype": np.int32, + "shape": (32, 64), + "valid_shape": (32, 64), + "eps": 0, + }, + { + "name": "i16_63x64", + "dtype": np.int16, + "shape": (63, 64), + "valid_shape": (63, 64), + "eps": 0, + }, + { + "name": "i32_31x128", + "dtype": np.int32, + "shape": (31, 128), + "valid_shape": (31, 128), + "eps": 0, + }, + { + "name": "i16_15x192", + "dtype": np.int16, + "shape": (15, 192), + "valid_shape": (15, 192), + "eps": 0, + }, +] + +_SMOKE_CASE_NAMES = ['i32_32x64', 'i16_15x192'] +_SMOKE_CASE_NAME_SET = set(_SMOKE_CASE_NAMES) +_missing = [name for name in _SMOKE_CASE_NAMES if name not in {case["name"] for case in CASES}] +if _missing: + raise RuntimeError("unknown smoke case(s): " + ", ".join(_missing)) +CASES = [case for case in CASES if case["name"] in _SMOKE_CASE_NAME_SET] diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/txors/compare.py b/test/tilelang_st/npu/a5/src/st/smoke/testcase/txors/compare.py new file mode 100644 index 000000000..50186777e --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/txors/compare.py @@ -0,0 +1,46 @@ +# Copyright (c) 2026 Huawei Technologies Co., Ltd. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. 
+ +# coding=utf-8 + +import os +import sys +import numpy as np + +from cases import CASES +from st_common import result_cmp, style_fail, style_pass, validate_cases + +def main(): + validate_cases(CASES) + case_filter = sys.argv[1] if len(sys.argv) > 1 else None + + all_passed = True + for case in CASES: + if case_filter is not None and case["name"] != case_filter: + continue + + case_dir = case["name"] + shape = case["shape"] + vr, vc = case["valid_shape"] + + golden = np.fromfile(os.path.join(case_dir, "golden.bin"), dtype=case["dtype"]).reshape(shape) + output = np.fromfile(os.path.join(case_dir, "output.bin"), dtype=case["dtype"]).reshape(shape) + + ok = result_cmp(golden[:vr, :vc], output[:vr, :vc], case["eps"]) + if ok: + print(style_pass(f"[INFO] {case['name']}: compare passed")) + else: + print(style_fail(f"[ERROR] {case['name']}: compare failed")) + all_passed = False + + if not all_passed: + sys.exit(2) + print(style_pass("[INFO] all cases passed")) + +if __name__ == "__main__": + main() diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/txors/gen_data.py b/test/tilelang_st/npu/a5/src/st/smoke/testcase/txors/gen_data.py new file mode 100644 index 000000000..5c12edd5a --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/txors/gen_data.py @@ -0,0 +1,35 @@ +# Copyright (c) 2026 Huawei Technologies Co., Ltd. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. 
+ +# coding=utf-8 + +import numpy as np +from cases import CASES +from st_common import validate_cases, setup_case_rng, save_case_data + +# Scalar value for bitwise XOR (matches the scalar passed in launch.cpp) +SCALAR = 3 + +validate_cases(CASES) + +for case in CASES: + setup_case_rng(case) + + dtype = case["dtype"] + shape = case["shape"] + valid_shape = case["valid_shape"] + + input1 = np.random.randint(1, 10, size=shape).astype(dtype) + + golden = np.zeros(shape, dtype=dtype) + vr, vc = valid_shape + scalar_val = dtype(SCALAR) + golden[:vr, :vc] = (input1[:vr, :vc] ^ scalar_val).astype(dtype, copy=False) + + save_case_data(case["name"], {"input1": input1, "golden": golden}) + print(f"[INFO] gen_data: {case['name']} shape={shape} valid_shape={valid_shape} dtype={dtype.__name__} scalar={SCALAR}") diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/txors/launch.cpp b/test/tilelang_st/npu/a5/src/st/smoke/testcase/txors/launch.cpp new file mode 100644 index 000000000..9edaf56a5 --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/txors/launch.cpp @@ -0,0 +1,28 @@ +// Copyright (c) 2026 Huawei Technologies Co., Ltd. +// This program is free software, you can redistribute it and/or modify it under the terms and conditions of +// CANN Open Software License Agreement Version 2.0 (the "License"). +// Please refer to the License for details. You may not use this file except in compliance with the License. +// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +// See LICENSE in the root of the software repository for the full text of the License. 
+ +#include + +#ifndef AICORE +#define AICORE [aicore] +#endif + +// Case 0: i32 32x64 + +extern "C" __global__ AICORE void TXORS_i32_32x64(__gm__ int32_t *src, __gm__ int32_t *dst, int32_t scalar); +extern "C" __global__ AICORE void TXORS_i16_15x192(__gm__ int16_t *src, __gm__ int16_t *dst, int16_t scalar); + +void LaunchTXORS_i32_32x64(int32_t *src, int32_t *dst, void *stream) { + TXORS_i32_32x64<<<1, nullptr, stream>>>((__gm__ int32_t *)src, (__gm__ int32_t *)dst, (int32_t)3); +} + + + +void LaunchTXORS_i16_15x192(int16_t *src, int16_t *dst, void *stream) { + TXORS_i16_15x192<<<1, nullptr, stream>>>((__gm__ int16_t *)src, (__gm__ int16_t *)dst, (int16_t)3); +} diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/txors/main.cpp b/test/tilelang_st/npu/a5/src/st/smoke/testcase/txors/main.cpp new file mode 100644 index 000000000..547614df0 --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/txors/main.cpp @@ -0,0 +1,132 @@ +// Copyright (c) 2026 Huawei Technologies Co., Ltd. +// This program is free software, you can redistribute it and/or modify it under the terms and conditions of +// CANN Open Software License Agreement Version 2.0 (the "License"). +// Please refer to the License for details. You may not use this file except in compliance with the License. +// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +// See LICENSE in the root of the software repository for the full text of the License. + +// Host driver for TileLang txors ST — case-table driven. +// txors: dst = src ^ scalar (bitwise XOR with scalar). +// Numerical comparison is done externally by compare.py. 
+ +#include "acl/acl.h" +#include "test_common.h" +#include +#include +#include +#include +#include +#include + +using namespace PtoTestCommon; + +// Kernel launch wrappers (defined in launch.cpp) +void LaunchTXORS_i32_32x64(int32_t *src, int32_t *dst, void *stream); +void LaunchTXORS_i16_63x64(int16_t *src, int16_t *dst, void *stream); +void LaunchTXORS_i16_15x192(int16_t *src, int16_t *dst, void *stream); + +struct TestCase { + const char *name; + void (*launch)(void *, void *, void *); // src, dst, stream + size_t rows; // allocated tile rows + size_t cols; // allocated tile cols + size_t validRows; // effective computation rows (<= rows) + size_t validCols; // effective computation cols (<= cols) + size_t elemSize; // bytes per element +}; + +static const TestCase kCases[] = { +{"i32_32x64", (void (*)(void*,void*,void*))LaunchTXORS_i32_32x64, 32, 64, 32, 64, sizeof(int32_t)}, +{"i16_15x192", (void (*)(void*,void*,void*))LaunchTXORS_i16_15x192, 15, 192, 15, 192, sizeof(int16_t)}, +}; +static constexpr size_t kNumCases = sizeof(kCases) / sizeof(kCases[0]); + +static int RunCase(const TestCase &tc, int deviceId, aclrtStream stream) { + int rc = 0; + const size_t elemCount = tc.rows * tc.cols; + const size_t fileSize = elemCount * tc.elemSize; + + std::printf("[INFO] === case: %s (shape=%zux%zu, valid=%zux%zu) ===\n", + tc.name, tc.rows, tc.cols, tc.validRows, tc.validCols); + + // Per-case data directory + std::string caseDir = std::string("./") + tc.name; + size_t srcFileSize = fileSize; + + void *srcHost = nullptr, *dstHost = nullptr; + void *srcDevice = nullptr, *dstDevice = nullptr; + + aclrtMallocHost(&srcHost, fileSize); + aclrtMallocHost(&dstHost, fileSize); + + aclrtMalloc(&srcDevice, fileSize, ACL_MEM_MALLOC_HUGE_FIRST); + aclrtMalloc(&dstDevice, fileSize, ACL_MEM_MALLOC_HUGE_FIRST); + + if (!ReadFile((caseDir + "/input1.bin").c_str(), srcFileSize, srcHost, fileSize)) { + std::fprintf(stderr, "[ERROR] failed to read %s/input1.bin\n", caseDir.c_str()); + 
rc = 1; + } + + if (rc == 0) { + aclrtMemcpy(srcDevice, fileSize, srcHost, fileSize, ACL_MEMCPY_HOST_TO_DEVICE); + + tc.launch(srcDevice, dstDevice, stream); + + aclrtSynchronizeStream(stream); + aclrtMemcpy(dstHost, fileSize, dstDevice, fileSize, ACL_MEMCPY_DEVICE_TO_HOST); + } + + if (rc == 0 && !WriteFile((caseDir + "/output.bin").c_str(), dstHost, fileSize)) { + std::fprintf(stderr, "[ERROR] failed to write %s/output.bin\n", caseDir.c_str()); + rc = 1; + } + + if (srcDevice != nullptr) + aclrtFree(srcDevice); + if (dstDevice != nullptr) + aclrtFree(dstDevice); + if (srcHost != nullptr) + aclrtFreeHost(srcHost); + if (dstHost != nullptr) + aclrtFreeHost(dstHost); + + if (rc == 0) + std::printf("[INFO] case %s done\n", tc.name); + return rc; +} + +int main(int argc, char *argv[]) { + // Optional case filter: ./txors [case_name] + const char *caseFilter = (argc > 1) ? argv[1] : nullptr; + + int rc = 0; + int deviceId = 0; + aclrtStream stream = nullptr; + + aclInit(nullptr); + if (const char *envDevice = std::getenv("ACL_DEVICE_ID")) { + deviceId = std::atoi(envDevice); + } + aclrtSetDevice(deviceId); + aclrtCreateStream(&stream); + + for (size_t i = 0; i < kNumCases; ++i) { + if (caseFilter != nullptr && std::strcmp(kCases[i].name, caseFilter) != 0) { + continue; + } + int ret = RunCase(kCases[i], deviceId, stream); + if (ret != 0) { + std::fprintf(stderr, "[ERROR] case %s failed\n", kCases[i].name); + rc = 1; + break; + } + } + + if (stream != nullptr) + aclrtDestroyStream(stream); + aclrtResetDevice(deviceId); + aclFinalize(); + + return rc; +} diff --git a/test/tilelang_st/npu/a5/src/st/smoke/testcase/txors/txors.pto b/test/tilelang_st/npu/a5/src/st/smoke/testcase/txors/txors.pto new file mode 100644 index 000000000..c42ef3285 --- /dev/null +++ b/test/tilelang_st/npu/a5/src/st/smoke/testcase/txors/txors.pto @@ -0,0 +1,103 @@ +// Copyright (c) 2026 Huawei Technologies Co., Ltd. 
+// This program is free software, you can redistribute it and/or modify it under the terms and conditions of +// CANN Open Software License Agreement Version 2.0 (the "License"). +// Please refer to the License for details. You may not use this file except in compliance with the License. +// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +// See LICENSE in the root of the software repository for the full text of the License. + +// TileLang ST kernels for pto.txors: tload(src) + txors(src, scalar, tmp)->dst + tstore(dst). +// Bitwise XOR with scalar: dst = src ^ scalar. +// Integer only: i32, i16. +// Compiled by ptoas --enable-insert-sync --enable-tile-op-expand --pto-backend=vpto +// to produce a fatobj object. + +module attributes {pto.target_arch = "a5", pto.kernel_kind = #pto.kernel_kind} { + + // Case 0: i32 32x64 (2048 elements) + func.func @TXORS_i32_32x64(%src_ptr: !pto.ptr, %dst_ptr: !pto.ptr, %scalar: i32) attributes {pto.kernel} { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c32 = arith.constant 32 : index + %c64 = arith.constant 64 : index + %c2048 = arith.constant 2048 : index + + %src_view = pto.make_tensor_view %src_ptr, + shape = [%c1, %c1, %c1, %c32, %c64], + strides = [%c2048, %c2048, %c2048, %c64, %c1] + : !pto.tensor_view<1x1x1x32x64xi32> + %dst_view = pto.make_tensor_view %dst_ptr, + shape = [%c1, %c1, %c1, %c32, %c64], + strides = [%c2048, %c2048, %c2048, %c64, %c1] + : !pto.tensor_view<1x1x1x32x64xi32> + + %src_part = pto.partition_view %src_view, + offsets = [%c0, %c0, %c0, %c0, %c0], + sizes = [%c1, %c1, %c1, %c32, %c64] + : !pto.tensor_view<1x1x1x32x64xi32> -> !pto.partition_tensor_view<1x1x1x32x64xi32> + %dst_part = pto.partition_view %dst_view, + offsets = [%c0, %c0, %c0, %c0, %c0], + sizes = [%c1, %c1, %c1, %c32, %c64] + : 
!pto.tensor_view<1x1x1x32x64xi32> -> !pto.partition_tensor_view<1x1x1x32x64xi32> + + %src = pto.alloc_tile + : !pto.tile_buf + %dst = pto.alloc_tile + : !pto.tile_buf + %tmp = pto.alloc_tile + : !pto.tile_buf + + pto.tload ins(%src_part : !pto.partition_tensor_view<1x1x1x32x64xi32>) + outs(%src : !pto.tile_buf) + pto.txors ins(%src, %scalar, %tmp : !pto.tile_buf, i32, + !pto.tile_buf) + outs(%dst : !pto.tile_buf) + pto.tstore ins(%dst : !pto.tile_buf) + outs(%dst_part : !pto.partition_tensor_view<1x1x1x32x64xi32>) + return + } + + // Case 1: i16 15x192 (2880 elements) + + func.func @TXORS_i16_15x192(%src_ptr: !pto.ptr, %dst_ptr: !pto.ptr, %scalar: i16) attributes {pto.kernel} { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c15 = arith.constant 15 : index + %c192 = arith.constant 192 : index + %c2880 = arith.constant 2880 : index + + %src_view = pto.make_tensor_view %src_ptr, + shape = [%c1, %c1, %c1, %c15, %c192], + strides = [%c2880, %c2880, %c2880, %c192, %c1] + : !pto.tensor_view<1x1x1x15x192xi16> + %dst_view = pto.make_tensor_view %dst_ptr, + shape = [%c1, %c1, %c1, %c15, %c192], + strides = [%c2880, %c2880, %c2880, %c192, %c1] + : !pto.tensor_view<1x1x1x15x192xi16> + + %src_part = pto.partition_view %src_view, + offsets = [%c0, %c0, %c0, %c0, %c0], + sizes = [%c1, %c1, %c1, %c15, %c192] + : !pto.tensor_view<1x1x1x15x192xi16> -> !pto.partition_tensor_view<1x1x1x15x192xi16> + %dst_part = pto.partition_view %dst_view, + offsets = [%c0, %c0, %c0, %c0, %c0], + sizes = [%c1, %c1, %c1, %c15, %c192] + : !pto.tensor_view<1x1x1x15x192xi16> -> !pto.partition_tensor_view<1x1x1x15x192xi16> + + %src = pto.alloc_tile + : !pto.tile_buf + %dst = pto.alloc_tile + : !pto.tile_buf + %tmp = pto.alloc_tile + : !pto.tile_buf + + pto.tload ins(%src_part : !pto.partition_tensor_view<1x1x1x15x192xi16>) + outs(%src : !pto.tile_buf) + pto.txors ins(%src, %scalar, %tmp : !pto.tile_buf, i16, + !pto.tile_buf) + outs(%dst : !pto.tile_buf) + pto.tstore ins(%dst : 
!pto.tile_buf) + outs(%dst_part : !pto.partition_tensor_view<1x1x1x15x192xi16>) + return + } +} diff --git a/test/tilelang_st/npu/a5/src/st/testcase/CMakeLists.txt b/test/tilelang_st/npu/a5/src/st/testcase/CMakeLists.txt index 9bc5ad33e..5d2dc27b9 100644 --- a/test/tilelang_st/npu/a5/src/st/testcase/CMakeLists.txt +++ b/test/tilelang_st/npu/a5/src/st/testcase/CMakeLists.txt @@ -16,6 +16,11 @@ # 3. Builds host executable from main.cpp (no GTest — comparison via compare.py) # -------------------------------------------------------------------------- set(PTO_TILELANG_ST_TESTCASE_DIR ${CMAKE_CURRENT_LIST_DIR}) +if(DEFINED ENV{TILELANG_ST_SIM_LIB_DIR}) + set(PTO_TILELANG_ST_SIM_LIB_DIR $ENV{TILELANG_ST_SIM_LIB_DIR}) +else() + set(PTO_TILELANG_ST_SIM_LIB_DIR ${ASCEND_HOME_PATH}/tools/simulator/${SOC_VERSION}/lib) +endif() function(pto_tilelang_st NAME) set(options DISABLE_INSERT_SYNC) @@ -78,7 +83,7 @@ function(pto_tilelang_st NAME) target_link_directories(${NAME} PUBLIC ${ASCEND_HOME_PATH}/lib64 - ${ASCEND_HOME_PATH}/tools/simulator/${SOC_VERSION}/lib + ${PTO_TILELANG_ST_SIM_LIB_DIR} ) target_link_libraries(${NAME} PRIVATE diff --git a/test/tilelang_st/script/run_all_st.py b/test/tilelang_st/script/run_all_st.py index e1b47c54f..cdc9a686f 100755 --- a/test/tilelang_st/script/run_all_st.py +++ b/test/tilelang_st/script/run_all_st.py @@ -24,9 +24,6 @@ "a5": "Ascend950PR_9599", } -SMOKE_CASE_LIMIT = 1 - - def discover_testcases(testcase_root): testcases = [] for entry in sorted(os.listdir(testcase_root)): @@ -56,7 +53,16 @@ def resolve_case_filters(testcase_root, testcase, smoke_mode): case_names = load_case_names(testcase_root, testcase) if not case_names: raise ValueError(f"no cases found for smoke testcase: {testcase}") - return case_names[:SMOKE_CASE_LIMIT] + return [] + + +def resolve_smoke_case_names(testcase_root, testcase, smoke_mode): + if not smoke_mode: + return [] + case_names = load_case_names(testcase_root, testcase) + if not case_names: + raise 
ValueError(f"no cases found for smoke testcase: {testcase}") + return case_names def parse_args(): @@ -122,7 +128,9 @@ def resolve_selected_testcases(all_testcases, requested): return requested_set -def run_testcase_subprocess(run_st_script_path, run_mode, soc_version, ptoas_bin, testcase, case_filters=None): +def run_testcase_subprocess( + run_st_script_path, run_mode, soc_version, ptoas_bin, target_dir, testcase, case_filters=None +): command = [ sys.executable, run_st_script_path, @@ -130,6 +138,7 @@ def run_testcase_subprocess(run_st_script_path, run_mode, soc_version, ptoas_bin "-v", soc_version, "-t", testcase, "-p", ptoas_bin, + "--target-dir", target_dir, "-w", ] for case_filter in case_filters or []: @@ -163,10 +172,15 @@ def main(): batch_script_path = os.path.abspath(__file__) run_st_script_path = os.path.abspath(run_st.__file__) tilelang_st_root = os.path.dirname(os.path.dirname(batch_script_path)) - testcase_root = os.path.join( - tilelang_st_root, "npu", args.soc_version, "src", "st", "testcase" - ) - target_dir = os.path.dirname(testcase_root) + st_root = os.path.join(tilelang_st_root, "npu", args.soc_version, "src", "st") + full_testcase_root = os.path.join(st_root, "testcase") + smoke_testcase_root = os.path.join(st_root, "smoke", "testcase") + if args.smoke: + testcase_root = smoke_testcase_root + target_dir = os.path.dirname(smoke_testcase_root) + else: + testcase_root = full_testcase_root + target_dir = st_root if not os.path.isdir(testcase_root): print(f"[ERROR] Testcase root not found: {testcase_root}", file=sys.stderr) @@ -213,17 +227,26 @@ def main(): run_st.set_env_variables(args.run_mode, default_soc_version) if not args.without_build: - build_target = "all" if selected_testcases == all_testcases else ";".join(selected_testcases) + if args.smoke: + build_target = selected_testcases[0] if len(selected_testcases) == 1 else "all" + else: + build_target = "all" if selected_testcases == all_testcases else ";".join(selected_testcases) 
print(f"[INFO] build requested for {build_target}") - run_st.build_project(args.run_mode, default_soc_version, "all", ptoas_bin) + run_st.build_project( + args.run_mode, + default_soc_version, + build_target if args.smoke else "all", + ptoas_bin, + ) total = len(selected_testcases) if args.jobs == 1: for index, testcase in enumerate(selected_testcases, start=1): case_filters = resolve_case_filters(testcase_root, testcase, args.smoke) + smoke_case_names = resolve_smoke_case_names(testcase_root, testcase, args.smoke) print(f"[INFO] [{index}/{total}] running testcase: {testcase}") - if case_filters: - print(f"[INFO] smoke cases: {', '.join(case_filters)}") + if smoke_case_names: + print(f"[INFO] smoke cases: {', '.join(smoke_case_names)}") try: run_st.run_gen_data(testcase, case_filters) run_st.run_binary(testcase, case_filters) @@ -241,15 +264,17 @@ def main(): future_to_testcase = {} for index, testcase in enumerate(selected_testcases, start=1): case_filters = resolve_case_filters(testcase_root, testcase, args.smoke) + smoke_case_names = resolve_smoke_case_names(testcase_root, testcase, args.smoke) print(f"[INFO] [{index}/{total}] queue testcase: {testcase}") - if case_filters: - print(f"[INFO] smoke cases: {', '.join(case_filters)}") + if smoke_case_names: + print(f"[INFO] smoke cases: {', '.join(smoke_case_names)}") future = executor.submit( run_testcase_subprocess, run_st_script_path, args.run_mode, args.soc_version, ptoas_bin, + target_dir, testcase, case_filters, ) diff --git a/test/tilelang_st/script/run_st.py b/test/tilelang_st/script/run_st.py index 44bf191ac..45826a692 100755 --- a/test/tilelang_st/script/run_st.py +++ b/test/tilelang_st/script/run_st.py @@ -104,6 +104,11 @@ def set_env_variables(run_mode, soc_version): simulator_lib_path = os.path.join( ascend_home, "tools", "simulator", soc_version, "lib" ) + sim_lib_dir = os.environ.get("SIM_LIB_DIR") + if sim_lib_dir: + simulator_lib_path = sim_lib_dir + os.environ["TILELANG_ST_SIM_LIB_DIR"] = 
simulator_lib_path + print(f"[INFO] SIM_LIB_DIR={simulator_lib_path}") os.environ["LD_LIBRARY_PATH"] = ( f"{simulator_lib_path}:{os.environ.get('LD_LIBRARY_PATH', '')}" ) @@ -268,6 +273,8 @@ def main(): help="Run one or more specific cases within the testcase. Can be passed multiple times.") parser.add_argument("-w", "--without-build", action="store_true", help="Skip build (requires prior build)") + parser.add_argument("--target-dir", required=False, + help="TileLang ST target directory. Defaults to npu//src/st.") args = parser.parse_args() @@ -292,7 +299,11 @@ def main(): try: script_path = os.path.abspath(__file__) tilelang_st_root = os.path.dirname(os.path.dirname(script_path)) - target_dir = os.path.join(tilelang_st_root, "npu", args.soc_version, "src", "st") + target_dir = args.target_dir or os.environ.get("TILELANG_ST_TARGET_DIR") + if target_dir: + target_dir = os.path.abspath(target_dir) + else: + target_dir = os.path.join(tilelang_st_root, "npu", args.soc_version, "src", "st") if not os.path.isdir(target_dir): print(f"[ERROR] Target dir not found: {target_dir}", file=sys.stderr) diff --git a/test/vpto/cases/micro-op/binary-vector/vadd-bf16/compare.py b/test/vpto/cases/micro-op/binary-vector/vadd-bf16/compare.py deleted file mode 100755 index 68ceff820..000000000 --- a/test/vpto/cases/micro-op/binary-vector/vadd-bf16/compare.py +++ /dev/null @@ -1,42 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) 2026 Huawei Technologies Co., Ltd. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. 
-# See LICENSE in the root of the software repository for the full text of the License. - -# case: micro-op/binary-vector/vadd-bf16 -# family: binary-vector -# target_ops: pto.vadd -# scenarios: core-bf16, full-mask -import os -import sys -import numpy as np - - -def compare_bin_prefix(golden_path, output_path, dtype, eps, count): - if not os.path.exists(golden_path) or not os.path.exists(output_path): - return False - golden = np.fromfile(golden_path, dtype=dtype, count=count) - output = np.fromfile(output_path, dtype=dtype, count=count) - return golden.shape == output.shape and np.allclose( - golden, output, atol=eps, rtol=eps, equal_nan=True - ) - - -def main(): - strict = os.getenv("COMPARE_STRICT", "1") != "0" - ok = compare_bin_prefix("golden_v3.bin", "v3.bin", np.uint16, 0, 1024) - if not ok: - if strict: - print("[ERROR] compare failed") - sys.exit(2) - print("[WARN] compare failed (non-gating)") - return - print("[INFO] compare passed") - - -if __name__ == "__main__": - main() diff --git a/test/vpto/cases/micro-op/binary-vector/vadd-bf16/golden.py b/test/vpto/cases/micro-op/binary-vector/vadd-bf16/golden.py deleted file mode 100755 index c1d417ba0..000000000 --- a/test/vpto/cases/micro-op/binary-vector/vadd-bf16/golden.py +++ /dev/null @@ -1,61 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) 2026 Huawei Technologies Co., Ltd. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. 
- -# case: micro-op/binary-vector/vadd-bf16 -# family: binary-vector -# target_ops: pto.vadd -# scenarios: core-bf16, full-mask -# coding=utf-8 - -import argparse -from pathlib import Path - -import numpy as np - - -ELEMS = 1024 -SEED = 19 - - -def f32_to_bf16_bits(values: np.ndarray) -> np.ndarray: - wide = values.astype(np.float32, copy=False).view(np.uint32) - rounding = np.uint32(0x7FFF) + ((wide >> 16) & np.uint32(1)) - return ((wide + rounding) >> 16).astype(np.uint16) - - -def bf16_bits_to_f32(bits: np.ndarray) -> np.ndarray: - return (bits.astype(np.uint32) << 16).view(np.float32) - - -def generate(output_dir: Path, seed: int) -> None: - rng = np.random.default_rng(seed) - v1_f32 = rng.uniform(-4.0, 4.0, size=ELEMS).astype(np.float32) - v2_f32 = rng.uniform(-4.0, 4.0, size=ELEMS).astype(np.float32) - v1 = f32_to_bf16_bits(v1_f32) - v2 = f32_to_bf16_bits(v2_f32) - v3 = np.zeros(ELEMS, dtype=np.uint16) - golden_v3 = f32_to_bf16_bits(bf16_bits_to_f32(v1) + bf16_bits_to_f32(v2)) - - output_dir.mkdir(parents=True, exist_ok=True) - v1.tofile(output_dir / "v1.bin") - v2.tofile(output_dir / "v2.bin") - v3.tofile(output_dir / "v3.bin") - golden_v3.tofile(output_dir / "golden_v3.bin") - - -def main() -> None: - parser = argparse.ArgumentParser() - parser.add_argument("--output-dir", type=Path, default=Path(".")) - parser.add_argument("--seed", type=int, default=SEED) - args = parser.parse_args() - generate(args.output_dir, args.seed) - - -if __name__ == "__main__": - main() diff --git a/test/vpto/cases/micro-op/binary-vector/vadd-bf16/kernel.pto b/test/vpto/cases/micro-op/binary-vector/vadd-bf16/kernel.pto deleted file mode 100644 index 653d327fd..000000000 --- a/test/vpto/cases/micro-op/binary-vector/vadd-bf16/kernel.pto +++ /dev/null @@ -1,53 +0,0 @@ -// ----------------------------------------------------------------------------- -// case: micro-op/binary-vector/vadd-bf16 -// family: binary-vector -// target_ops: pto.vadd -// scenarios: core-bf16, full-mask -// 
----------------------------------------------------------------------------- -module attributes {pto.target_arch = "a5", pto.kernel_kind = #pto.kernel_kind} { - func.func @vadd_bf16_kernel(%arg0: !pto.ptr, %arg1: !pto.ptr, - %arg2: !pto.ptr) attributes {pto.kernel} { - %c0 = arith.constant 0 : index - %c128 = arith.constant 128 : index - %c1024 = arith.constant 1024 : index - %c0_i64 = arith.constant 0 : i64 - %c1_i64 = arith.constant 1 : i64 - %c32_i64 = arith.constant 32 : i64 - %c64_i64 = arith.constant 64 : i64 - %c2048_i64 = arith.constant 2048 : i64 - %c4096_i64 = arith.constant 4096 : i64 - - %ub_lhs = pto.castptr %c0_i64 : i64 -> !pto.ptr - %ub_rhs = pto.castptr %c2048_i64 : i64 -> !pto.ptr - %ub_out = pto.castptr %c4096_i64 : i64 -> !pto.ptr - - %false = arith.constant false - pto.mte_gm_ub %arg0, %ub_lhs, %c0_i64, %c64_i64 - nburst(%c32_i64, %c64_i64, %c64_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 - pto.mte_gm_ub %arg1, %ub_rhs, %c0_i64, %c64_i64 - nburst(%c32_i64, %c64_i64, %c64_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 - - pto.set_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] - pto.wait_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] - - pto.vecscope { - %mask = pto.pset_b16 "PAT_ALL" : !pto.mask - scf.for %offset = %c0 to %c1024 step %c128 { - %lhs = pto.vlds %ub_lhs[%offset] : !pto.ptr -> !pto.vreg<128xbf16> - %rhs = pto.vlds %ub_rhs[%offset] : !pto.ptr -> !pto.vreg<128xbf16> - %sum = pto.vadd %lhs, %rhs, %mask : !pto.vreg<128xbf16>, !pto.vreg<128xbf16>, !pto.mask -> !pto.vreg<128xbf16> - pto.vsts %sum, %ub_out[%offset], %mask : !pto.vreg<128xbf16>, !pto.ptr, !pto.mask - } - } - - pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.mte_ub_gm %ub_out, %arg2, %c64_i64 - nburst(%c32_i64, %c64_i64, %c64_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64 - pto.barrier #pto.pipe - return - } -} diff --git a/test/vpto/cases/micro-op/binary-vector/vadd-bf16/launch.cpp 
b/test/vpto/cases/micro-op/binary-vector/vadd-bf16/launch.cpp deleted file mode 100644 index 13e50fe0b..000000000 --- a/test/vpto/cases/micro-op/binary-vector/vadd-bf16/launch.cpp +++ /dev/null @@ -1,53 +0,0 @@ -// Copyright (c) 2026 Huawei Technologies Co., Ltd. -// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). -// Please refer to the License for details. You may not use this file except in compliance with the License. -// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -// See LICENSE in the root of the software repository for the full text of the License. - -// ----------------------------------------------------------------------------- -// case: micro-op/binary-vector/vadd-bf16 -// family: binary-vector -// target_ops: pto.vadd -// scenarios: core-bf16, full-mask -// ----------------------------------------------------------------------------- -#ifndef __VEC_SCOPE__ -#define __VEC_SCOPE__ -#endif - -#if defined(__CCE_AICORE__) && defined(__NPU_ARCH__) && (__NPU_ARCH__ == 2201) -typedef struct { unsigned char v; } hifloat8_t; -typedef struct { unsigned char v; } float8_e4m3_t; -typedef struct { unsigned char v; } float8_e5m2_t; -typedef struct { unsigned char v; } float8_e8m0_t; -typedef struct { unsigned char v; } float4_e1m2x2_t; -typedef struct { unsigned char v; } float4_e2m1x2_t; -#endif -#include -#if defined(__CCE_AICORE__) && defined(PTOAS_ENABLE_CCE_PRINT) -#include -#endif - -#if !defined(__CCE_AICORE__) && !defined(TMRGSORT_HPP) -struct MrgSortExecutedNumList { - uint16_t mrgSortList0; - uint16_t mrgSortList1; - uint16_t mrgSortList2; - uint16_t mrgSortList3; -}; -#endif -#ifndef __CPU_SIM -#include "acl/acl.h" -#endif - -extern "C" __global__ [aicore] void 
vadd_bf16_kernel(__gm__ bfloat16_t *v1, - __gm__ bfloat16_t *v2, - __gm__ bfloat16_t *v3); - -void LaunchVadd_bf16_kernel(uint16_t *v1, uint16_t *v2, uint16_t *v3, - void *stream) { - vadd_bf16_kernel<<<1, nullptr, stream>>>((__gm__ bfloat16_t *)v1, - (__gm__ bfloat16_t *)v2, - (__gm__ bfloat16_t *)v3); -} diff --git a/test/vpto/cases/micro-op/binary-vector/vadd-bf16/main.cpp b/test/vpto/cases/micro-op/binary-vector/vadd-bf16/main.cpp deleted file mode 100644 index e130fecc0..000000000 --- a/test/vpto/cases/micro-op/binary-vector/vadd-bf16/main.cpp +++ /dev/null @@ -1,102 +0,0 @@ -// Copyright (c) 2026 Huawei Technologies Co., Ltd. -// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). -// Please refer to the License for details. You may not use this file except in compliance with the License. -// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -// See LICENSE in the root of the software repository for the full text of the License. - -// ----------------------------------------------------------------------------- -// case: micro-op/binary-vector/vadd-bf16 -// family: binary-vector -// target_ops: pto.vadd -// scenarios: core-bf16, full-mask -// ----------------------------------------------------------------------------- -/** -Copyright (c) 2025 Huawei Technologies Co., Ltd. 
-*/ - -#include "test_common.h" -#include "acl/acl.h" -#include -#include -#include - -using namespace PtoTestCommon; - -#define ACL_CHECK(expr) \ - do { \ - const aclError _ret = (expr); \ - if (_ret != ACL_SUCCESS) { \ - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", #expr, \ - (int)_ret, __FILE__, __LINE__); \ - rc = 1; \ - goto cleanup; \ - } \ - } while (0) - -void LaunchVadd_bf16_kernel(uint16_t *v1, uint16_t *v2, uint16_t *v3, - void *stream); - -int main() { - size_t elemCount_v1 = 1024; - size_t fileSize_v1 = elemCount_v1 * sizeof(uint16_t); - size_t elemCount_v2 = 1024; - size_t fileSize_v2 = elemCount_v2 * sizeof(uint16_t); - size_t elemCount_v3 = 1024; - size_t fileSize_v3 = elemCount_v3 * sizeof(uint16_t); - uint16_t *v1Host = nullptr; - uint16_t *v1Device = nullptr; - uint16_t *v2Host = nullptr; - uint16_t *v2Device = nullptr; - uint16_t *v3Host = nullptr; - uint16_t *v3Device = nullptr; - int rc = 0; - bool aclInited = false; - bool deviceSet = false; - int deviceId = 0; - aclrtStream stream = nullptr; - - ACL_CHECK(aclInit(nullptr)); - aclInited = true; - if (const char *envDevice = std::getenv("ACL_DEVICE_ID")) - deviceId = std::atoi(envDevice); - ACL_CHECK(aclrtSetDevice(deviceId)); - deviceSet = true; - ACL_CHECK(aclrtCreateStream(&stream)); - ACL_CHECK(aclrtMallocHost((void **)(&v1Host), fileSize_v1)); - ACL_CHECK(aclrtMallocHost((void **)(&v2Host), fileSize_v2)); - ACL_CHECK(aclrtMallocHost((void **)(&v3Host), fileSize_v3)); - ACL_CHECK(aclrtMalloc((void **)&v1Device, fileSize_v1, ACL_MEM_MALLOC_HUGE_FIRST)); - ACL_CHECK(aclrtMalloc((void **)&v2Device, fileSize_v2, ACL_MEM_MALLOC_HUGE_FIRST)); - ACL_CHECK(aclrtMalloc((void **)&v3Device, fileSize_v3, ACL_MEM_MALLOC_HUGE_FIRST)); - ReadFile("./v1.bin", fileSize_v1, v1Host, fileSize_v1); - ReadFile("./v2.bin", fileSize_v2, v2Host, fileSize_v2); - ReadFile("./v3.bin", fileSize_v3, v3Host, fileSize_v3); - ACL_CHECK(aclrtMemcpy(v1Device, fileSize_v1, v1Host, fileSize_v1, - 
ACL_MEMCPY_HOST_TO_DEVICE)); - ACL_CHECK(aclrtMemcpy(v2Device, fileSize_v2, v2Host, fileSize_v2, - ACL_MEMCPY_HOST_TO_DEVICE)); - ACL_CHECK(aclrtMemcpy(v3Device, fileSize_v3, v3Host, fileSize_v3, - ACL_MEMCPY_HOST_TO_DEVICE)); - LaunchVadd_bf16_kernel(v1Device, v2Device, v3Device, stream); - ACL_CHECK(aclrtSynchronizeStream(stream)); - ACL_CHECK(aclrtMemcpy(v3Host, fileSize_v3, v3Device, fileSize_v3, - ACL_MEMCPY_DEVICE_TO_HOST)); - WriteFile("./v3.bin", v3Host, fileSize_v3); - -cleanup: - aclrtFree(v1Device); - aclrtFree(v2Device); - aclrtFree(v3Device); - aclrtFreeHost(v1Host); - aclrtFreeHost(v2Host); - aclrtFreeHost(v3Host); - if (stream != nullptr) - aclrtDestroyStream(stream); - if (deviceSet) - aclrtResetDevice(deviceId); - if (aclInited) - aclFinalize(); - return rc; -} diff --git a/test/vpto/cases/micro-op/binary-vector/vadd-f16/compare.py b/test/vpto/cases/micro-op/binary-vector/vadd-f16/compare.py deleted file mode 100755 index 1254044fb..000000000 --- a/test/vpto/cases/micro-op/binary-vector/vadd-f16/compare.py +++ /dev/null @@ -1,42 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) 2026 Huawei Technologies Co., Ltd. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. 
- -# case: micro-op/binary-vector/vadd-f16 -# family: binary-vector -# target_ops: pto.vadd -# scenarios: core-f16, full-mask -import os -import sys -import numpy as np - - -def compare_bin_prefix(golden_path, output_path, dtype, eps, count): - if not os.path.exists(golden_path) or not os.path.exists(output_path): - return False - golden = np.fromfile(golden_path, dtype=dtype, count=count) - output = np.fromfile(output_path, dtype=dtype, count=count) - return golden.shape == output.shape and np.allclose( - golden, output, atol=eps, rtol=eps, equal_nan=True - ) - - -def main(): - strict = os.getenv("COMPARE_STRICT", "1") != "0" - ok = compare_bin_prefix("golden_v3.bin", "v3.bin", np.float16, 5e-3, 1024) - if not ok: - if strict: - print("[ERROR] compare failed") - sys.exit(2) - print("[WARN] compare failed (non-gating)") - return - print("[INFO] compare passed") - - -if __name__ == "__main__": - main() diff --git a/test/vpto/cases/micro-op/binary-vector/vadd-f16/golden.py b/test/vpto/cases/micro-op/binary-vector/vadd-f16/golden.py deleted file mode 100755 index 442cc35e7..000000000 --- a/test/vpto/cases/micro-op/binary-vector/vadd-f16/golden.py +++ /dev/null @@ -1,51 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) 2026 Huawei Technologies Co., Ltd. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. 
- -# case: micro-op/binary-vector/vadd-f16 -# family: binary-vector -# target_ops: pto.vadd -# scenarios: core-f16, full-mask -# coding=utf-8 - -import argparse -from pathlib import Path - -import numpy as np - - -ROWS = 32 -COLS = 32 -SEED = 19 -LOGICAL_ELEMS = 1000 - - -def generate(output_dir: Path, seed: int) -> None: - rng = np.random.default_rng(seed) - v1 = rng.uniform(-8.0, 8.0, size=(ROWS, COLS)).astype(np.float16) - v2 = rng.uniform(-8.0, 8.0, size=(ROWS, COLS)).astype(np.float16) - v3 = np.zeros((ROWS, COLS), dtype=np.float16) - golden_v3 = (v1.astype(np.float32) + v2.astype(np.float32)).astype(np.float16) - - output_dir.mkdir(parents=True, exist_ok=True) - v1.reshape(-1).tofile(output_dir / "v1.bin") - v2.reshape(-1).tofile(output_dir / "v2.bin") - v3.reshape(-1).tofile(output_dir / "v3.bin") - golden_v3.reshape(-1).tofile(output_dir / "golden_v3.bin") - - -def main() -> None: - parser = argparse.ArgumentParser() - parser.add_argument("--output-dir", type=Path, default=Path(".")) - parser.add_argument("--seed", type=int, default=SEED) - args = parser.parse_args() - generate(args.output_dir, args.seed) - - -if __name__ == "__main__": - main() diff --git a/test/vpto/cases/micro-op/binary-vector/vadd-f16/kernel.pto b/test/vpto/cases/micro-op/binary-vector/vadd-f16/kernel.pto deleted file mode 100644 index 705cb81ef..000000000 --- a/test/vpto/cases/micro-op/binary-vector/vadd-f16/kernel.pto +++ /dev/null @@ -1,53 +0,0 @@ -// ----------------------------------------------------------------------------- -// case: micro-op/binary-vector/vadd-f16 -// family: binary-vector -// target_ops: pto.vadd -// scenarios: core-f16, full-mask -// ----------------------------------------------------------------------------- -module attributes {pto.target_arch = "a5", pto.kernel_kind = #pto.kernel_kind} { - func.func @vadd_f16_kernel(%arg0: !pto.ptr, %arg1: !pto.ptr, - %arg2: !pto.ptr) attributes {pto.kernel} { - %c0 = arith.constant 0 : index - %c128 = arith.constant 128 : 
index - %c1024 = arith.constant 1024 : index - %c0_i64 = arith.constant 0 : i64 - %c1_i64 = arith.constant 1 : i64 - %c32_i64 = arith.constant 32 : i64 - %c64_i64 = arith.constant 64 : i64 - %c2048_i64 = arith.constant 2048 : i64 - %c4096_i64 = arith.constant 4096 : i64 - - %ub_lhs = pto.castptr %c0_i64 : i64 -> !pto.ptr - %ub_rhs = pto.castptr %c2048_i64 : i64 -> !pto.ptr - %ub_out = pto.castptr %c4096_i64 : i64 -> !pto.ptr - - %false = arith.constant false - pto.mte_gm_ub %arg0, %ub_lhs, %c0_i64, %c64_i64 - nburst(%c32_i64, %c64_i64, %c64_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 - pto.mte_gm_ub %arg1, %ub_rhs, %c0_i64, %c64_i64 - nburst(%c32_i64, %c64_i64, %c64_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 - - pto.set_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] - pto.wait_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] - - pto.vecscope { - %mask = pto.pset_b16 "PAT_ALL" : !pto.mask - scf.for %offset = %c0 to %c1024 step %c128 { - %lhs = pto.vlds %ub_lhs[%offset] : !pto.ptr -> !pto.vreg<128xf16> - %rhs = pto.vlds %ub_rhs[%offset] : !pto.ptr -> !pto.vreg<128xf16> - %sum = pto.vadd %lhs, %rhs, %mask : !pto.vreg<128xf16>, !pto.vreg<128xf16>, !pto.mask -> !pto.vreg<128xf16> - pto.vsts %sum, %ub_out[%offset], %mask : !pto.vreg<128xf16>, !pto.ptr, !pto.mask - } - } - - pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.mte_ub_gm %ub_out, %arg2, %c64_i64 - nburst(%c32_i64, %c64_i64, %c64_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64 - pto.barrier #pto.pipe - return - } -} diff --git a/test/vpto/cases/micro-op/binary-vector/vadd-f16/launch.cpp b/test/vpto/cases/micro-op/binary-vector/vadd-f16/launch.cpp deleted file mode 100644 index 8beb1a003..000000000 --- a/test/vpto/cases/micro-op/binary-vector/vadd-f16/launch.cpp +++ /dev/null @@ -1,52 +0,0 @@ -// Copyright (c) 2026 Huawei Technologies Co., Ltd. 
-// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). -// Please refer to the License for details. You may not use this file except in compliance with the License. -// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -// See LICENSE in the root of the software repository for the full text of the License. - -// ----------------------------------------------------------------------------- -// case: micro-op/binary-vector/vadd-f16 -// family: binary-vector -// target_ops: pto.vadd -// scenarios: core-f16, full-mask -// ----------------------------------------------------------------------------- -#ifndef __VEC_SCOPE__ -#define __VEC_SCOPE__ -#endif - -#if defined(__CCE_AICORE__) && defined(__NPU_ARCH__) && (__NPU_ARCH__ == 2201) -typedef struct { unsigned char v; } hifloat8_t; -typedef struct { unsigned char v; } float8_e4m3_t; -typedef struct { unsigned char v; } float8_e5m2_t; -typedef struct { unsigned char v; } float8_e8m0_t; -typedef struct { unsigned char v; } float4_e1m2x2_t; -typedef struct { unsigned char v; } float4_e2m1x2_t; -#endif -#include -#if defined(__CCE_AICORE__) && defined(PTOAS_ENABLE_CCE_PRINT) -#include -#endif - -#if !defined(__CCE_AICORE__) && !defined(TMRGSORT_HPP) -struct MrgSortExecutedNumList { - uint16_t mrgSortList0; - uint16_t mrgSortList1; - uint16_t mrgSortList2; - uint16_t mrgSortList3; -}; -#endif -#ifndef __CPU_SIM -#include "acl/acl.h" -#endif - -extern "C" __global__ [aicore] void vadd_f16_kernel(__gm__ half *v1, - __gm__ half *v2, - __gm__ half *v3); - -void LaunchVadd_f16_kernel(uint16_t *v1, uint16_t *v2, uint16_t *v3, - void *stream) { - vadd_f16_kernel<<<1, nullptr, stream>>>((__gm__ half *)v1, (__gm__ half *)v2, - (__gm__ half *)v3); -} diff 
--git a/test/vpto/cases/micro-op/binary-vector/vadd-f16/main.cpp b/test/vpto/cases/micro-op/binary-vector/vadd-f16/main.cpp deleted file mode 100644 index 621cf398a..000000000 --- a/test/vpto/cases/micro-op/binary-vector/vadd-f16/main.cpp +++ /dev/null @@ -1,101 +0,0 @@ -// Copyright (c) 2026 Huawei Technologies Co., Ltd. -// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). -// Please refer to the License for details. You may not use this file except in compliance with the License. -// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -// See LICENSE in the root of the software repository for the full text of the License. - -// ----------------------------------------------------------------------------- -// case: micro-op/binary-vector/vadd-f16 -// family: binary-vector -// target_ops: pto.vadd -// scenarios: core-f16, full-mask -// ----------------------------------------------------------------------------- -/** -Copyright (c) 2025 Huawei Technologies Co., Ltd. 
-*/ - -#include "test_common.h" -#include "acl/acl.h" -#include -#include - -using namespace PtoTestCommon; - -#define ACL_CHECK(expr) \ - do { \ - const aclError _ret = (expr); \ - if (_ret != ACL_SUCCESS) { \ - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", #expr, \ - (int)_ret, __FILE__, __LINE__); \ - rc = 1; \ - goto cleanup; \ - } \ - } while (0) - -void LaunchVadd_f16_kernel(uint16_t *v1, uint16_t *v2, uint16_t *v3, - void *stream); - -int main() { - size_t elemCount_v1 = 1024; - size_t fileSize_v1 = elemCount_v1 * sizeof(uint16_t); - size_t elemCount_v2 = 1024; - size_t fileSize_v2 = elemCount_v2 * sizeof(uint16_t); - size_t elemCount_v3 = 1024; - size_t fileSize_v3 = elemCount_v3 * sizeof(uint16_t); - uint16_t *v1Host = nullptr; - uint16_t *v1Device = nullptr; - uint16_t *v2Host = nullptr; - uint16_t *v2Device = nullptr; - uint16_t *v3Host = nullptr; - uint16_t *v3Device = nullptr; - int rc = 0; - bool aclInited = false; - bool deviceSet = false; - int deviceId = 0; - aclrtStream stream = nullptr; - - ACL_CHECK(aclInit(nullptr)); - aclInited = true; - if (const char *envDevice = std::getenv("ACL_DEVICE_ID")) - deviceId = std::atoi(envDevice); - ACL_CHECK(aclrtSetDevice(deviceId)); - deviceSet = true; - ACL_CHECK(aclrtCreateStream(&stream)); - ACL_CHECK(aclrtMallocHost((void **)(&v1Host), fileSize_v1)); - ACL_CHECK(aclrtMallocHost((void **)(&v2Host), fileSize_v2)); - ACL_CHECK(aclrtMallocHost((void **)(&v3Host), fileSize_v3)); - ACL_CHECK(aclrtMalloc((void **)&v1Device, fileSize_v1, ACL_MEM_MALLOC_HUGE_FIRST)); - ACL_CHECK(aclrtMalloc((void **)&v2Device, fileSize_v2, ACL_MEM_MALLOC_HUGE_FIRST)); - ACL_CHECK(aclrtMalloc((void **)&v3Device, fileSize_v3, ACL_MEM_MALLOC_HUGE_FIRST)); - ReadFile("./v1.bin", fileSize_v1, v1Host, fileSize_v1); - ReadFile("./v2.bin", fileSize_v2, v2Host, fileSize_v2); - ReadFile("./v3.bin", fileSize_v3, v3Host, fileSize_v3); - ACL_CHECK(aclrtMemcpy(v1Device, fileSize_v1, v1Host, fileSize_v1, - ACL_MEMCPY_HOST_TO_DEVICE)); 
- ACL_CHECK(aclrtMemcpy(v2Device, fileSize_v2, v2Host, fileSize_v2, - ACL_MEMCPY_HOST_TO_DEVICE)); - ACL_CHECK(aclrtMemcpy(v3Device, fileSize_v3, v3Host, fileSize_v3, - ACL_MEMCPY_HOST_TO_DEVICE)); - LaunchVadd_f16_kernel(v1Device, v2Device, v3Device, stream); - ACL_CHECK(aclrtSynchronizeStream(stream)); - ACL_CHECK(aclrtMemcpy(v3Host, fileSize_v3, v3Device, fileSize_v3, - ACL_MEMCPY_DEVICE_TO_HOST)); - WriteFile("./v3.bin", v3Host, fileSize_v3); - -cleanup: - aclrtFree(v1Device); - aclrtFree(v2Device); - aclrtFree(v3Device); - aclrtFreeHost(v1Host); - aclrtFreeHost(v2Host); - aclrtFreeHost(v3Host); - if (stream != nullptr) - aclrtDestroyStream(stream); - if (deviceSet) - aclrtResetDevice(deviceId); - if (aclInited) - aclFinalize(); - return rc; -} diff --git a/test/vpto/cases/micro-op/binary-vector/vadd-f32-exceptional/compare.py b/test/vpto/cases/micro-op/binary-vector/vadd-f32-exceptional/compare.py deleted file mode 100644 index a5f14dabc..000000000 --- a/test/vpto/cases/micro-op/binary-vector/vadd-f32-exceptional/compare.py +++ /dev/null @@ -1,37 +0,0 @@ -#!/usr/bin/python3 -# Copyright (c) 2026 Huawei Technologies Co., Ltd. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. 
- - -import os -import sys -import numpy as np - - -def compare_bin(golden_path, output_path, dtype, eps): - if not os.path.exists(golden_path) or not os.path.exists(output_path): - return False - golden = np.fromfile(golden_path, dtype=dtype) - output = np.fromfile(output_path, dtype=dtype) - return golden.shape == output.shape and np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True) - - -def main(): - strict = os.getenv("COMPARE_STRICT", "1") != "0" - ok = compare_bin("golden_v3.bin", "v3.bin", np.float32, 1e-4) - if not ok: - if strict: - print("[ERROR] compare failed") - sys.exit(2) - print("[WARN] compare failed (non-gating)") - return - print("[INFO] compare passed") - - -if __name__ == "__main__": - main() diff --git a/test/vpto/cases/micro-op/binary-vector/vadd-f32-exceptional/golden.py b/test/vpto/cases/micro-op/binary-vector/vadd-f32-exceptional/golden.py deleted file mode 100644 index 802880fdc..000000000 --- a/test/vpto/cases/micro-op/binary-vector/vadd-f32-exceptional/golden.py +++ /dev/null @@ -1,54 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) 2026 Huawei Technologies Co., Ltd. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. 
- -# coding=utf-8 - -import argparse -from pathlib import Path - -import numpy as np - - -ROWS = 32 -COLS = 32 -SEED = 19 - - -def generate(output_dir: Path, seed: int) -> None: - del seed - specials_a = np.array( - [-np.inf, -7.5, -0.0, 0.0, 1.0, np.inf, np.nan, 3.5], - dtype=np.float32, - ) - specials_b = np.array( - [np.inf, 2.5, 0.0, -0.0, -1.0, -np.inf, 1.0, np.nan], - dtype=np.float32, - ) - v1 = np.resize(specials_a, ROWS * COLS).reshape(ROWS, COLS).astype(np.float32) - v2 = np.resize(specials_b, ROWS * COLS).reshape(ROWS, COLS).astype(np.float32) - v3 = np.zeros((ROWS, COLS), dtype=np.float32) - golden_v3 = (v1 + v2).astype(np.float32, copy=False) - - output_dir.mkdir(parents=True, exist_ok=True) - v1.reshape(-1).tofile(output_dir / "v1.bin") - v2.reshape(-1).tofile(output_dir / "v2.bin") - v3.reshape(-1).tofile(output_dir / "v3.bin") - golden_v3.reshape(-1).tofile(output_dir / "golden_v3.bin") - - -def main() -> None: - parser = argparse.ArgumentParser() - parser.add_argument("--output-dir", type=Path, default=Path(".")) - parser.add_argument("--seed", type=int, default=SEED) - args = parser.parse_args() - generate(args.output_dir, args.seed) - - -if __name__ == "__main__": - main() diff --git a/test/vpto/cases/micro-op/binary-vector/vadd-f32-exceptional/kernel.pto b/test/vpto/cases/micro-op/binary-vector/vadd-f32-exceptional/kernel.pto deleted file mode 100644 index a3fbb8ef7..000000000 --- a/test/vpto/cases/micro-op/binary-vector/vadd-f32-exceptional/kernel.pto +++ /dev/null @@ -1,49 +0,0 @@ -module attributes {pto.target_arch = "a5", pto.kernel_kind = #pto.kernel_kind} { - func.func @vadd_f32_exceptional_kernel_2d(%arg0: !pto.ptr, %arg1: !pto.ptr, - %arg2: !pto.ptr) attributes {pto.kernel} { - %c0 = arith.constant 0 : index - %c64 = arith.constant 64 : index - %c1024 = arith.constant 1024 : index - %c0_i64 = arith.constant 0 : i64 - %c1_i64 = arith.constant 1 : i64 - %c32_i64 = arith.constant 32 : i64 - %c128_i64 = arith.constant 128 : i64 - %c4096_i64 
= arith.constant 4096 : i64 - %c8192_i64 = arith.constant 8192 : i64 - %c1024_i32 = arith.constant 1024 : i32 - - %ub_lhs = pto.castptr %c0_i64 : i64 -> !pto.ptr - %ub_rhs = pto.castptr %c4096_i64 : i64 -> !pto.ptr - %ub_out = pto.castptr %c8192_i64 : i64 -> !pto.ptr - - %false = arith.constant false - pto.mte_gm_ub %arg0, %ub_lhs, %c0_i64, %c128_i64 - nburst(%c32_i64, %c128_i64, %c128_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 - pto.mte_gm_ub %arg1, %ub_rhs, %c0_i64, %c128_i64 - nburst(%c32_i64, %c128_i64, %c128_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 - - pto.set_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] - pto.wait_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] - - pto.vecscope { - %_:1 = scf.for %offset = %c0 to %c1024 step %c64 iter_args(%remaining = %c1024_i32) -> (i32) { - %mask, %next_remaining = pto.plt_b32 %remaining : i32 -> !pto.mask, i32 - %lhs = pto.vlds %ub_lhs[%offset] : !pto.ptr -> !pto.vreg<64xf32> - %rhs = pto.vlds %ub_rhs[%offset] : !pto.ptr -> !pto.vreg<64xf32> - %sum = pto.vadd %lhs, %rhs, %mask : !pto.vreg<64xf32>, !pto.vreg<64xf32>, !pto.mask -> !pto.vreg<64xf32> - pto.vsts %sum, %ub_out[%offset], %mask : !pto.vreg<64xf32>, !pto.ptr, !pto.mask - scf.yield %next_remaining : i32 - } - } - - pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.mte_ub_gm %ub_out, %arg2, %c128_i64 - nburst(%c32_i64, %c128_i64, %c128_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64 - pto.barrier #pto.pipe - return - } -} diff --git a/test/vpto/cases/micro-op/binary-vector/vadd-f32-exceptional/launch.cpp b/test/vpto/cases/micro-op/binary-vector/vadd-f32-exceptional/launch.cpp deleted file mode 100644 index fbb0031f2..000000000 --- a/test/vpto/cases/micro-op/binary-vector/vadd-f32-exceptional/launch.cpp +++ /dev/null @@ -1,46 +0,0 @@ -// Copyright (c) 2026 Huawei Technologies Co., Ltd. 
-// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). -// Please refer to the License for details. You may not use this file except in compliance with the License. -// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -// See LICENSE in the root of the software repository for the full text of the License. - -#ifndef __VEC_SCOPE__ -#define __VEC_SCOPE__ -#endif - -#if defined(__CCE_AICORE__) && defined(__NPU_ARCH__) && (__NPU_ARCH__ == 2201) -typedef struct { unsigned char v; } hifloat8_t; -typedef struct { unsigned char v; } float8_e4m3_t; -typedef struct { unsigned char v; } float8_e5m2_t; -typedef struct { unsigned char v; } float8_e8m0_t; -typedef struct { unsigned char v; } float4_e1m2x2_t; -typedef struct { unsigned char v; } float4_e2m1x2_t; -#endif -#include -#if defined(__CCE_AICORE__) && defined(PTOAS_ENABLE_CCE_PRINT) -#include -#endif - -#if !defined(__CCE_AICORE__) && !defined(TMRGSORT_HPP) -struct MrgSortExecutedNumList { - uint16_t mrgSortList0; - uint16_t mrgSortList1; - uint16_t mrgSortList2; - uint16_t mrgSortList3; -}; -#endif -#ifndef __CPU_SIM -#include "acl/acl.h" -#endif - -extern "C" __global__ [aicore] void vadd_f32_exceptional_kernel_2d(__gm__ float *v1, - __gm__ float *v2, - __gm__ float *v3); - -void LaunchVadd_f32_exceptional_kernel_2d(float *v1, float *v2, float *v3, void *stream) { - vadd_f32_exceptional_kernel_2d<<<1, nullptr, stream>>>((__gm__ float *)v1, - (__gm__ float *)v2, - (__gm__ float *)v3); -} diff --git a/test/vpto/cases/micro-op/binary-vector/vadd-f32-exceptional/main.cpp b/test/vpto/cases/micro-op/binary-vector/vadd-f32-exceptional/main.cpp deleted file mode 100644 index 781e0d000..000000000 --- 
a/test/vpto/cases/micro-op/binary-vector/vadd-f32-exceptional/main.cpp +++ /dev/null @@ -1,95 +0,0 @@ -// Copyright (c) 2026 Huawei Technologies Co., Ltd. -// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). -// Please refer to the License for details. You may not use this file except in compliance with the License. -// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -// See LICENSE in the root of the software repository for the full text of the License. - -/** -Copyright (c) 2025 Huawei Technologies Co., Ltd. -*/ - -#include "test_common.h" -#include "acl/acl.h" -#include -#include - -using namespace PtoTestCommon; - -#define ACL_CHECK(expr) \ - do { \ - const aclError _ret = (expr); \ - if (_ret != ACL_SUCCESS) { \ - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", #expr, \ - (int)_ret, __FILE__, __LINE__); \ - rc = 1; \ - goto cleanup; \ - } \ - } while (0) - -void LaunchVadd_f32_exceptional_kernel_2d(float *v1, float *v2, float *v3, - void *stream); - -int main() { - size_t elemCount_v1 = 1024; - size_t fileSize_v1 = elemCount_v1 * sizeof(float); - size_t elemCount_v2 = 1024; - size_t fileSize_v2 = elemCount_v2 * sizeof(float); - size_t elemCount_v3 = 1024; - size_t fileSize_v3 = elemCount_v3 * sizeof(float); - float *v1Host = nullptr; - float *v1Device = nullptr; - float *v2Host = nullptr; - float *v2Device = nullptr; - float *v3Host = nullptr; - float *v3Device = nullptr; - int rc = 0; - bool aclInited = false; - bool deviceSet = false; - int deviceId = 0; - aclrtStream stream = nullptr; - - ACL_CHECK(aclInit(nullptr)); - aclInited = true; - if (const char *envDevice = std::getenv("ACL_DEVICE_ID")) - deviceId = std::atoi(envDevice); - 
ACL_CHECK(aclrtSetDevice(deviceId)); - deviceSet = true; - ACL_CHECK(aclrtCreateStream(&stream)); - ACL_CHECK(aclrtMallocHost((void **)(&v1Host), fileSize_v1)); - ACL_CHECK(aclrtMallocHost((void **)(&v2Host), fileSize_v2)); - ACL_CHECK(aclrtMallocHost((void **)(&v3Host), fileSize_v3)); - ACL_CHECK(aclrtMalloc((void **)&v1Device, fileSize_v1, ACL_MEM_MALLOC_HUGE_FIRST)); - ACL_CHECK(aclrtMalloc((void **)&v2Device, fileSize_v2, ACL_MEM_MALLOC_HUGE_FIRST)); - ACL_CHECK(aclrtMalloc((void **)&v3Device, fileSize_v3, ACL_MEM_MALLOC_HUGE_FIRST)); - ReadFile("./v1.bin", fileSize_v1, v1Host, fileSize_v1); - ReadFile("./v2.bin", fileSize_v2, v2Host, fileSize_v2); - ReadFile("./v3.bin", fileSize_v3, v3Host, fileSize_v3); - ACL_CHECK(aclrtMemcpy(v1Device, fileSize_v1, v1Host, fileSize_v1, - ACL_MEMCPY_HOST_TO_DEVICE)); - ACL_CHECK(aclrtMemcpy(v2Device, fileSize_v2, v2Host, fileSize_v2, - ACL_MEMCPY_HOST_TO_DEVICE)); - ACL_CHECK(aclrtMemcpy(v3Device, fileSize_v3, v3Host, fileSize_v3, - ACL_MEMCPY_HOST_TO_DEVICE)); - LaunchVadd_f32_exceptional_kernel_2d(v1Device, v2Device, v3Device, stream); - ACL_CHECK(aclrtSynchronizeStream(stream)); - ACL_CHECK(aclrtMemcpy(v3Host, fileSize_v3, v3Device, fileSize_v3, - ACL_MEMCPY_DEVICE_TO_HOST)); - WriteFile("./v3.bin", v3Host, fileSize_v3); - -cleanup: - aclrtFree(v1Device); - aclrtFree(v2Device); - aclrtFree(v3Device); - aclrtFreeHost(v1Host); - aclrtFreeHost(v2Host); - aclrtFreeHost(v3Host); - if (stream != nullptr) - aclrtDestroyStream(stream); - if (deviceSet) - aclrtResetDevice(deviceId); - if (aclInited) - aclFinalize(); - return rc; -} diff --git a/test/vpto/cases/micro-op/binary-vector/vadd-i16-signed-overflow/compare.py b/test/vpto/cases/micro-op/binary-vector/vadd-i16-signed-overflow/compare.py deleted file mode 100644 index fe6bc69c3..000000000 --- a/test/vpto/cases/micro-op/binary-vector/vadd-i16-signed-overflow/compare.py +++ /dev/null @@ -1,42 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) 2026 Huawei Technologies Co., Ltd. 
-# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. - -# case: micro-op/binary-vector/vadd-i16-signed-overflow -# family: binary-vector -# target_ops: pto.vadd -# scenarios: core-i16-signed, full-mask, integer-overflow - -import os -import sys - -import numpy as np - - -def compare_bin(golden_path: str, output_path: str, dtype) -> bool: - if not os.path.exists(golden_path) or not os.path.exists(output_path): - return False - golden = np.fromfile(golden_path, dtype=dtype) - output = np.fromfile(output_path, dtype=dtype) - return golden.shape == output.shape and np.array_equal(golden, output) - - -def main() -> None: - strict = os.getenv("COMPARE_STRICT", "1") != "0" - ok = compare_bin("golden_v3.bin", "v3.bin", np.int16) - if not ok: - if strict: - print("[ERROR] compare failed") - sys.exit(2) - print("[WARN] compare failed (non-gating)") - return - print("[INFO] compare passed") - - -if __name__ == "__main__": - main() diff --git a/test/vpto/cases/micro-op/binary-vector/vadd-i16-signed-overflow/golden.py b/test/vpto/cases/micro-op/binary-vector/vadd-i16-signed-overflow/golden.py deleted file mode 100644 index 960e6d163..000000000 --- a/test/vpto/cases/micro-op/binary-vector/vadd-i16-signed-overflow/golden.py +++ /dev/null @@ -1,62 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) 2026 Huawei Technologies Co., Ltd. 
-# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. - -# case: micro-op/binary-vector/vadd-i16-signed-overflow -# family: binary-vector -# target_ops: pto.vadd -# scenarios: core-i16-signed, full-mask, integer-overflow - -import argparse -from pathlib import Path - -import numpy as np - - -ELEMS = 1024 -SEED = 19 - - -def wrap_add_i16(lhs: np.ndarray, rhs: np.ndarray) -> np.ndarray: - bits = lhs.view(np.uint16).astype(np.uint32) + rhs.view(np.uint16).astype(np.uint32) - return (bits & 0xFFFF).astype(np.uint16).view(np.int16) - - -def generate(output_dir: Path, seed: int) -> None: - del seed - lhs_pattern = np.array( - [32767, 32760, -32768, -32760, 1000, -1000, 12345, -12345], - dtype=np.int16, - ) - rhs_pattern = np.array( - [1, 100, -1, -100, 30000, -30000, 23456, -23456], - dtype=np.int16, - ) - repeats = ELEMS // lhs_pattern.size - v1 = np.tile(lhs_pattern, repeats) - v2 = np.tile(rhs_pattern, repeats) - v3 = np.zeros(ELEMS, dtype=np.int16) - golden_v3 = wrap_add_i16(v1, v2) - - output_dir.mkdir(parents=True, exist_ok=True) - v1.tofile(output_dir / "v1.bin") - v2.tofile(output_dir / "v2.bin") - v3.tofile(output_dir / "v3.bin") - golden_v3.tofile(output_dir / "golden_v3.bin") - - -def main() -> None: - parser = argparse.ArgumentParser() - parser.add_argument("--output-dir", type=Path, default=Path(".")) - parser.add_argument("--seed", type=int, default=SEED) - args = parser.parse_args() - generate(args.output_dir, args.seed) - - -if 
__name__ == "__main__": - main() diff --git a/test/vpto/cases/micro-op/binary-vector/vadd-i16-signed-overflow/kernel.pto b/test/vpto/cases/micro-op/binary-vector/vadd-i16-signed-overflow/kernel.pto deleted file mode 100644 index dafac0a22..000000000 --- a/test/vpto/cases/micro-op/binary-vector/vadd-i16-signed-overflow/kernel.pto +++ /dev/null @@ -1,56 +0,0 @@ -// ----------------------------------------------------------------------------- -// case: micro-op/binary-vector/vadd-i16-signed-overflow -// family: binary-vector -// target_ops: pto.vadd -// scenarios: core-i16-signed, full-mask, integer-overflow -// ----------------------------------------------------------------------------- -module attributes {pto.target_arch = "a5", pto.kernel_kind = #pto.kernel_kind} { - func.func @vadd_i16_signed_overflow_kernel(%arg0: !pto.ptr, - %arg1: !pto.ptr, - %arg2: !pto.ptr) attributes {pto.kernel} { - %c0 = arith.constant 0 : index - %c128 = arith.constant 128 : index - %c1024 = arith.constant 1024 : index - %c0_i64 = arith.constant 0 : i64 - %c1_i64 = arith.constant 1 : i64 - %c32_i64 = arith.constant 32 : i64 - %c64_i64 = arith.constant 64 : i64 - %c2048_i64 = arith.constant 2048 : i64 - %c4096_i64 = arith.constant 4096 : i64 - %c1024_i32 = arith.constant 1024 : i32 - - %ub_lhs = pto.castptr %c0_i64 : i64 -> !pto.ptr - %ub_rhs = pto.castptr %c2048_i64 : i64 -> !pto.ptr - %ub_out = pto.castptr %c4096_i64 : i64 -> !pto.ptr - - %false = arith.constant false - pto.mte_gm_ub %arg0, %ub_lhs, %c0_i64, %c64_i64 - nburst(%c32_i64, %c64_i64, %c64_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 - pto.mte_gm_ub %arg1, %ub_rhs, %c0_i64, %c64_i64 - nburst(%c32_i64, %c64_i64, %c64_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 - - pto.set_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] - pto.wait_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] - - pto.vecscope { - %_:1 = scf.for %offset = %c0 to %c1024 step %c128 iter_args(%remaining = %c1024_i32) -> (i32) { - %mask, %next_remaining = 
pto.plt_b16 %remaining : i32 -> !pto.mask, i32 - %lhs = pto.vlds %ub_lhs[%offset] : !pto.ptr -> !pto.vreg<128xi16> - %rhs = pto.vlds %ub_rhs[%offset] : !pto.ptr -> !pto.vreg<128xi16> - %sum = pto.vadd %lhs, %rhs, %mask : !pto.vreg<128xi16>, !pto.vreg<128xi16>, !pto.mask -> !pto.vreg<128xi16> - pto.vsts %sum, %ub_out[%offset], %mask : !pto.vreg<128xi16>, !pto.ptr, !pto.mask - scf.yield %next_remaining : i32 - } - } - - pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.mte_ub_gm %ub_out, %arg2, %c64_i64 - nburst(%c32_i64, %c64_i64, %c64_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64 - pto.barrier #pto.pipe - return - } -} diff --git a/test/vpto/cases/micro-op/binary-vector/vadd-i16-signed-overflow/launch.cpp b/test/vpto/cases/micro-op/binary-vector/vadd-i16-signed-overflow/launch.cpp deleted file mode 100644 index 7e3c8bb76..000000000 --- a/test/vpto/cases/micro-op/binary-vector/vadd-i16-signed-overflow/launch.cpp +++ /dev/null @@ -1,47 +0,0 @@ -// Copyright (c) 2026 Huawei Technologies Co., Ltd. -// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). -// Please refer to the License for details. You may not use this file except in compliance with the License. -// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -// See LICENSE in the root of the software repository for the full text of the License. 
- -#ifndef __VEC_SCOPE__ -#define __VEC_SCOPE__ -#endif - -#if defined(__CCE_AICORE__) && defined(__NPU_ARCH__) && (__NPU_ARCH__ == 2201) -typedef struct { unsigned char v; } hifloat8_t; -typedef struct { unsigned char v; } float8_e4m3_t; -typedef struct { unsigned char v; } float8_e5m2_t; -typedef struct { unsigned char v; } float8_e8m0_t; -typedef struct { unsigned char v; } float4_e1m2x2_t; -typedef struct { unsigned char v; } float4_e2m1x2_t; -#endif -#include -#if defined(__CCE_AICORE__) && defined(PTOAS_ENABLE_CCE_PRINT) -#include -#endif - -#if !defined(__CCE_AICORE__) && !defined(TMRGSORT_HPP) -struct MrgSortExecutedNumList { - uint16_t mrgSortList0; - uint16_t mrgSortList1; - uint16_t mrgSortList2; - uint16_t mrgSortList3; -}; -#endif -#ifndef __CPU_SIM -#include "acl/acl.h" -#endif - -extern "C" __global__ [aicore] void vadd_i16_signed_overflow_kernel(__gm__ int16_t *v1, - __gm__ int16_t *v2, - __gm__ int16_t *v3); - -void LaunchVadd_i16_signed_overflow_kernel(int16_t *v1, int16_t *v2, int16_t *v3, - void *stream) { - vadd_i16_signed_overflow_kernel<<<1, nullptr, stream>>>((__gm__ int16_t *)v1, - (__gm__ int16_t *)v2, - (__gm__ int16_t *)v3); -} diff --git a/test/vpto/cases/micro-op/binary-vector/vadd-i16-signed-overflow/main.cpp b/test/vpto/cases/micro-op/binary-vector/vadd-i16-signed-overflow/main.cpp deleted file mode 100644 index 26f3895f8..000000000 --- a/test/vpto/cases/micro-op/binary-vector/vadd-i16-signed-overflow/main.cpp +++ /dev/null @@ -1,92 +0,0 @@ -// Copyright (c) 2026 Huawei Technologies Co., Ltd. -// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). -// Please refer to the License for details. You may not use this file except in compliance with the License. 
-// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -// See LICENSE in the root of the software repository for the full text of the License. - -#include "test_common.h" -#include "acl/acl.h" -#include -#include -#include - -using namespace PtoTestCommon; - -#define ACL_CHECK(expr) \ - do { \ - const aclError _ret = (expr); \ - if (_ret != ACL_SUCCESS) { \ - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", #expr, \ - (int)_ret, __FILE__, __LINE__); \ - rc = 1; \ - goto cleanup; \ - } \ - } while (0) - -void LaunchVadd_i16_signed_overflow_kernel(int16_t *v1, int16_t *v2, int16_t *v3, - void *stream); - -int main() { - size_t elemCount_v1 = 1024; - size_t fileSize_v1 = elemCount_v1 * sizeof(int16_t); - size_t elemCount_v2 = 1024; - size_t fileSize_v2 = elemCount_v2 * sizeof(int16_t); - size_t elemCount_v3 = 1024; - size_t fileSize_v3 = elemCount_v3 * sizeof(int16_t); - int16_t *v1Host = nullptr; - int16_t *v1Device = nullptr; - int16_t *v2Host = nullptr; - int16_t *v2Device = nullptr; - int16_t *v3Host = nullptr; - int16_t *v3Device = nullptr; - int rc = 0; - bool aclInited = false; - bool deviceSet = false; - int deviceId = 0; - aclrtStream stream = nullptr; - - ACL_CHECK(aclInit(nullptr)); - aclInited = true; - if (const char *envDevice = std::getenv("ACL_DEVICE_ID")) - deviceId = std::atoi(envDevice); - ACL_CHECK(aclrtSetDevice(deviceId)); - deviceSet = true; - ACL_CHECK(aclrtCreateStream(&stream)); - ACL_CHECK(aclrtMallocHost((void **)(&v1Host), fileSize_v1)); - ACL_CHECK(aclrtMallocHost((void **)(&v2Host), fileSize_v2)); - ACL_CHECK(aclrtMallocHost((void **)(&v3Host), fileSize_v3)); - ACL_CHECK(aclrtMalloc((void **)&v1Device, fileSize_v1, ACL_MEM_MALLOC_HUGE_FIRST)); - ACL_CHECK(aclrtMalloc((void **)&v2Device, fileSize_v2, ACL_MEM_MALLOC_HUGE_FIRST)); - ACL_CHECK(aclrtMalloc((void **)&v3Device, 
fileSize_v3, ACL_MEM_MALLOC_HUGE_FIRST)); - ReadFile("./v1.bin", fileSize_v1, v1Host, fileSize_v1); - ReadFile("./v2.bin", fileSize_v2, v2Host, fileSize_v2); - ReadFile("./v3.bin", fileSize_v3, v3Host, fileSize_v3); - ACL_CHECK(aclrtMemcpy(v1Device, fileSize_v1, v1Host, fileSize_v1, - ACL_MEMCPY_HOST_TO_DEVICE)); - ACL_CHECK(aclrtMemcpy(v2Device, fileSize_v2, v2Host, fileSize_v2, - ACL_MEMCPY_HOST_TO_DEVICE)); - ACL_CHECK(aclrtMemcpy(v3Device, fileSize_v3, v3Host, fileSize_v3, - ACL_MEMCPY_HOST_TO_DEVICE)); - LaunchVadd_i16_signed_overflow_kernel(v1Device, v2Device, v3Device, stream); - ACL_CHECK(aclrtSynchronizeStream(stream)); - ACL_CHECK(aclrtMemcpy(v3Host, fileSize_v3, v3Device, fileSize_v3, - ACL_MEMCPY_DEVICE_TO_HOST)); - WriteFile("./v3.bin", v3Host, fileSize_v3); - -cleanup: - aclrtFree(v1Device); - aclrtFree(v2Device); - aclrtFree(v3Device); - aclrtFreeHost(v1Host); - aclrtFreeHost(v2Host); - aclrtFreeHost(v3Host); - if (stream != nullptr) - aclrtDestroyStream(stream); - if (deviceSet) - aclrtResetDevice(deviceId); - if (aclInited) - aclFinalize(); - return rc; -} diff --git a/test/vpto/cases/micro-op/binary-vector/vadd-i16-signed/compare.py b/test/vpto/cases/micro-op/binary-vector/vadd-i16-signed/compare.py deleted file mode 100755 index 2f49f90a6..000000000 --- a/test/vpto/cases/micro-op/binary-vector/vadd-i16-signed/compare.py +++ /dev/null @@ -1,42 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) 2026 Huawei Technologies Co., Ltd. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. 
-# See LICENSE in the root of the software repository for the full text of the License. - -# case: micro-op/binary-vector/vadd-i16-signed -# family: binary-vector -# target_ops: pto.vadd -# scenarios: core-i16-signed, full-mask -import os -import sys -import numpy as np - - -def compare_bin_prefix(golden_path, output_path, dtype, eps, count): - if not os.path.exists(golden_path) or not os.path.exists(output_path): - return False - golden = np.fromfile(golden_path, dtype=dtype, count=count) - output = np.fromfile(output_path, dtype=dtype, count=count) - return golden.shape == output.shape and np.allclose( - golden, output, atol=eps, rtol=eps, equal_nan=True - ) - - -def main(): - strict = os.getenv("COMPARE_STRICT", "1") != "0" - ok = compare_bin_prefix("golden_v3.bin", "v3.bin", np.int16, 0, 1024) - if not ok: - if strict: - print("[ERROR] compare failed") - sys.exit(2) - print("[WARN] compare failed (non-gating)") - return - print("[INFO] compare passed") - - -if __name__ == "__main__": - main() diff --git a/test/vpto/cases/micro-op/binary-vector/vadd-i16-signed/golden.py b/test/vpto/cases/micro-op/binary-vector/vadd-i16-signed/golden.py deleted file mode 100755 index 38079c47f..000000000 --- a/test/vpto/cases/micro-op/binary-vector/vadd-i16-signed/golden.py +++ /dev/null @@ -1,48 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) 2026 Huawei Technologies Co., Ltd. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. 
- -# case: micro-op/binary-vector/vadd-i16-signed -# family: binary-vector -# target_ops: pto.vadd -# scenarios: core-i16-signed, full-mask -# coding=utf-8 - -import argparse -from pathlib import Path - -import numpy as np - - -ROWS = 32 -COLS = 32 -SEED = 19 -def generate(output_dir: Path, seed: int) -> None: - rng = np.random.default_rng(seed) - v1 = rng.integers(-1000, 1001, size=(ROWS, COLS), dtype=np.int16) - v2 = rng.integers(-1000, 1001, size=(ROWS, COLS), dtype=np.int16) - v3 = np.zeros((ROWS, COLS), dtype=np.int16) - golden_v3 = (v1.astype(np.int32) + v2.astype(np.int32)).astype(np.int16) - - output_dir.mkdir(parents=True, exist_ok=True) - v1.reshape(-1).tofile(output_dir / "v1.bin") - v2.reshape(-1).tofile(output_dir / "v2.bin") - v3.reshape(-1).tofile(output_dir / "v3.bin") - golden_v3.reshape(-1).tofile(output_dir / "golden_v3.bin") - - -def main() -> None: - parser = argparse.ArgumentParser() - parser.add_argument("--output-dir", type=Path, default=Path(".")) - parser.add_argument("--seed", type=int, default=SEED) - args = parser.parse_args() - generate(args.output_dir, args.seed) - - -if __name__ == "__main__": - main() diff --git a/test/vpto/cases/micro-op/binary-vector/vadd-i16-signed/kernel.pto b/test/vpto/cases/micro-op/binary-vector/vadd-i16-signed/kernel.pto deleted file mode 100644 index 83ecf0cfa..000000000 --- a/test/vpto/cases/micro-op/binary-vector/vadd-i16-signed/kernel.pto +++ /dev/null @@ -1,53 +0,0 @@ -// ----------------------------------------------------------------------------- -// case: micro-op/binary-vector/vadd-i16-signed -// family: binary-vector -// target_ops: pto.vadd -// scenarios: core-i16-signed, full-mask -// ----------------------------------------------------------------------------- -module attributes {pto.target_arch = "a5", pto.kernel_kind = #pto.kernel_kind} { - func.func @vadd_i16_signed_kernel(%arg0: !pto.ptr, %arg1: !pto.ptr, - %arg2: !pto.ptr) attributes {pto.kernel} { - %c0 = arith.constant 0 : index - %c1024 
= arith.constant 1024 : index - %c128 = arith.constant 128 : index - %c0_i64 = arith.constant 0 : i64 - %c1_i64 = arith.constant 1 : i64 - %c32_i64 = arith.constant 32 : i64 - %c64_i64 = arith.constant 64 : i64 - %c2048_i64 = arith.constant 2048 : i64 - %c4096_i64 = arith.constant 4096 : i64 - - %ub_lhs = pto.castptr %c0_i64 : i64 -> !pto.ptr - %ub_rhs = pto.castptr %c2048_i64 : i64 -> !pto.ptr - %ub_out = pto.castptr %c4096_i64 : i64 -> !pto.ptr - - %false = arith.constant false - pto.mte_gm_ub %arg0, %ub_lhs, %c0_i64, %c64_i64 - nburst(%c32_i64, %c64_i64, %c64_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 - pto.mte_gm_ub %arg1, %ub_rhs, %c0_i64, %c64_i64 - nburst(%c32_i64, %c64_i64, %c64_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 - - pto.set_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] - pto.wait_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] - - pto.vecscope { - %mask = pto.pset_b16 "PAT_ALL" : !pto.mask - scf.for %offset = %c0 to %c1024 step %c128 { - %lhs = pto.vlds %ub_lhs[%offset] : !pto.ptr -> !pto.vreg<128xi16> - %rhs = pto.vlds %ub_rhs[%offset] : !pto.ptr -> !pto.vreg<128xi16> - %sum = pto.vadd %lhs, %rhs, %mask : !pto.vreg<128xi16>, !pto.vreg<128xi16>, !pto.mask -> !pto.vreg<128xi16> - pto.vsts %sum, %ub_out[%offset], %mask : !pto.vreg<128xi16>, !pto.ptr, !pto.mask - } - } - - pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.mte_ub_gm %ub_out, %arg2, %c64_i64 - nburst(%c32_i64, %c64_i64, %c64_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64 - pto.barrier #pto.pipe - return - } -} diff --git a/test/vpto/cases/micro-op/binary-vector/vadd-i16-signed/launch.cpp b/test/vpto/cases/micro-op/binary-vector/vadd-i16-signed/launch.cpp deleted file mode 100644 index 5f2e4f059..000000000 --- a/test/vpto/cases/micro-op/binary-vector/vadd-i16-signed/launch.cpp +++ /dev/null @@ -1,53 +0,0 @@ -// Copyright (c) 2026 Huawei Technologies Co., Ltd. 
-// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). -// Please refer to the License for details. You may not use this file except in compliance with the License. -// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -// See LICENSE in the root of the software repository for the full text of the License. - -// ----------------------------------------------------------------------------- -// case: micro-op/binary-vector/vadd-i16-signed -// family: binary-vector -// target_ops: pto.vadd -// scenarios: core-i16-signed, full-mask -// ----------------------------------------------------------------------------- -#ifndef __VEC_SCOPE__ -#define __VEC_SCOPE__ -#endif - -#if defined(__CCE_AICORE__) && defined(__NPU_ARCH__) && (__NPU_ARCH__ == 2201) -typedef struct { unsigned char v; } hifloat8_t; -typedef struct { unsigned char v; } float8_e4m3_t; -typedef struct { unsigned char v; } float8_e5m2_t; -typedef struct { unsigned char v; } float8_e8m0_t; -typedef struct { unsigned char v; } float4_e1m2x2_t; -typedef struct { unsigned char v; } float4_e2m1x2_t; -#endif -#include -#if defined(__CCE_AICORE__) && defined(PTOAS_ENABLE_CCE_PRINT) -#include -#endif - -#if !defined(__CCE_AICORE__) && !defined(TMRGSORT_HPP) -struct MrgSortExecutedNumList { - uint16_t mrgSortList0; - uint16_t mrgSortList1; - uint16_t mrgSortList2; - uint16_t mrgSortList3; -}; -#endif -#ifndef __CPU_SIM -#include "acl/acl.h" -#endif - -extern "C" __global__ [aicore] void vadd_i16_signed_kernel(__gm__ int16_t *v1, - __gm__ int16_t *v2, - __gm__ int16_t *v3); - -void LaunchVadd_i16_signed_kernel(int16_t *v1, int16_t *v2, int16_t *v3, - void *stream) { - vadd_i16_signed_kernel<<<1, nullptr, stream>>>((__gm__ int16_t *)v1, - 
(__gm__ int16_t *)v2, - (__gm__ int16_t *)v3); -} diff --git a/test/vpto/cases/micro-op/binary-vector/vadd-i16-signed/main.cpp b/test/vpto/cases/micro-op/binary-vector/vadd-i16-signed/main.cpp deleted file mode 100644 index 4d22c989d..000000000 --- a/test/vpto/cases/micro-op/binary-vector/vadd-i16-signed/main.cpp +++ /dev/null @@ -1,102 +0,0 @@ -// Copyright (c) 2026 Huawei Technologies Co., Ltd. -// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). -// Please refer to the License for details. You may not use this file except in compliance with the License. -// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -// See LICENSE in the root of the software repository for the full text of the License. - -// ----------------------------------------------------------------------------- -// case: micro-op/binary-vector/vadd-i16-signed -// family: binary-vector -// target_ops: pto.vadd -// scenarios: core-i16-signed, full-mask -// ----------------------------------------------------------------------------- -/** -Copyright (c) 2025 Huawei Technologies Co., Ltd. 
-*/ - -#include "test_common.h" -#include "acl/acl.h" -#include -#include -#include - -using namespace PtoTestCommon; - -#define ACL_CHECK(expr) \ - do { \ - const aclError _ret = (expr); \ - if (_ret != ACL_SUCCESS) { \ - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", #expr, \ - (int)_ret, __FILE__, __LINE__); \ - rc = 1; \ - goto cleanup; \ - } \ - } while (0) - -void LaunchVadd_i16_signed_kernel(int16_t *v1, int16_t *v2, int16_t *v3, - void *stream); - -int main() { - size_t elemCount_v1 = 1024; - size_t fileSize_v1 = elemCount_v1 * sizeof(int16_t); - size_t elemCount_v2 = 1024; - size_t fileSize_v2 = elemCount_v2 * sizeof(int16_t); - size_t elemCount_v3 = 1024; - size_t fileSize_v3 = elemCount_v3 * sizeof(int16_t); - int16_t *v1Host = nullptr; - int16_t *v1Device = nullptr; - int16_t *v2Host = nullptr; - int16_t *v2Device = nullptr; - int16_t *v3Host = nullptr; - int16_t *v3Device = nullptr; - int rc = 0; - bool aclInited = false; - bool deviceSet = false; - int deviceId = 0; - aclrtStream stream = nullptr; - - ACL_CHECK(aclInit(nullptr)); - aclInited = true; - if (const char *envDevice = std::getenv("ACL_DEVICE_ID")) - deviceId = std::atoi(envDevice); - ACL_CHECK(aclrtSetDevice(deviceId)); - deviceSet = true; - ACL_CHECK(aclrtCreateStream(&stream)); - ACL_CHECK(aclrtMallocHost((void **)(&v1Host), fileSize_v1)); - ACL_CHECK(aclrtMallocHost((void **)(&v2Host), fileSize_v2)); - ACL_CHECK(aclrtMallocHost((void **)(&v3Host), fileSize_v3)); - ACL_CHECK(aclrtMalloc((void **)&v1Device, fileSize_v1, ACL_MEM_MALLOC_HUGE_FIRST)); - ACL_CHECK(aclrtMalloc((void **)&v2Device, fileSize_v2, ACL_MEM_MALLOC_HUGE_FIRST)); - ACL_CHECK(aclrtMalloc((void **)&v3Device, fileSize_v3, ACL_MEM_MALLOC_HUGE_FIRST)); - ReadFile("./v1.bin", fileSize_v1, v1Host, fileSize_v1); - ReadFile("./v2.bin", fileSize_v2, v2Host, fileSize_v2); - ReadFile("./v3.bin", fileSize_v3, v3Host, fileSize_v3); - ACL_CHECK(aclrtMemcpy(v1Device, fileSize_v1, v1Host, fileSize_v1, - 
ACL_MEMCPY_HOST_TO_DEVICE)); - ACL_CHECK(aclrtMemcpy(v2Device, fileSize_v2, v2Host, fileSize_v2, - ACL_MEMCPY_HOST_TO_DEVICE)); - ACL_CHECK(aclrtMemcpy(v3Device, fileSize_v3, v3Host, fileSize_v3, - ACL_MEMCPY_HOST_TO_DEVICE)); - LaunchVadd_i16_signed_kernel(v1Device, v2Device, v3Device, stream); - ACL_CHECK(aclrtSynchronizeStream(stream)); - ACL_CHECK(aclrtMemcpy(v3Host, fileSize_v3, v3Device, fileSize_v3, - ACL_MEMCPY_DEVICE_TO_HOST)); - WriteFile("./v3.bin", v3Host, fileSize_v3); - -cleanup: - aclrtFree(v1Device); - aclrtFree(v2Device); - aclrtFree(v3Device); - aclrtFreeHost(v1Host); - aclrtFreeHost(v2Host); - aclrtFreeHost(v3Host); - if (stream != nullptr) - aclrtDestroyStream(stream); - if (deviceSet) - aclrtResetDevice(deviceId); - if (aclInited) - aclFinalize(); - return rc; -} diff --git a/test/vpto/cases/micro-op/binary-vector/vadd-i16-unsigned-overflow/compare.py b/test/vpto/cases/micro-op/binary-vector/vadd-i16-unsigned-overflow/compare.py deleted file mode 100644 index 4e992a275..000000000 --- a/test/vpto/cases/micro-op/binary-vector/vadd-i16-unsigned-overflow/compare.py +++ /dev/null @@ -1,42 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) 2026 Huawei Technologies Co., Ltd. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. 
- -# case: micro-op/binary-vector/vadd-i16-unsigned-overflow -# family: binary-vector -# target_ops: pto.vadd -# scenarios: core-i16-unsigned, full-mask, integer-overflow - -import os -import sys - -import numpy as np - - -def compare_bin(golden_path: str, output_path: str, dtype) -> bool: - if not os.path.exists(golden_path) or not os.path.exists(output_path): - return False - golden = np.fromfile(golden_path, dtype=dtype) - output = np.fromfile(output_path, dtype=dtype) - return golden.shape == output.shape and np.array_equal(golden, output) - - -def main() -> None: - strict = os.getenv("COMPARE_STRICT", "1") != "0" - ok = compare_bin("golden_v3.bin", "v3.bin", np.uint16) - if not ok: - if strict: - print("[ERROR] compare failed") - sys.exit(2) - print("[WARN] compare failed (non-gating)") - return - print("[INFO] compare passed") - - -if __name__ == "__main__": - main() diff --git a/test/vpto/cases/micro-op/binary-vector/vadd-i16-unsigned-overflow/golden.py b/test/vpto/cases/micro-op/binary-vector/vadd-i16-unsigned-overflow/golden.py deleted file mode 100644 index 4673e2501..000000000 --- a/test/vpto/cases/micro-op/binary-vector/vadd-i16-unsigned-overflow/golden.py +++ /dev/null @@ -1,62 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) 2026 Huawei Technologies Co., Ltd. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. 
- -# case: micro-op/binary-vector/vadd-i16-unsigned-overflow -# family: binary-vector -# target_ops: pto.vadd -# scenarios: core-i16-unsigned, full-mask, integer-overflow - -import argparse -from pathlib import Path - -import numpy as np - - -ELEMS = 1024 -SEED = 19 - - -def wrap_add_u16(lhs: np.ndarray, rhs: np.ndarray) -> np.ndarray: - wide = lhs.astype(np.uint32) + rhs.astype(np.uint32) - return (wide & 0xFFFF).astype(np.uint16) - - -def generate(output_dir: Path, seed: int) -> None: - del seed - lhs_pattern = np.array( - [65535, 65530, 65500, 60000, 100, 0, 32768, 12345], - dtype=np.uint16, - ) - rhs_pattern = np.array( - [1, 10, 1000, 10000, 65535, 5, 40000, 60000], - dtype=np.uint16, - ) - repeats = ELEMS // lhs_pattern.size - v1 = np.tile(lhs_pattern, repeats) - v2 = np.tile(rhs_pattern, repeats) - v3 = np.zeros(ELEMS, dtype=np.uint16) - golden_v3 = wrap_add_u16(v1, v2) - - output_dir.mkdir(parents=True, exist_ok=True) - v1.tofile(output_dir / "v1.bin") - v2.tofile(output_dir / "v2.bin") - v3.tofile(output_dir / "v3.bin") - golden_v3.tofile(output_dir / "golden_v3.bin") - - -def main() -> None: - parser = argparse.ArgumentParser() - parser.add_argument("--output-dir", type=Path, default=Path(".")) - parser.add_argument("--seed", type=int, default=SEED) - args = parser.parse_args() - generate(args.output_dir, args.seed) - - -if __name__ == "__main__": - main() diff --git a/test/vpto/cases/micro-op/binary-vector/vadd-i16-unsigned-overflow/kernel.pto b/test/vpto/cases/micro-op/binary-vector/vadd-i16-unsigned-overflow/kernel.pto deleted file mode 100644 index 9888837f9..000000000 --- a/test/vpto/cases/micro-op/binary-vector/vadd-i16-unsigned-overflow/kernel.pto +++ /dev/null @@ -1,56 +0,0 @@ -// ----------------------------------------------------------------------------- -// case: micro-op/binary-vector/vadd-i16-unsigned-overflow -// family: binary-vector -// target_ops: pto.vadd -// scenarios: core-i16-unsigned, full-mask, integer-overflow -// 
----------------------------------------------------------------------------- -module attributes {pto.target_arch = "a5", pto.kernel_kind = #pto.kernel_kind} { - func.func @vadd_i16_unsigned_overflow_kernel(%arg0: !pto.ptr, - %arg1: !pto.ptr, - %arg2: !pto.ptr) attributes {pto.kernel} { - %c0 = arith.constant 0 : index - %c128 = arith.constant 128 : index - %c1024 = arith.constant 1024 : index - %c0_i64 = arith.constant 0 : i64 - %c1_i64 = arith.constant 1 : i64 - %c32_i64 = arith.constant 32 : i64 - %c64_i64 = arith.constant 64 : i64 - %c2048_i64 = arith.constant 2048 : i64 - %c4096_i64 = arith.constant 4096 : i64 - %c1024_i32 = arith.constant 1024 : i32 - - %ub_lhs = pto.castptr %c0_i64 : i64 -> !pto.ptr - %ub_rhs = pto.castptr %c2048_i64 : i64 -> !pto.ptr - %ub_out = pto.castptr %c4096_i64 : i64 -> !pto.ptr - - %false = arith.constant false - pto.mte_gm_ub %arg0, %ub_lhs, %c0_i64, %c64_i64 - nburst(%c32_i64, %c64_i64, %c64_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 - pto.mte_gm_ub %arg1, %ub_rhs, %c0_i64, %c64_i64 - nburst(%c32_i64, %c64_i64, %c64_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 - - pto.set_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] - pto.wait_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] - - pto.vecscope { - %_:1 = scf.for %offset = %c0 to %c1024 step %c128 iter_args(%remaining = %c1024_i32) -> (i32) { - %mask, %next_remaining = pto.plt_b16 %remaining : i32 -> !pto.mask, i32 - %lhs = pto.vlds %ub_lhs[%offset] : !pto.ptr -> !pto.vreg<128xui16> - %rhs = pto.vlds %ub_rhs[%offset] : !pto.ptr -> !pto.vreg<128xui16> - %sum = pto.vadd %lhs, %rhs, %mask : !pto.vreg<128xui16>, !pto.vreg<128xui16>, !pto.mask -> !pto.vreg<128xui16> - pto.vsts %sum, %ub_out[%offset], %mask : !pto.vreg<128xui16>, !pto.ptr, !pto.mask - scf.yield %next_remaining : i32 - } - } - - pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.mte_ub_gm %ub_out, %arg2, %c64_i64 - nburst(%c32_i64, %c64_i64, %c64_i64) - : 
!pto.ptr, !pto.ptr, i64, i64, i64, i64 - pto.barrier #pto.pipe - return - } -} diff --git a/test/vpto/cases/micro-op/binary-vector/vadd-i16-unsigned-overflow/launch.cpp b/test/vpto/cases/micro-op/binary-vector/vadd-i16-unsigned-overflow/launch.cpp deleted file mode 100644 index bfd0fbe37..000000000 --- a/test/vpto/cases/micro-op/binary-vector/vadd-i16-unsigned-overflow/launch.cpp +++ /dev/null @@ -1,47 +0,0 @@ -// Copyright (c) 2026 Huawei Technologies Co., Ltd. -// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). -// Please refer to the License for details. You may not use this file except in compliance with the License. -// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -// See LICENSE in the root of the software repository for the full text of the License. 
- -#ifndef __VEC_SCOPE__ -#define __VEC_SCOPE__ -#endif - -#if defined(__CCE_AICORE__) && defined(__NPU_ARCH__) && (__NPU_ARCH__ == 2201) -typedef struct { unsigned char v; } hifloat8_t; -typedef struct { unsigned char v; } float8_e4m3_t; -typedef struct { unsigned char v; } float8_e5m2_t; -typedef struct { unsigned char v; } float8_e8m0_t; -typedef struct { unsigned char v; } float4_e1m2x2_t; -typedef struct { unsigned char v; } float4_e2m1x2_t; -#endif -#include -#if defined(__CCE_AICORE__) && defined(PTOAS_ENABLE_CCE_PRINT) -#include -#endif - -#if !defined(__CCE_AICORE__) && !defined(TMRGSORT_HPP) -struct MrgSortExecutedNumList { - uint16_t mrgSortList0; - uint16_t mrgSortList1; - uint16_t mrgSortList2; - uint16_t mrgSortList3; -}; -#endif -#ifndef __CPU_SIM -#include "acl/acl.h" -#endif - -extern "C" __global__ [aicore] void vadd_i16_unsigned_overflow_kernel(__gm__ uint16_t *v1, - __gm__ uint16_t *v2, - __gm__ uint16_t *v3); - -void LaunchVadd_i16_unsigned_overflow_kernel(uint16_t *v1, uint16_t *v2, uint16_t *v3, - void *stream) { - vadd_i16_unsigned_overflow_kernel<<<1, nullptr, stream>>>((__gm__ uint16_t *)v1, - (__gm__ uint16_t *)v2, - (__gm__ uint16_t *)v3); -} diff --git a/test/vpto/cases/micro-op/binary-vector/vadd-i16-unsigned-overflow/main.cpp b/test/vpto/cases/micro-op/binary-vector/vadd-i16-unsigned-overflow/main.cpp deleted file mode 100644 index fb6fa53b2..000000000 --- a/test/vpto/cases/micro-op/binary-vector/vadd-i16-unsigned-overflow/main.cpp +++ /dev/null @@ -1,92 +0,0 @@ -// Copyright (c) 2026 Huawei Technologies Co., Ltd. -// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). -// Please refer to the License for details. You may not use this file except in compliance with the License. 
-// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -// See LICENSE in the root of the software repository for the full text of the License. - -#include "test_common.h" -#include "acl/acl.h" -#include -#include -#include - -using namespace PtoTestCommon; - -#define ACL_CHECK(expr) \ - do { \ - const aclError _ret = (expr); \ - if (_ret != ACL_SUCCESS) { \ - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", #expr, \ - (int)_ret, __FILE__, __LINE__); \ - rc = 1; \ - goto cleanup; \ - } \ - } while (0) - -void LaunchVadd_i16_unsigned_overflow_kernel(uint16_t *v1, uint16_t *v2, uint16_t *v3, - void *stream); - -int main() { - size_t elemCount_v1 = 1024; - size_t fileSize_v1 = elemCount_v1 * sizeof(uint16_t); - size_t elemCount_v2 = 1024; - size_t fileSize_v2 = elemCount_v2 * sizeof(uint16_t); - size_t elemCount_v3 = 1024; - size_t fileSize_v3 = elemCount_v3 * sizeof(uint16_t); - uint16_t *v1Host = nullptr; - uint16_t *v1Device = nullptr; - uint16_t *v2Host = nullptr; - uint16_t *v2Device = nullptr; - uint16_t *v3Host = nullptr; - uint16_t *v3Device = nullptr; - int rc = 0; - bool aclInited = false; - bool deviceSet = false; - int deviceId = 0; - aclrtStream stream = nullptr; - - ACL_CHECK(aclInit(nullptr)); - aclInited = true; - if (const char *envDevice = std::getenv("ACL_DEVICE_ID")) - deviceId = std::atoi(envDevice); - ACL_CHECK(aclrtSetDevice(deviceId)); - deviceSet = true; - ACL_CHECK(aclrtCreateStream(&stream)); - ACL_CHECK(aclrtMallocHost((void **)(&v1Host), fileSize_v1)); - ACL_CHECK(aclrtMallocHost((void **)(&v2Host), fileSize_v2)); - ACL_CHECK(aclrtMallocHost((void **)(&v3Host), fileSize_v3)); - ACL_CHECK(aclrtMalloc((void **)&v1Device, fileSize_v1, ACL_MEM_MALLOC_HUGE_FIRST)); - ACL_CHECK(aclrtMalloc((void **)&v2Device, fileSize_v2, ACL_MEM_MALLOC_HUGE_FIRST)); - ACL_CHECK(aclrtMalloc((void 
**)&v3Device, fileSize_v3, ACL_MEM_MALLOC_HUGE_FIRST)); - ReadFile("./v1.bin", fileSize_v1, v1Host, fileSize_v1); - ReadFile("./v2.bin", fileSize_v2, v2Host, fileSize_v2); - ReadFile("./v3.bin", fileSize_v3, v3Host, fileSize_v3); - ACL_CHECK(aclrtMemcpy(v1Device, fileSize_v1, v1Host, fileSize_v1, - ACL_MEMCPY_HOST_TO_DEVICE)); - ACL_CHECK(aclrtMemcpy(v2Device, fileSize_v2, v2Host, fileSize_v2, - ACL_MEMCPY_HOST_TO_DEVICE)); - ACL_CHECK(aclrtMemcpy(v3Device, fileSize_v3, v3Host, fileSize_v3, - ACL_MEMCPY_HOST_TO_DEVICE)); - LaunchVadd_i16_unsigned_overflow_kernel(v1Device, v2Device, v3Device, stream); - ACL_CHECK(aclrtSynchronizeStream(stream)); - ACL_CHECK(aclrtMemcpy(v3Host, fileSize_v3, v3Device, fileSize_v3, - ACL_MEMCPY_DEVICE_TO_HOST)); - WriteFile("./v3.bin", v3Host, fileSize_v3); - -cleanup: - aclrtFree(v1Device); - aclrtFree(v2Device); - aclrtFree(v3Device); - aclrtFreeHost(v1Host); - aclrtFreeHost(v2Host); - aclrtFreeHost(v3Host); - if (stream != nullptr) - aclrtDestroyStream(stream); - if (deviceSet) - aclrtResetDevice(deviceId); - if (aclInited) - aclFinalize(); - return rc; -} diff --git a/test/vpto/cases/micro-op/binary-vector/vadd-i16-unsigned/compare.py b/test/vpto/cases/micro-op/binary-vector/vadd-i16-unsigned/compare.py deleted file mode 100755 index 29c833e93..000000000 --- a/test/vpto/cases/micro-op/binary-vector/vadd-i16-unsigned/compare.py +++ /dev/null @@ -1,42 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) 2026 Huawei Technologies Co., Ltd. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. 
-# See LICENSE in the root of the software repository for the full text of the License. - -# case: micro-op/binary-vector/vadd-i16-unsigned -# family: binary-vector -# target_ops: pto.vadd -# scenarios: core-i16-unsigned, full-mask -import os -import sys -import numpy as np - - -def compare_bin_prefix(golden_path, output_path, dtype, eps, count): - if not os.path.exists(golden_path) or not os.path.exists(output_path): - return False - golden = np.fromfile(golden_path, dtype=dtype, count=count) - output = np.fromfile(output_path, dtype=dtype, count=count) - return golden.shape == output.shape and np.allclose( - golden, output, atol=eps, rtol=eps, equal_nan=True - ) - - -def main(): - strict = os.getenv("COMPARE_STRICT", "1") != "0" - ok = compare_bin_prefix("golden_v3.bin", "v3.bin", np.uint16, 0, 1024) - if not ok: - if strict: - print("[ERROR] compare failed") - sys.exit(2) - print("[WARN] compare failed (non-gating)") - return - print("[INFO] compare passed") - - -if __name__ == "__main__": - main() diff --git a/test/vpto/cases/micro-op/binary-vector/vadd-i16-unsigned/golden.py b/test/vpto/cases/micro-op/binary-vector/vadd-i16-unsigned/golden.py deleted file mode 100755 index fa3e8e0c1..000000000 --- a/test/vpto/cases/micro-op/binary-vector/vadd-i16-unsigned/golden.py +++ /dev/null @@ -1,48 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) 2026 Huawei Technologies Co., Ltd. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. 
- -# case: micro-op/binary-vector/vadd-i16-unsigned -# family: binary-vector -# target_ops: pto.vadd -# scenarios: core-i16-unsigned, full-mask -# coding=utf-8 - -import argparse -from pathlib import Path - -import numpy as np - - -ROWS = 32 -COLS = 32 -SEED = 19 -def generate(output_dir: Path, seed: int) -> None: - rng = np.random.default_rng(seed) - v1 = rng.integers(0, 2001, size=(ROWS, COLS), dtype=np.uint16) - v2 = rng.integers(0, 2001, size=(ROWS, COLS), dtype=np.uint16) - v3 = np.zeros((ROWS, COLS), dtype=np.uint16) - golden_v3 = (v1.astype(np.uint32) + v2.astype(np.uint32)).astype(np.uint16) - - output_dir.mkdir(parents=True, exist_ok=True) - v1.reshape(-1).tofile(output_dir / "v1.bin") - v2.reshape(-1).tofile(output_dir / "v2.bin") - v3.reshape(-1).tofile(output_dir / "v3.bin") - golden_v3.reshape(-1).tofile(output_dir / "golden_v3.bin") - - -def main() -> None: - parser = argparse.ArgumentParser() - parser.add_argument("--output-dir", type=Path, default=Path(".")) - parser.add_argument("--seed", type=int, default=SEED) - args = parser.parse_args() - generate(args.output_dir, args.seed) - - -if __name__ == "__main__": - main() diff --git a/test/vpto/cases/micro-op/binary-vector/vadd-i16-unsigned/kernel.pto b/test/vpto/cases/micro-op/binary-vector/vadd-i16-unsigned/kernel.pto deleted file mode 100644 index ddbeecdf3..000000000 --- a/test/vpto/cases/micro-op/binary-vector/vadd-i16-unsigned/kernel.pto +++ /dev/null @@ -1,53 +0,0 @@ -// ----------------------------------------------------------------------------- -// case: micro-op/binary-vector/vadd-i16-unsigned -// family: binary-vector -// target_ops: pto.vadd -// scenarios: core-i16-unsigned, full-mask -// ----------------------------------------------------------------------------- -module attributes {pto.target_arch = "a5", pto.kernel_kind = #pto.kernel_kind} { - func.func @vadd_i16_unsigned_kernel(%arg0: !pto.ptr, %arg1: !pto.ptr, - %arg2: !pto.ptr) attributes {pto.kernel} { - %c0 = arith.constant 0 : 
index - %c1024 = arith.constant 1024 : index - %c128 = arith.constant 128 : index - %c0_i64 = arith.constant 0 : i64 - %c1_i64 = arith.constant 1 : i64 - %c32_i64 = arith.constant 32 : i64 - %c64_i64 = arith.constant 64 : i64 - %c2048_i64 = arith.constant 2048 : i64 - %c4096_i64 = arith.constant 4096 : i64 - - %ub_lhs = pto.castptr %c0_i64 : i64 -> !pto.ptr - %ub_rhs = pto.castptr %c2048_i64 : i64 -> !pto.ptr - %ub_out = pto.castptr %c4096_i64 : i64 -> !pto.ptr - - %false = arith.constant false - pto.mte_gm_ub %arg0, %ub_lhs, %c0_i64, %c64_i64 - nburst(%c32_i64, %c64_i64, %c64_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 - pto.mte_gm_ub %arg1, %ub_rhs, %c0_i64, %c64_i64 - nburst(%c32_i64, %c64_i64, %c64_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 - - pto.set_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] - pto.wait_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] - - pto.vecscope { - %mask = pto.pset_b16 "PAT_ALL" : !pto.mask - scf.for %offset = %c0 to %c1024 step %c128 { - %lhs = pto.vlds %ub_lhs[%offset] : !pto.ptr -> !pto.vreg<128xui16> - %rhs = pto.vlds %ub_rhs[%offset] : !pto.ptr -> !pto.vreg<128xui16> - %sum = pto.vadd %lhs, %rhs, %mask : !pto.vreg<128xui16>, !pto.vreg<128xui16>, !pto.mask -> !pto.vreg<128xui16> - pto.vsts %sum, %ub_out[%offset], %mask : !pto.vreg<128xui16>, !pto.ptr, !pto.mask - } - } - - pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.mte_ub_gm %ub_out, %arg2, %c64_i64 - nburst(%c32_i64, %c64_i64, %c64_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64 - pto.barrier #pto.pipe - return - } -} diff --git a/test/vpto/cases/micro-op/binary-vector/vadd-i16-unsigned/launch.cpp b/test/vpto/cases/micro-op/binary-vector/vadd-i16-unsigned/launch.cpp deleted file mode 100644 index c4198a017..000000000 --- a/test/vpto/cases/micro-op/binary-vector/vadd-i16-unsigned/launch.cpp +++ /dev/null @@ -1,53 +0,0 @@ -// Copyright (c) 2026 Huawei Technologies Co., Ltd. 
-// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). -// Please refer to the License for details. You may not use this file except in compliance with the License. -// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -// See LICENSE in the root of the software repository for the full text of the License. - -// ----------------------------------------------------------------------------- -// case: micro-op/binary-vector/vadd-i16-unsigned -// family: binary-vector -// target_ops: pto.vadd -// scenarios: core-i16-unsigned, full-mask -// ----------------------------------------------------------------------------- -#ifndef __VEC_SCOPE__ -#define __VEC_SCOPE__ -#endif - -#if defined(__CCE_AICORE__) && defined(__NPU_ARCH__) && (__NPU_ARCH__ == 2201) -typedef struct { unsigned char v; } hifloat8_t; -typedef struct { unsigned char v; } float8_e4m3_t; -typedef struct { unsigned char v; } float8_e5m2_t; -typedef struct { unsigned char v; } float8_e8m0_t; -typedef struct { unsigned char v; } float4_e1m2x2_t; -typedef struct { unsigned char v; } float4_e2m1x2_t; -#endif -#include -#if defined(__CCE_AICORE__) && defined(PTOAS_ENABLE_CCE_PRINT) -#include -#endif - -#if !defined(__CCE_AICORE__) && !defined(TMRGSORT_HPP) -struct MrgSortExecutedNumList { - uint16_t mrgSortList0; - uint16_t mrgSortList1; - uint16_t mrgSortList2; - uint16_t mrgSortList3; -}; -#endif -#ifndef __CPU_SIM -#include "acl/acl.h" -#endif - -extern "C" __global__ [aicore] void vadd_i16_unsigned_kernel(__gm__ uint16_t *v1, - __gm__ uint16_t *v2, - __gm__ uint16_t *v3); - -void LaunchVadd_i16_unsigned_kernel(uint16_t *v1, uint16_t *v2, uint16_t *v3, - void *stream) { - vadd_i16_unsigned_kernel<<<1, nullptr, stream>>>((__gm__ 
uint16_t *)v1, - (__gm__ uint16_t *)v2, - (__gm__ uint16_t *)v3); -} diff --git a/test/vpto/cases/micro-op/binary-vector/vadd-i16-unsigned/main.cpp b/test/vpto/cases/micro-op/binary-vector/vadd-i16-unsigned/main.cpp deleted file mode 100644 index dd05d5051..000000000 --- a/test/vpto/cases/micro-op/binary-vector/vadd-i16-unsigned/main.cpp +++ /dev/null @@ -1,102 +0,0 @@ -// Copyright (c) 2026 Huawei Technologies Co., Ltd. -// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). -// Please refer to the License for details. You may not use this file except in compliance with the License. -// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -// See LICENSE in the root of the software repository for the full text of the License. - -// ----------------------------------------------------------------------------- -// case: micro-op/binary-vector/vadd-i16-unsigned -// family: binary-vector -// target_ops: pto.vadd -// scenarios: core-i16-unsigned, full-mask -// ----------------------------------------------------------------------------- -/** -Copyright (c) 2025 Huawei Technologies Co., Ltd. 
-*/ - -#include "test_common.h" -#include "acl/acl.h" -#include -#include -#include - -using namespace PtoTestCommon; - -#define ACL_CHECK(expr) \ - do { \ - const aclError _ret = (expr); \ - if (_ret != ACL_SUCCESS) { \ - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", #expr, \ - (int)_ret, __FILE__, __LINE__); \ - rc = 1; \ - goto cleanup; \ - } \ - } while (0) - -void LaunchVadd_i16_unsigned_kernel(uint16_t *v1, uint16_t *v2, uint16_t *v3, - void *stream); - -int main() { - size_t elemCount_v1 = 1024; - size_t fileSize_v1 = elemCount_v1 * sizeof(uint16_t); - size_t elemCount_v2 = 1024; - size_t fileSize_v2 = elemCount_v2 * sizeof(uint16_t); - size_t elemCount_v3 = 1024; - size_t fileSize_v3 = elemCount_v3 * sizeof(uint16_t); - uint16_t *v1Host = nullptr; - uint16_t *v1Device = nullptr; - uint16_t *v2Host = nullptr; - uint16_t *v2Device = nullptr; - uint16_t *v3Host = nullptr; - uint16_t *v3Device = nullptr; - int rc = 0; - bool aclInited = false; - bool deviceSet = false; - int deviceId = 0; - aclrtStream stream = nullptr; - - ACL_CHECK(aclInit(nullptr)); - aclInited = true; - if (const char *envDevice = std::getenv("ACL_DEVICE_ID")) - deviceId = std::atoi(envDevice); - ACL_CHECK(aclrtSetDevice(deviceId)); - deviceSet = true; - ACL_CHECK(aclrtCreateStream(&stream)); - ACL_CHECK(aclrtMallocHost((void **)(&v1Host), fileSize_v1)); - ACL_CHECK(aclrtMallocHost((void **)(&v2Host), fileSize_v2)); - ACL_CHECK(aclrtMallocHost((void **)(&v3Host), fileSize_v3)); - ACL_CHECK(aclrtMalloc((void **)&v1Device, fileSize_v1, ACL_MEM_MALLOC_HUGE_FIRST)); - ACL_CHECK(aclrtMalloc((void **)&v2Device, fileSize_v2, ACL_MEM_MALLOC_HUGE_FIRST)); - ACL_CHECK(aclrtMalloc((void **)&v3Device, fileSize_v3, ACL_MEM_MALLOC_HUGE_FIRST)); - ReadFile("./v1.bin", fileSize_v1, v1Host, fileSize_v1); - ReadFile("./v2.bin", fileSize_v2, v2Host, fileSize_v2); - ReadFile("./v3.bin", fileSize_v3, v3Host, fileSize_v3); - ACL_CHECK(aclrtMemcpy(v1Device, fileSize_v1, v1Host, fileSize_v1, - 
ACL_MEMCPY_HOST_TO_DEVICE)); - ACL_CHECK(aclrtMemcpy(v2Device, fileSize_v2, v2Host, fileSize_v2, - ACL_MEMCPY_HOST_TO_DEVICE)); - ACL_CHECK(aclrtMemcpy(v3Device, fileSize_v3, v3Host, fileSize_v3, - ACL_MEMCPY_HOST_TO_DEVICE)); - LaunchVadd_i16_unsigned_kernel(v1Device, v2Device, v3Device, stream); - ACL_CHECK(aclrtSynchronizeStream(stream)); - ACL_CHECK(aclrtMemcpy(v3Host, fileSize_v3, v3Device, fileSize_v3, - ACL_MEMCPY_DEVICE_TO_HOST)); - WriteFile("./v3.bin", v3Host, fileSize_v3); - -cleanup: - aclrtFree(v1Device); - aclrtFree(v2Device); - aclrtFree(v3Device); - aclrtFreeHost(v1Host); - aclrtFreeHost(v2Host); - aclrtFreeHost(v3Host); - if (stream != nullptr) - aclrtDestroyStream(stream); - if (deviceSet) - aclrtResetDevice(deviceId); - if (aclInited) - aclFinalize(); - return rc; -} diff --git a/test/vpto/cases/micro-op/binary-vector/vadd-tail/compare.py b/test/vpto/cases/micro-op/binary-vector/vadd-tail/compare.py deleted file mode 100644 index c95419953..000000000 --- a/test/vpto/cases/micro-op/binary-vector/vadd-tail/compare.py +++ /dev/null @@ -1,39 +0,0 @@ -#!/usr/bin/python3 -# Copyright (c) 2026 Huawei Technologies Co., Ltd. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. 
- - -import os -import sys -import numpy as np - - -def compare_bin_prefix(golden_path, output_path, dtype, eps, count): - if not os.path.exists(golden_path) or not os.path.exists(output_path): - return False - golden = np.fromfile(golden_path, dtype=dtype, count=count) - output = np.fromfile(output_path, dtype=dtype, count=count) - return golden.shape == output.shape and np.allclose( - golden, output, atol=eps, rtol=eps, equal_nan=True - ) - - -def main(): - strict = os.getenv("COMPARE_STRICT", "1") != "0" - ok = compare_bin_prefix("golden_v3.bin", "v3.bin", np.float32, 1e-4, 1000) - if not ok: - if strict: - print("[ERROR] compare failed") - sys.exit(2) - print("[WARN] compare failed (non-gating)") - return - print("[INFO] compare passed") - - -if __name__ == "__main__": - main() diff --git a/test/vpto/cases/micro-op/binary-vector/vadd-tail/golden.py b/test/vpto/cases/micro-op/binary-vector/vadd-tail/golden.py deleted file mode 100644 index e967b1153..000000000 --- a/test/vpto/cases/micro-op/binary-vector/vadd-tail/golden.py +++ /dev/null @@ -1,51 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) 2026 Huawei Technologies Co., Ltd. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. 
- -# coding=utf-8 - -import argparse -from pathlib import Path - -import numpy as np - - -ROWS = 32 -COLS = 32 -SEED = 19 -LOGICAL_ELEMS = 1000 -OUT_SENTINEL = np.float32(-123.25) - - -def generate(output_dir: Path, seed: int) -> None: - rng = np.random.default_rng(seed) - v1 = rng.random((ROWS, COLS), dtype=np.float32) - v2 = rng.random((ROWS, COLS), dtype=np.float32) - v3 = np.full((ROWS, COLS), OUT_SENTINEL, dtype=np.float32) - golden_v3 = np.full((ROWS, COLS), OUT_SENTINEL, dtype=np.float32) - golden_v3.reshape(-1)[:LOGICAL_ELEMS] = ( - v1.reshape(-1)[:LOGICAL_ELEMS] + v2.reshape(-1)[:LOGICAL_ELEMS] - ).astype(np.float32, copy=False) - - output_dir.mkdir(parents=True, exist_ok=True) - v1.reshape(-1).tofile(output_dir / "v1.bin") - v2.reshape(-1).tofile(output_dir / "v2.bin") - v3.reshape(-1).tofile(output_dir / "v3.bin") - golden_v3.reshape(-1).tofile(output_dir / "golden_v3.bin") - - -def main() -> None: - parser = argparse.ArgumentParser() - parser.add_argument("--output-dir", type=Path, default=Path(".")) - parser.add_argument("--seed", type=int, default=SEED) - args = parser.parse_args() - generate(args.output_dir, args.seed) - - -if __name__ == "__main__": - main() diff --git a/test/vpto/cases/micro-op/binary-vector/vadd-tail/kernel.pto b/test/vpto/cases/micro-op/binary-vector/vadd-tail/kernel.pto deleted file mode 100644 index 599f6da84..000000000 --- a/test/vpto/cases/micro-op/binary-vector/vadd-tail/kernel.pto +++ /dev/null @@ -1,49 +0,0 @@ -module attributes {pto.target_arch = "a5", pto.kernel_kind = #pto.kernel_kind} { - func.func @vadd_tail_kernel_2d(%arg0: !pto.ptr, %arg1: !pto.ptr, - %arg2: !pto.ptr) attributes {pto.kernel} { - %c0 = arith.constant 0 : index - %c64 = arith.constant 64 : index - %c1024 = arith.constant 1024 : index - %c0_i64 = arith.constant 0 : i64 - %c1_i64 = arith.constant 1 : i64 - %c32_i64 = arith.constant 32 : i64 - %c128_i64 = arith.constant 128 : i64 - %c4096_i64 = arith.constant 4096 : i64 - %c8192_i64 = arith.constant 8192 
: i64 - %c1000_i32 = arith.constant 1000 : i32 - - %ub_lhs = pto.castptr %c0_i64 : i64 -> !pto.ptr - %ub_rhs = pto.castptr %c4096_i64 : i64 -> !pto.ptr - %ub_out = pto.castptr %c8192_i64 : i64 -> !pto.ptr - - %false = arith.constant false - pto.mte_gm_ub %arg0, %ub_lhs, %c0_i64, %c128_i64 - nburst(%c32_i64, %c128_i64, %c128_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 - pto.mte_gm_ub %arg1, %ub_rhs, %c0_i64, %c128_i64 - nburst(%c32_i64, %c128_i64, %c128_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 - - pto.set_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] - pto.wait_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] - - pto.vecscope { - %_:1 = scf.for %offset = %c0 to %c1024 step %c64 iter_args(%remaining = %c1000_i32) -> (i32) { - %mask, %next_remaining = pto.plt_b32 %remaining : i32 -> !pto.mask, i32 - %lhs = pto.vlds %ub_lhs[%offset] : !pto.ptr -> !pto.vreg<64xf32> - %rhs = pto.vlds %ub_rhs[%offset] : !pto.ptr -> !pto.vreg<64xf32> - %sum = pto.vadd %lhs, %rhs, %mask : !pto.vreg<64xf32>, !pto.vreg<64xf32>, !pto.mask -> !pto.vreg<64xf32> - pto.vsts %sum, %ub_out[%offset], %mask : !pto.vreg<64xf32>, !pto.ptr, !pto.mask - scf.yield %next_remaining : i32 - } - } - - pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.mte_ub_gm %ub_out, %arg2, %c128_i64 - nburst(%c32_i64, %c128_i64, %c128_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64 - pto.barrier #pto.pipe - return - } -} diff --git a/test/vpto/cases/micro-op/binary-vector/vadd-tail/launch.cpp b/test/vpto/cases/micro-op/binary-vector/vadd-tail/launch.cpp deleted file mode 100644 index 3d1578331..000000000 --- a/test/vpto/cases/micro-op/binary-vector/vadd-tail/launch.cpp +++ /dev/null @@ -1,46 +0,0 @@ -// Copyright (c) 2026 Huawei Technologies Co., Ltd. -// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). 
-// Please refer to the License for details. You may not use this file except in compliance with the License. -// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -// See LICENSE in the root of the software repository for the full text of the License. - -#ifndef __VEC_SCOPE__ -#define __VEC_SCOPE__ -#endif - -#if defined(__CCE_AICORE__) && defined(__NPU_ARCH__) && (__NPU_ARCH__ == 2201) -typedef struct { unsigned char v; } hifloat8_t; -typedef struct { unsigned char v; } float8_e4m3_t; -typedef struct { unsigned char v; } float8_e5m2_t; -typedef struct { unsigned char v; } float8_e8m0_t; -typedef struct { unsigned char v; } float4_e1m2x2_t; -typedef struct { unsigned char v; } float4_e2m1x2_t; -#endif -#include -#if defined(__CCE_AICORE__) && defined(PTOAS_ENABLE_CCE_PRINT) -#include -#endif - -#if !defined(__CCE_AICORE__) && !defined(TMRGSORT_HPP) -struct MrgSortExecutedNumList { - uint16_t mrgSortList0; - uint16_t mrgSortList1; - uint16_t mrgSortList2; - uint16_t mrgSortList3; -}; -#endif -#ifndef __CPU_SIM -#include "acl/acl.h" -#endif - -extern "C" __global__ [aicore] void vadd_tail_kernel_2d(__gm__ float *v1, - __gm__ float *v2, - __gm__ float *v3); - -void LaunchVadd_tail_kernel_2d(float *v1, float *v2, float *v3, void *stream) { - vadd_tail_kernel_2d<<<1, nullptr, stream>>>((__gm__ float *)v1, - (__gm__ float *)v2, - (__gm__ float *)v3); -} diff --git a/test/vpto/cases/micro-op/binary-vector/vadd-tail/main.cpp b/test/vpto/cases/micro-op/binary-vector/vadd-tail/main.cpp deleted file mode 100644 index 40a9881d6..000000000 --- a/test/vpto/cases/micro-op/binary-vector/vadd-tail/main.cpp +++ /dev/null @@ -1,94 +0,0 @@ -// Copyright (c) 2026 Huawei Technologies Co., Ltd. 
-// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). -// Please refer to the License for details. You may not use this file except in compliance with the License. -// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -// See LICENSE in the root of the software repository for the full text of the License. - -/** -Copyright (c) 2025 Huawei Technologies Co., Ltd. -*/ - -#include "test_common.h" -#include "acl/acl.h" -#include -#include - -using namespace PtoTestCommon; - -#define ACL_CHECK(expr) \ - do { \ - const aclError _ret = (expr); \ - if (_ret != ACL_SUCCESS) { \ - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", #expr, \ - (int)_ret, __FILE__, __LINE__); \ - rc = 1; \ - goto cleanup; \ - } \ - } while (0) - -void LaunchVadd_tail_kernel_2d(float *v1, float *v2, float *v3, void *stream); - -int main() { - size_t elemCount_v1 = 1024; - size_t fileSize_v1 = elemCount_v1 * sizeof(float); - size_t elemCount_v2 = 1024; - size_t fileSize_v2 = elemCount_v2 * sizeof(float); - size_t elemCount_v3 = 1024; - size_t fileSize_v3 = elemCount_v3 * sizeof(float); - float *v1Host = nullptr; - float *v1Device = nullptr; - float *v2Host = nullptr; - float *v2Device = nullptr; - float *v3Host = nullptr; - float *v3Device = nullptr; - int rc = 0; - bool aclInited = false; - bool deviceSet = false; - int deviceId = 0; - aclrtStream stream = nullptr; - - ACL_CHECK(aclInit(nullptr)); - aclInited = true; - if (const char *envDevice = std::getenv("ACL_DEVICE_ID")) - deviceId = std::atoi(envDevice); - ACL_CHECK(aclrtSetDevice(deviceId)); - deviceSet = true; - ACL_CHECK(aclrtCreateStream(&stream)); - ACL_CHECK(aclrtMallocHost((void **)(&v1Host), fileSize_v1)); - ACL_CHECK(aclrtMallocHost((void 
**)(&v2Host), fileSize_v2)); - ACL_CHECK(aclrtMallocHost((void **)(&v3Host), fileSize_v3)); - ACL_CHECK(aclrtMalloc((void **)&v1Device, fileSize_v1, ACL_MEM_MALLOC_HUGE_FIRST)); - ACL_CHECK(aclrtMalloc((void **)&v2Device, fileSize_v2, ACL_MEM_MALLOC_HUGE_FIRST)); - ACL_CHECK(aclrtMalloc((void **)&v3Device, fileSize_v3, ACL_MEM_MALLOC_HUGE_FIRST)); - ReadFile("./v1.bin", fileSize_v1, v1Host, fileSize_v1); - ReadFile("./v2.bin", fileSize_v2, v2Host, fileSize_v2); - ReadFile("./v3.bin", fileSize_v3, v3Host, fileSize_v3); - ACL_CHECK(aclrtMemcpy(v1Device, fileSize_v1, v1Host, fileSize_v1, - ACL_MEMCPY_HOST_TO_DEVICE)); - ACL_CHECK(aclrtMemcpy(v2Device, fileSize_v2, v2Host, fileSize_v2, - ACL_MEMCPY_HOST_TO_DEVICE)); - ACL_CHECK(aclrtMemcpy(v3Device, fileSize_v3, v3Host, fileSize_v3, - ACL_MEMCPY_HOST_TO_DEVICE)); - LaunchVadd_tail_kernel_2d(v1Device, v2Device, v3Device, stream); - ACL_CHECK(aclrtSynchronizeStream(stream)); - ACL_CHECK(aclrtMemcpy(v3Host, fileSize_v3, v3Device, fileSize_v3, - ACL_MEMCPY_DEVICE_TO_HOST)); - WriteFile("./v3.bin", v3Host, fileSize_v3); - -cleanup: - aclrtFree(v1Device); - aclrtFree(v2Device); - aclrtFree(v3Device); - aclrtFreeHost(v1Host); - aclrtFreeHost(v2Host); - aclrtFreeHost(v3Host); - if (stream != nullptr) - aclrtDestroyStream(stream); - if (deviceSet) - aclrtResetDevice(deviceId); - if (aclInited) - aclFinalize(); - return rc; -} diff --git a/test/vpto/cases/micro-op/binary-vector/vadd/compare.py b/test/vpto/cases/micro-op/binary-vector/vadd/compare.py index a5f14dabc..518ab8032 100644 --- a/test/vpto/cases/micro-op/binary-vector/vadd/compare.py +++ b/test/vpto/cases/micro-op/binary-vector/vadd/compare.py @@ -7,30 +7,71 @@ # INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. # See LICENSE in the root of the software repository for the full text of the License. +# Merged vadd compare: checks all 9 variants. 
import os import sys import numpy as np -def compare_bin(golden_path, output_path, dtype, eps): +def compare_bin(golden_path, output_path, dtype, eps, count=-1): + if not os.path.exists(golden_path) or not os.path.exists(output_path): + return False + kw = {} if count < 0 else {"count": count} + golden = np.fromfile(golden_path, dtype=dtype, **kw) + output = np.fromfile(output_path, dtype=dtype, **kw) + return golden.shape == output.shape and np.allclose( + golden, output, atol=eps, rtol=eps, equal_nan=True + ) + + +def compare_bin_exact(golden_path, output_path, dtype): if not os.path.exists(golden_path) or not os.path.exists(output_path): return False golden = np.fromfile(golden_path, dtype=dtype) output = np.fromfile(output_path, dtype=dtype) - return golden.shape == output.shape and np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True) + return golden.shape == output.shape and np.array_equal(golden, output) def main(): strict = os.getenv("COMPARE_STRICT", "1") != "0" - ok = compare_bin("golden_v3.bin", "v3.bin", np.float32, 1e-4) - if not ok: + + # (golden, output, dtype, eps, count, label) + checks = [ + ("golden_v3.bin", "v3.bin", np.float32, 1e-4, -1, "f32"), + ("golden_v3_f16.bin", "v3_f16.bin", np.float16, 5e-3, 1024, "f16"), + ("golden_v3_bf16.bin", "v3_bf16.bin", np.uint16, 0, 1024, "bf16"), + ("golden_v3_x.bin", "v3_x.bin", np.float32, 1e-4, -1, "f32-exceptional"), + ("golden_v3_i16s.bin", "v3_i16s.bin", np.int16, 0, 1024, "i16-signed"), + ("golden_v3_tail.bin", "v3_tail.bin", np.float32, 1e-4, 1000, "tail"), + ] + # Overflow variants need exact match (wrapping arithmetic) + checks_exact = [ + ("golden_v3_i16s_ov.bin", "v3_i16s_ov.bin", np.int16, "i16-signed-overflow"), + ("golden_v3_i16u.bin", "v3_i16u.bin", np.uint16, "i16-unsigned"), + ("golden_v3_i16u_ov.bin", "v3_i16u_ov.bin", np.uint16, "i16-unsigned-overflow"), + ] + + failed = [] + for golden, output, dtype, eps, count, label in checks: + ok = compare_bin(golden, output, dtype, eps, 
count) + if not ok: + failed.append(label) + print(f"[ERROR] compare failed: {label}") + + for golden, output, dtype, label in checks_exact: + ok = compare_bin_exact(golden, output, dtype) + if not ok: + failed.append(label) + print(f"[ERROR] compare failed (exact): {label}") + + if failed: if strict: - print("[ERROR] compare failed") + print(f"[ERROR] {len(failed)} variant(s) failed: {', '.join(failed)}") sys.exit(2) - print("[WARN] compare failed (non-gating)") + print(f"[WARN] {len(failed)} variant(s) failed (non-gating): {', '.join(failed)}") return - print("[INFO] compare passed") + print("[INFO] compare passed (all 9 variants)") if __name__ == "__main__": diff --git a/test/vpto/cases/micro-op/binary-vector/vadd/golden.py b/test/vpto/cases/micro-op/binary-vector/vadd/golden.py index fbf37245e..3a550b187 100644 --- a/test/vpto/cases/micro-op/binary-vector/vadd/golden.py +++ b/test/vpto/cases/micro-op/binary-vector/vadd/golden.py @@ -7,31 +7,173 @@ # INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. # See LICENSE in the root of the software repository for the full text of the License. -# coding=utf-8 +# Merged vadd golden data generator: 9 variants. +# Each variant writes uniquely suffixed .bin files. 
import argparse from pathlib import Path import numpy as np - ROWS = 32 COLS = 32 SEED = 19 +LOGICAL_ELEMS = 1000 +OUT_SENTINEL = np.float32(-123.25) + + +# ---- helpers ---- + +def f32_to_bf16_bits(values: np.ndarray) -> np.ndarray: + wide = values.astype(np.float32, copy=False).view(np.uint32) + rounding = np.uint32(0x7FFF) + ((wide >> 16) & np.uint32(1)) + return ((wide + rounding) >> 16).astype(np.uint16) + + +def bf16_bits_to_f32(bits: np.ndarray) -> np.ndarray: + return (bits.astype(np.uint32) << 16).view(np.float32) + + +def wrap_add_i16(lhs: np.ndarray, rhs: np.ndarray) -> np.ndarray: + bits = lhs.view(np.uint16).astype(np.uint32) + rhs.view(np.uint16).astype(np.uint32) + return (bits & 0xFFFF).astype(np.uint16).view(np.int16) -def generate(output_dir: Path, seed: int) -> None: - rng = np.random.default_rng(seed) +def wrap_add_u16(lhs: np.ndarray, rhs: np.ndarray) -> np.ndarray: + wide = lhs.astype(np.uint32) + rhs.astype(np.uint32) + return (wide & 0xFFFF).astype(np.uint16) + + +# ---- generators ---- + +def gen_f32(out: Path, rng: np.random.Generator) -> None: v1 = rng.uniform(-8.0, 8.0, size=(ROWS, COLS)).astype(np.float32) v2 = rng.uniform(-8.0, 8.0, size=(ROWS, COLS)).astype(np.float32) - golden_v3 = (v1 + v2).astype(np.float32, copy=False) + g = (v1 + v2).astype(np.float32, copy=False) + v3 = np.zeros((ROWS, COLS), dtype=np.float32) + v1.reshape(-1).tofile(out / "v1.bin") + v2.reshape(-1).tofile(out / "v2.bin") + v3.reshape(-1).tofile(out / "v3.bin") + g.reshape(-1).tofile(out / "golden_v3.bin") + + +def gen_f16(out: Path, rng: np.random.Generator) -> None: + v1 = rng.uniform(-8.0, 8.0, size=(ROWS, COLS)).astype(np.float16) + v2 = rng.uniform(-8.0, 8.0, size=(ROWS, COLS)).astype(np.float16) + v3 = np.zeros((ROWS, COLS), dtype=np.float16) + g = (v1.astype(np.float32) + v2.astype(np.float32)).astype(np.float16) + v1.reshape(-1).tofile(out / "v1_f16.bin") + v2.reshape(-1).tofile(out / "v2_f16.bin") + v3.reshape(-1).tofile(out / "v3_f16.bin") + 
g.reshape(-1).tofile(out / "golden_v3_f16.bin") + + +def gen_bf16(out: Path, rng: np.random.Generator) -> None: + elems = ROWS * COLS + v1_f32 = rng.uniform(-4.0, 4.0, size=elems).astype(np.float32) + v2_f32 = rng.uniform(-4.0, 4.0, size=elems).astype(np.float32) + v1 = f32_to_bf16_bits(v1_f32) + v2 = f32_to_bf16_bits(v2_f32) + v3 = np.zeros(elems, dtype=np.uint16) + g = f32_to_bf16_bits(bf16_bits_to_f32(v1) + bf16_bits_to_f32(v2)) + v1.tofile(out / "v1_bf16.bin") + v2.tofile(out / "v2_bf16.bin") + v3.tofile(out / "v3_bf16.bin") + g.tofile(out / "golden_v3_bf16.bin") + + +def gen_f32_exceptional(out: Path, _rng: np.random.Generator) -> None: + specials_a = np.array([-np.inf, -7.5, -0.0, 0.0, 1.0, np.inf, np.nan, 3.5], dtype=np.float32) + specials_b = np.array([np.inf, 2.5, 0.0, -0.0, -1.0, -np.inf, 1.0, np.nan], dtype=np.float32) + v1 = np.resize(specials_a, ROWS * COLS).reshape(ROWS, COLS).astype(np.float32) + v2 = np.resize(specials_b, ROWS * COLS).reshape(ROWS, COLS).astype(np.float32) v3 = np.zeros((ROWS, COLS), dtype=np.float32) + g = (v1 + v2).astype(np.float32, copy=False) + v1.reshape(-1).tofile(out / "v1_x.bin") + v2.reshape(-1).tofile(out / "v2_x.bin") + v3.reshape(-1).tofile(out / "v3_x.bin") + g.reshape(-1).tofile(out / "golden_v3_x.bin") + + +def gen_i16_signed(out: Path, rng: np.random.Generator) -> None: + v1 = rng.integers(-1000, 1001, size=(ROWS, COLS), dtype=np.int16) + v2 = rng.integers(-1000, 1001, size=(ROWS, COLS), dtype=np.int16) + v3 = np.zeros((ROWS, COLS), dtype=np.int16) + g = (v1.astype(np.int32) + v2.astype(np.int32)).astype(np.int16) + v1.reshape(-1).tofile(out / "v1_i16s.bin") + v2.reshape(-1).tofile(out / "v2_i16s.bin") + v3.reshape(-1).tofile(out / "v3_i16s.bin") + g.reshape(-1).tofile(out / "golden_v3_i16s.bin") + + +def gen_i16_signed_overflow(out: Path, _rng: np.random.Generator) -> None: + elems = ROWS * COLS + lhs_pattern = np.array([32767, 32760, -32768, -32760, 1000, -1000, 12345, -12345], dtype=np.int16) + rhs_pattern = 
np.array([1, 100, -1, -100, 30000, -30000, 23456, -23456], dtype=np.int16) + repeats = elems // lhs_pattern.size + v1 = np.tile(lhs_pattern, repeats) + v2 = np.tile(rhs_pattern, repeats) + v3 = np.zeros(elems, dtype=np.int16) + g = wrap_add_i16(v1, v2) + v1.tofile(out / "v1_i16s_ov.bin") + v2.tofile(out / "v2_i16s_ov.bin") + v3.tofile(out / "v3_i16s_ov.bin") + g.tofile(out / "golden_v3_i16s_ov.bin") + + +def gen_i16_unsigned(out: Path, rng: np.random.Generator) -> None: + v1 = rng.integers(0, 2001, size=(ROWS, COLS), dtype=np.uint16) + v2 = rng.integers(0, 2001, size=(ROWS, COLS), dtype=np.uint16) + v3 = np.zeros((ROWS, COLS), dtype=np.uint16) + g = (v1.astype(np.uint32) + v2.astype(np.uint32)).astype(np.uint16) + v1.reshape(-1).tofile(out / "v1_i16u.bin") + v2.reshape(-1).tofile(out / "v2_i16u.bin") + v3.reshape(-1).tofile(out / "v3_i16u.bin") + g.reshape(-1).tofile(out / "golden_v3_i16u.bin") + + +def gen_i16_unsigned_overflow(out: Path, _rng: np.random.Generator) -> None: + elems = ROWS * COLS + lhs_pattern = np.array([65535, 65530, 65500, 60000, 100, 0, 32768, 12345], dtype=np.uint16) + rhs_pattern = np.array([1, 10, 1000, 10000, 65535, 5, 40000, 60000], dtype=np.uint16) + repeats = elems // lhs_pattern.size + v1 = np.tile(lhs_pattern, repeats) + v2 = np.tile(rhs_pattern, repeats) + v3 = np.zeros(elems, dtype=np.uint16) + g = wrap_add_u16(v1, v2) + v1.tofile(out / "v1_i16u_ov.bin") + v2.tofile(out / "v2_i16u_ov.bin") + v3.tofile(out / "v3_i16u_ov.bin") + g.tofile(out / "golden_v3_i16u_ov.bin") + + +def gen_tail(out: Path, rng: np.random.Generator) -> None: + v1 = rng.random((ROWS, COLS), dtype=np.float32) + v2 = rng.random((ROWS, COLS), dtype=np.float32) + v3 = np.full((ROWS, COLS), OUT_SENTINEL, dtype=np.float32) + g = np.full((ROWS, COLS), OUT_SENTINEL, dtype=np.float32) + g.reshape(-1)[:LOGICAL_ELEMS] = ( + v1.reshape(-1)[:LOGICAL_ELEMS] + v2.reshape(-1)[:LOGICAL_ELEMS] + ).astype(np.float32, copy=False) + v1.reshape(-1).tofile(out / "v1_tail.bin") + 
v2.reshape(-1).tofile(out / "v2_tail.bin") + v3.reshape(-1).tofile(out / "v3_tail.bin") + g.reshape(-1).tofile(out / "golden_v3_tail.bin") + + +# ---- main ---- - output_dir.mkdir(parents=True, exist_ok=True) - v1.reshape(-1).tofile(output_dir / "v1.bin") - v2.reshape(-1).tofile(output_dir / "v2.bin") - v3.reshape(-1).tofile(output_dir / "v3.bin") - golden_v3.reshape(-1).tofile(output_dir / "golden_v3.bin") +GENERATORS = [ + gen_f32, + gen_f16, + gen_bf16, + gen_f32_exceptional, + gen_i16_signed, + gen_i16_signed_overflow, + gen_i16_unsigned, + gen_i16_unsigned_overflow, + gen_tail, +] def main() -> None: @@ -39,7 +181,11 @@ def main() -> None: parser.add_argument("--output-dir", type=Path, default=Path(".")) parser.add_argument("--seed", type=int, default=SEED) args = parser.parse_args() - generate(args.output_dir, args.seed) + rng = np.random.default_rng(args.seed) + out = args.output_dir + out.mkdir(parents=True, exist_ok=True) + for gen in GENERATORS: + gen(out, rng) if __name__ == "__main__": diff --git a/test/vpto/cases/micro-op/binary-vector/vadd/kernel.pto b/test/vpto/cases/micro-op/binary-vector/vadd/kernel.pto index 2aa6dbfe0..9df0168b9 100644 --- a/test/vpto/cases/micro-op/binary-vector/vadd/kernel.pto +++ b/test/vpto/cases/micro-op/binary-vector/vadd/kernel.pto @@ -1,32 +1,45 @@ +// Copyright (c) 2026 Huawei Technologies Co., Ltd. +// This program is free software, you can redistribute it and/or modify it under the terms and conditions of +// CANN Open Software License Agreement Version 2.0 (the "License"). +// Please refer to the License for details. You may not use this file except in compliance with the License. +// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +// See LICENSE in the root of the software repository for the full text of the License. 
+ +// Merged vadd kernel: 9 variants (f32, f16, bf16, f32-exceptional, +// i16-signed, i16-signed-overflow, i16-unsigned, i16-unsigned-overflow, tail) module attributes {pto.target_arch = "a5", pto.kernel_kind = #pto.kernel_kind} { - func.func @add_kernel_2d(%arg0: !pto.ptr, %arg1: !pto.ptr, - %arg2: !pto.ptr) attributes {pto.kernel} { - %c0 = arith.constant 0 : index - %c1 = arith.constant 1 : index - %c64 = arith.constant 64 : index - %c1024 = arith.constant 1024 : index - %c0_i64 = arith.constant 0 : i64 - %c1_i64 = arith.constant 1 : i64 - %c2_i64 = arith.constant 2 : i64 - %c32_i64 = arith.constant 32 : i64 - %c128_i64 = arith.constant 128 : i64 - %c4096_i64 = arith.constant 4096 : i64 - %c8192_i64 = arith.constant 8192 : i64 - %c1024_i32 = arith.constant 1024 : i32 - - %ub_lhs = pto.castptr %c0_i64 : i64 -> !pto.ptr - %ub_rhs = pto.castptr %c4096_i64 : i64 -> !pto.ptr - %ub_out = pto.castptr %c8192_i64 : i64 -> !pto.ptr - - %false = arith.constant false + + // --- f32 --- + func.func @vadd_deep_merged_kernel(%arg0: !pto.ptr, %arg1: !pto.ptr, %arg2: !pto.ptr, %arg3: !pto.ptr, %arg4: !pto.ptr, %arg5: !pto.ptr, %arg6: !pto.ptr, %arg7: !pto.ptr, %arg8: !pto.ptr, %arg9: !pto.ptr, %arg10: !pto.ptr, %arg11: !pto.ptr, %arg12: !pto.ptr, %arg13: !pto.ptr, %arg14: !pto.ptr, %arg15: !pto.ptr, %arg16: !pto.ptr, %arg17: !pto.ptr, %arg18: !pto.ptr, %arg19: !pto.ptr, %arg20: !pto.ptr, %arg21: !pto.ptr, %arg22: !pto.ptr, %arg23: !pto.ptr, %arg24: !pto.ptr, %arg25: !pto.ptr, %arg26: !pto.ptr) attributes {pto.kernel} { + // merged from add_kernel_2d via LaunchAdd_kernel_2d + + %c0_m0 = arith.constant 0 : index + %c1_m0 = arith.constant 1 : index + %c64_m0 = arith.constant 64 : index + %c1024_m0 = arith.constant 1024 : index + %c0_i64_m0 = arith.constant 0 : i64 + %c1_i64_m0 = arith.constant 1 : i64 + %c2_i64_m0 = arith.constant 2 : i64 + %c32_i64_m0 = arith.constant 32 : i64 + %c128_i64_m0 = arith.constant 128 : i64 + %c4096_i64_m0 = arith.constant 4096 : i64 + %c8192_i64_m0 = 
arith.constant 8192 : i64 + %c1024_i32_m0 = arith.constant 1024 : i32 + + %ub_lhs_m0 = pto.castptr %c0_i64_m0 : i64 -> !pto.ptr + %ub_rhs_m0 = pto.castptr %c4096_i64_m0 : i64 -> !pto.ptr + %ub_out_m0 = pto.castptr %c8192_i64_m0 : i64 -> !pto.ptr + + %false_m0 = arith.constant false pto.get_buf "PIPE_MTE2", 0, 0 - pto.mte_gm_ub %arg0, %ub_lhs, %c0_i64, %c128_i64 - nburst(%c32_i64, %c128_i64, %c128_i64) + pto.mte_gm_ub %arg0, %ub_lhs_m0, %c0_i64_m0, %c128_i64_m0 + nburst(%c32_i64_m0, %c128_i64_m0, %c128_i64_m0) : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 pto.rls_buf "PIPE_MTE2", 0, 0 pto.get_buf "PIPE_MTE2", 1, 0 - pto.mte_gm_ub %arg1, %ub_rhs, %c0_i64, %c128_i64 - nburst(%c32_i64, %c128_i64, %c128_i64) + pto.mte_gm_ub %arg1, %ub_rhs_m0, %c0_i64_m0, %c128_i64_m0 + nburst(%c32_i64_m0, %c128_i64_m0, %c128_i64_m0) : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 pto.rls_buf "PIPE_MTE2", 1, 0 pto.get_buf "PIPE_V", 0, 0 @@ -34,13 +47,13 @@ module attributes {pto.target_arch = "a5", pto.kernel_kind = #pto.kernel_kind (i32) { - %mask, %next_remaining = pto.plt_b32 %remaining : i32 -> !pto.mask, i32 - %lhs = pto.vlds %ub_lhs[%offset] : !pto.ptr -> !pto.vreg<64xf32> - %rhs = pto.vlds %ub_rhs[%offset] : !pto.ptr -> !pto.vreg<64xf32> - %sum = pto.vadd %lhs, %rhs, %mask : !pto.vreg<64xf32>, !pto.vreg<64xf32>, !pto.mask -> !pto.vreg<64xf32> - pto.vsts %sum, %ub_out[%offset], %mask : !pto.vreg<64xf32>, !pto.ptr, !pto.mask - scf.yield %next_remaining : i32 + %__m0:1 = scf.for %offset_m0 = %c0_m0 to %c1024_m0 step %c64_m0 iter_args(%remaining_m0 = %c1024_i32_m0) -> (i32) { + %mask_m0, %next_remaining_m0 = pto.plt_b32 %remaining_m0 : i32 -> !pto.mask, i32 + %lhs_m0 = pto.vlds %ub_lhs_m0[%offset_m0] : !pto.ptr -> !pto.vreg<64xf32> + %rhs_m0 = pto.vlds %ub_rhs_m0[%offset_m0] : !pto.ptr -> !pto.vreg<64xf32> + %sum_m0 = pto.vadd %lhs_m0, %rhs_m0, %mask_m0 : !pto.vreg<64xf32>, !pto.vreg<64xf32>, !pto.mask -> !pto.vreg<64xf32> + pto.vsts %sum_m0, %ub_out_m0[%offset_m0], %mask_m0 : 
!pto.vreg<64xf32>, !pto.ptr, !pto.mask + scf.yield %next_remaining_m0 : i32 } } pto.rls_buf "PIPE_V", 0, 0 @@ -48,11 +61,372 @@ module attributes {pto.target_arch = "a5", pto.kernel_kind = #pto.kernel_kind, !pto.ptr, i64, i64, i64, i64 pto.rls_buf "PIPE_MTE3", 2, 0 pto.barrier #pto.pipe + + // merged from vadd_f16_kernel via LaunchVadd_f16_kernel + + %c0_m1 = arith.constant 0 : index + %c128_m1 = arith.constant 128 : index + %c1024_m1 = arith.constant 1024 : index + %c0_i64_m1 = arith.constant 0 : i64 + %c1_i64_m1 = arith.constant 1 : i64 + %c32_i64_m1 = arith.constant 32 : i64 + %c64_i64_m1 = arith.constant 64 : i64 + %c2048_i64_m1 = arith.constant 2048 : i64 + %c4096_i64_m1 = arith.constant 4096 : i64 + + %ub_lhs_m1 = pto.castptr %c0_i64_m1 : i64 -> !pto.ptr + %ub_rhs_m1 = pto.castptr %c2048_i64_m1 : i64 -> !pto.ptr + %ub_out_m1 = pto.castptr %c4096_i64_m1 : i64 -> !pto.ptr + + %false_m1 = arith.constant false + pto.mte_gm_ub %arg3, %ub_lhs_m1, %c0_i64_m1, %c64_i64_m1 + nburst(%c32_i64_m1, %c64_i64_m1, %c64_i64_m1) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 + pto.mte_gm_ub %arg4, %ub_rhs_m1, %c0_i64_m1, %c64_i64_m1 + nburst(%c32_i64_m1, %c64_i64_m1, %c64_i64_m1) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 + + pto.set_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] + pto.wait_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] + + pto.vecscope { + %mask_m1 = pto.pset_b16 "PAT_ALL" : !pto.mask + scf.for %offset_m1 = %c0_m1 to %c1024_m1 step %c128_m1 { + %lhs_m1 = pto.vlds %ub_lhs_m1[%offset_m1] : !pto.ptr -> !pto.vreg<128xf16> + %rhs_m1 = pto.vlds %ub_rhs_m1[%offset_m1] : !pto.ptr -> !pto.vreg<128xf16> + %sum_m1 = pto.vadd %lhs_m1, %rhs_m1, %mask_m1 : !pto.vreg<128xf16>, !pto.vreg<128xf16>, !pto.mask -> !pto.vreg<128xf16> + pto.vsts %sum_m1, %ub_out_m1[%offset_m1], %mask_m1 : !pto.vreg<128xf16>, !pto.ptr, !pto.mask + } + } + + pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] + pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] + pto.mte_ub_gm %ub_out_m1, %arg5, %c64_i64_m1 + 
nburst(%c32_i64_m1, %c64_i64_m1, %c64_i64_m1) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64 + pto.barrier #pto.pipe + + // merged from vadd_bf16_kernel via LaunchVadd_bf16_kernel + + %c0_m2 = arith.constant 0 : index + %c128_m2 = arith.constant 128 : index + %c1024_m2 = arith.constant 1024 : index + %c0_i64_m2 = arith.constant 0 : i64 + %c1_i64_m2 = arith.constant 1 : i64 + %c32_i64_m2 = arith.constant 32 : i64 + %c64_i64_m2 = arith.constant 64 : i64 + %c2048_i64_m2 = arith.constant 2048 : i64 + %c4096_i64_m2 = arith.constant 4096 : i64 + + %ub_lhs_m2 = pto.castptr %c0_i64_m2 : i64 -> !pto.ptr + %ub_rhs_m2 = pto.castptr %c2048_i64_m2 : i64 -> !pto.ptr + %ub_out_m2 = pto.castptr %c4096_i64_m2 : i64 -> !pto.ptr + + %false_m2 = arith.constant false + pto.mte_gm_ub %arg6, %ub_lhs_m2, %c0_i64_m2, %c64_i64_m2 + nburst(%c32_i64_m2, %c64_i64_m2, %c64_i64_m2) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 + pto.mte_gm_ub %arg7, %ub_rhs_m2, %c0_i64_m2, %c64_i64_m2 + nburst(%c32_i64_m2, %c64_i64_m2, %c64_i64_m2) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 + + pto.set_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] + pto.wait_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] + + pto.vecscope { + %mask_m2 = pto.pset_b16 "PAT_ALL" : !pto.mask + scf.for %offset_m2 = %c0_m2 to %c1024_m2 step %c128_m2 { + %lhs_m2 = pto.vlds %ub_lhs_m2[%offset_m2] : !pto.ptr -> !pto.vreg<128xbf16> + %rhs_m2 = pto.vlds %ub_rhs_m2[%offset_m2] : !pto.ptr -> !pto.vreg<128xbf16> + %sum_m2 = pto.vadd %lhs_m2, %rhs_m2, %mask_m2 : !pto.vreg<128xbf16>, !pto.vreg<128xbf16>, !pto.mask -> !pto.vreg<128xbf16> + pto.vsts %sum_m2, %ub_out_m2[%offset_m2], %mask_m2 : !pto.vreg<128xbf16>, !pto.ptr, !pto.mask + } + } + + pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] + pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] + pto.mte_ub_gm %ub_out_m2, %arg8, %c64_i64_m2 + nburst(%c32_i64_m2, %c64_i64_m2, %c64_i64_m2) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64 + pto.barrier #pto.pipe + + // merged from vadd_f32_exceptional_kernel_2d via 
LaunchVadd_f32_exceptional_kernel_2d + + %c0_m3 = arith.constant 0 : index + %c64_m3 = arith.constant 64 : index + %c1024_m3 = arith.constant 1024 : index + %c0_i64_m3 = arith.constant 0 : i64 + %c1_i64_m3 = arith.constant 1 : i64 + %c32_i64_m3 = arith.constant 32 : i64 + %c128_i64_m3 = arith.constant 128 : i64 + %c4096_i64_m3 = arith.constant 4096 : i64 + %c8192_i64_m3 = arith.constant 8192 : i64 + %c1024_i32_m3 = arith.constant 1024 : i32 + + %ub_lhs_m3 = pto.castptr %c0_i64_m3 : i64 -> !pto.ptr + %ub_rhs_m3 = pto.castptr %c4096_i64_m3 : i64 -> !pto.ptr + %ub_out_m3 = pto.castptr %c8192_i64_m3 : i64 -> !pto.ptr + + %false_m3 = arith.constant false + pto.mte_gm_ub %arg9, %ub_lhs_m3, %c0_i64_m3, %c128_i64_m3 + nburst(%c32_i64_m3, %c128_i64_m3, %c128_i64_m3) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 + pto.mte_gm_ub %arg10, %ub_rhs_m3, %c0_i64_m3, %c128_i64_m3 + nburst(%c32_i64_m3, %c128_i64_m3, %c128_i64_m3) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 + + pto.set_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] + pto.wait_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] + + pto.vecscope { + %__m3:1 = scf.for %offset_m3 = %c0_m3 to %c1024_m3 step %c64_m3 iter_args(%remaining_m3 = %c1024_i32_m3) -> (i32) { + %mask_m3, %next_remaining_m3 = pto.plt_b32 %remaining_m3 : i32 -> !pto.mask, i32 + %lhs_m3 = pto.vlds %ub_lhs_m3[%offset_m3] : !pto.ptr -> !pto.vreg<64xf32> + %rhs_m3 = pto.vlds %ub_rhs_m3[%offset_m3] : !pto.ptr -> !pto.vreg<64xf32> + %sum_m3 = pto.vadd %lhs_m3, %rhs_m3, %mask_m3 : !pto.vreg<64xf32>, !pto.vreg<64xf32>, !pto.mask -> !pto.vreg<64xf32> + pto.vsts %sum_m3, %ub_out_m3[%offset_m3], %mask_m3 : !pto.vreg<64xf32>, !pto.ptr, !pto.mask + scf.yield %next_remaining_m3 : i32 + } + } + + pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] + pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] + pto.mte_ub_gm %ub_out_m3, %arg11, %c128_i64_m3 + nburst(%c32_i64_m3, %c128_i64_m3, %c128_i64_m3) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64 + pto.barrier #pto.pipe + + // merged from 
vadd_i16_signed_kernel via LaunchVadd_i16_signed_kernel + + %c0_m4 = arith.constant 0 : index + %c1024_m4 = arith.constant 1024 : index + %c128_m4 = arith.constant 128 : index + %c0_i64_m4 = arith.constant 0 : i64 + %c1_i64_m4 = arith.constant 1 : i64 + %c32_i64_m4 = arith.constant 32 : i64 + %c64_i64_m4 = arith.constant 64 : i64 + %c2048_i64_m4 = arith.constant 2048 : i64 + %c4096_i64_m4 = arith.constant 4096 : i64 + + %ub_lhs_m4 = pto.castptr %c0_i64_m4 : i64 -> !pto.ptr + %ub_rhs_m4 = pto.castptr %c2048_i64_m4 : i64 -> !pto.ptr + %ub_out_m4 = pto.castptr %c4096_i64_m4 : i64 -> !pto.ptr + + %false_m4 = arith.constant false + pto.mte_gm_ub %arg12, %ub_lhs_m4, %c0_i64_m4, %c64_i64_m4 + nburst(%c32_i64_m4, %c64_i64_m4, %c64_i64_m4) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 + pto.mte_gm_ub %arg13, %ub_rhs_m4, %c0_i64_m4, %c64_i64_m4 + nburst(%c32_i64_m4, %c64_i64_m4, %c64_i64_m4) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 + + pto.set_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] + pto.wait_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] + + pto.vecscope { + %mask_m4 = pto.pset_b16 "PAT_ALL" : !pto.mask + scf.for %offset_m4 = %c0_m4 to %c1024_m4 step %c128_m4 { + %lhs_m4 = pto.vlds %ub_lhs_m4[%offset_m4] : !pto.ptr -> !pto.vreg<128xi16> + %rhs_m4 = pto.vlds %ub_rhs_m4[%offset_m4] : !pto.ptr -> !pto.vreg<128xi16> + %sum_m4 = pto.vadd %lhs_m4, %rhs_m4, %mask_m4 : !pto.vreg<128xi16>, !pto.vreg<128xi16>, !pto.mask -> !pto.vreg<128xi16> + pto.vsts %sum_m4, %ub_out_m4[%offset_m4], %mask_m4 : !pto.vreg<128xi16>, !pto.ptr, !pto.mask + } + } + + pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] + pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] + pto.mte_ub_gm %ub_out_m4, %arg14, %c64_i64_m4 + nburst(%c32_i64_m4, %c64_i64_m4, %c64_i64_m4) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64 + pto.barrier #pto.pipe + + // merged from vadd_i16_signed_overflow_kernel via LaunchVadd_i16_signed_overflow_kernel + + %c0_m5 = arith.constant 0 : index + %c128_m5 = arith.constant 128 : index + 
%c1024_m5 = arith.constant 1024 : index + %c0_i64_m5 = arith.constant 0 : i64 + %c1_i64_m5 = arith.constant 1 : i64 + %c32_i64_m5 = arith.constant 32 : i64 + %c64_i64_m5 = arith.constant 64 : i64 + %c2048_i64_m5 = arith.constant 2048 : i64 + %c4096_i64_m5 = arith.constant 4096 : i64 + %c1024_i32_m5 = arith.constant 1024 : i32 + + %ub_lhs_m5 = pto.castptr %c0_i64_m5 : i64 -> !pto.ptr + %ub_rhs_m5 = pto.castptr %c2048_i64_m5 : i64 -> !pto.ptr + %ub_out_m5 = pto.castptr %c4096_i64_m5 : i64 -> !pto.ptr + + %false_m5 = arith.constant false + pto.mte_gm_ub %arg15, %ub_lhs_m5, %c0_i64_m5, %c64_i64_m5 + nburst(%c32_i64_m5, %c64_i64_m5, %c64_i64_m5) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 + pto.mte_gm_ub %arg16, %ub_rhs_m5, %c0_i64_m5, %c64_i64_m5 + nburst(%c32_i64_m5, %c64_i64_m5, %c64_i64_m5) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 + + pto.set_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] + pto.wait_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] + + pto.vecscope { + %__m5:1 = scf.for %offset_m5 = %c0_m5 to %c1024_m5 step %c128_m5 iter_args(%remaining_m5 = %c1024_i32_m5) -> (i32) { + %mask_m5, %next_remaining_m5 = pto.plt_b16 %remaining_m5 : i32 -> !pto.mask, i32 + %lhs_m5 = pto.vlds %ub_lhs_m5[%offset_m5] : !pto.ptr -> !pto.vreg<128xi16> + %rhs_m5 = pto.vlds %ub_rhs_m5[%offset_m5] : !pto.ptr -> !pto.vreg<128xi16> + %sum_m5 = pto.vadd %lhs_m5, %rhs_m5, %mask_m5 : !pto.vreg<128xi16>, !pto.vreg<128xi16>, !pto.mask -> !pto.vreg<128xi16> + pto.vsts %sum_m5, %ub_out_m5[%offset_m5], %mask_m5 : !pto.vreg<128xi16>, !pto.ptr, !pto.mask + scf.yield %next_remaining_m5 : i32 + } + } + + pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] + pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] + pto.mte_ub_gm %ub_out_m5, %arg17, %c64_i64_m5 + nburst(%c32_i64_m5, %c64_i64_m5, %c64_i64_m5) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64 + pto.barrier #pto.pipe + + // merged from vadd_i16_unsigned_kernel via LaunchVadd_i16_unsigned_kernel + + %c0_m6 = arith.constant 0 : index + %c1024_m6 = 
arith.constant 1024 : index + %c128_m6 = arith.constant 128 : index + %c0_i64_m6 = arith.constant 0 : i64 + %c1_i64_m6 = arith.constant 1 : i64 + %c32_i64_m6 = arith.constant 32 : i64 + %c64_i64_m6 = arith.constant 64 : i64 + %c2048_i64_m6 = arith.constant 2048 : i64 + %c4096_i64_m6 = arith.constant 4096 : i64 + + %ub_lhs_m6 = pto.castptr %c0_i64_m6 : i64 -> !pto.ptr + %ub_rhs_m6 = pto.castptr %c2048_i64_m6 : i64 -> !pto.ptr + %ub_out_m6 = pto.castptr %c4096_i64_m6 : i64 -> !pto.ptr + + %false_m6 = arith.constant false + pto.mte_gm_ub %arg18, %ub_lhs_m6, %c0_i64_m6, %c64_i64_m6 + nburst(%c32_i64_m6, %c64_i64_m6, %c64_i64_m6) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 + pto.mte_gm_ub %arg19, %ub_rhs_m6, %c0_i64_m6, %c64_i64_m6 + nburst(%c32_i64_m6, %c64_i64_m6, %c64_i64_m6) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 + + pto.set_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] + pto.wait_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] + + pto.vecscope { + %mask_m6 = pto.pset_b16 "PAT_ALL" : !pto.mask + scf.for %offset_m6 = %c0_m6 to %c1024_m6 step %c128_m6 { + %lhs_m6 = pto.vlds %ub_lhs_m6[%offset_m6] : !pto.ptr -> !pto.vreg<128xui16> + %rhs_m6 = pto.vlds %ub_rhs_m6[%offset_m6] : !pto.ptr -> !pto.vreg<128xui16> + %sum_m6 = pto.vadd %lhs_m6, %rhs_m6, %mask_m6 : !pto.vreg<128xui16>, !pto.vreg<128xui16>, !pto.mask -> !pto.vreg<128xui16> + pto.vsts %sum_m6, %ub_out_m6[%offset_m6], %mask_m6 : !pto.vreg<128xui16>, !pto.ptr, !pto.mask + } + } + + pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] + pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] + pto.mte_ub_gm %ub_out_m6, %arg20, %c64_i64_m6 + nburst(%c32_i64_m6, %c64_i64_m6, %c64_i64_m6) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64 + pto.barrier #pto.pipe + + // merged from vadd_i16_unsigned_overflow_kernel via LaunchVadd_i16_unsigned_overflow_kernel + + %c0_m7 = arith.constant 0 : index + %c128_m7 = arith.constant 128 : index + %c1024_m7 = arith.constant 1024 : index + %c0_i64_m7 = arith.constant 0 : i64 + %c1_i64_m7 = 
arith.constant 1 : i64 + %c32_i64_m7 = arith.constant 32 : i64 + %c64_i64_m7 = arith.constant 64 : i64 + %c2048_i64_m7 = arith.constant 2048 : i64 + %c4096_i64_m7 = arith.constant 4096 : i64 + %c1024_i32_m7 = arith.constant 1024 : i32 + + %ub_lhs_m7 = pto.castptr %c0_i64_m7 : i64 -> !pto.ptr + %ub_rhs_m7 = pto.castptr %c2048_i64_m7 : i64 -> !pto.ptr + %ub_out_m7 = pto.castptr %c4096_i64_m7 : i64 -> !pto.ptr + + %false_m7 = arith.constant false + pto.mte_gm_ub %arg21, %ub_lhs_m7, %c0_i64_m7, %c64_i64_m7 + nburst(%c32_i64_m7, %c64_i64_m7, %c64_i64_m7) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 + pto.mte_gm_ub %arg22, %ub_rhs_m7, %c0_i64_m7, %c64_i64_m7 + nburst(%c32_i64_m7, %c64_i64_m7, %c64_i64_m7) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 + + pto.set_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] + pto.wait_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] + + pto.vecscope { + %__m7:1 = scf.for %offset_m7 = %c0_m7 to %c1024_m7 step %c128_m7 iter_args(%remaining_m7 = %c1024_i32_m7) -> (i32) { + %mask_m7, %next_remaining_m7 = pto.plt_b16 %remaining_m7 : i32 -> !pto.mask, i32 + %lhs_m7 = pto.vlds %ub_lhs_m7[%offset_m7] : !pto.ptr -> !pto.vreg<128xui16> + %rhs_m7 = pto.vlds %ub_rhs_m7[%offset_m7] : !pto.ptr -> !pto.vreg<128xui16> + %sum_m7 = pto.vadd %lhs_m7, %rhs_m7, %mask_m7 : !pto.vreg<128xui16>, !pto.vreg<128xui16>, !pto.mask -> !pto.vreg<128xui16> + pto.vsts %sum_m7, %ub_out_m7[%offset_m7], %mask_m7 : !pto.vreg<128xui16>, !pto.ptr, !pto.mask + scf.yield %next_remaining_m7 : i32 + } + } + + pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] + pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] + pto.mte_ub_gm %ub_out_m7, %arg23, %c64_i64_m7 + nburst(%c32_i64_m7, %c64_i64_m7, %c64_i64_m7) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64 + pto.barrier #pto.pipe + + // merged from vadd_tail_kernel_2d via LaunchVadd_tail_kernel_2d + + %c0_m8 = arith.constant 0 : index + %c64_m8 = arith.constant 64 : index + %c1024_m8 = arith.constant 1024 : index + %c0_i64_m8 = arith.constant 0 : 
i64 + %c1_i64_m8 = arith.constant 1 : i64 + %c32_i64_m8 = arith.constant 32 : i64 + %c128_i64_m8 = arith.constant 128 : i64 + %c4096_i64_m8 = arith.constant 4096 : i64 + %c8192_i64_m8 = arith.constant 8192 : i64 + %c1000_i32_m8 = arith.constant 1000 : i32 + + %ub_lhs_m8 = pto.castptr %c0_i64_m8 : i64 -> !pto.ptr + %ub_rhs_m8 = pto.castptr %c4096_i64_m8 : i64 -> !pto.ptr + %ub_out_m8 = pto.castptr %c8192_i64_m8 : i64 -> !pto.ptr + + %false_m8 = arith.constant false + pto.mte_gm_ub %arg24, %ub_lhs_m8, %c0_i64_m8, %c128_i64_m8 + nburst(%c32_i64_m8, %c128_i64_m8, %c128_i64_m8) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 + pto.mte_gm_ub %arg25, %ub_rhs_m8, %c0_i64_m8, %c128_i64_m8 + nburst(%c32_i64_m8, %c128_i64_m8, %c128_i64_m8) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 + + pto.set_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] + pto.wait_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] + + pto.vecscope { + %__m8:1 = scf.for %offset_m8 = %c0_m8 to %c1024_m8 step %c64_m8 iter_args(%remaining_m8 = %c1000_i32_m8) -> (i32) { + %mask_m8, %next_remaining_m8 = pto.plt_b32 %remaining_m8 : i32 -> !pto.mask, i32 + %lhs_m8 = pto.vlds %ub_lhs_m8[%offset_m8] : !pto.ptr -> !pto.vreg<64xf32> + %rhs_m8 = pto.vlds %ub_rhs_m8[%offset_m8] : !pto.ptr -> !pto.vreg<64xf32> + %sum_m8 = pto.vadd %lhs_m8, %rhs_m8, %mask_m8 : !pto.vreg<64xf32>, !pto.vreg<64xf32>, !pto.mask -> !pto.vreg<64xf32> + pto.vsts %sum_m8, %ub_out_m8[%offset_m8], %mask_m8 : !pto.vreg<64xf32>, !pto.ptr, !pto.mask + scf.yield %next_remaining_m8 : i32 + } + } + + pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] + pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] + pto.mte_ub_gm %ub_out_m8, %arg26, %c128_i64_m8 + nburst(%c32_i64_m8, %c128_i64_m8, %c128_i64_m8) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64 + pto.barrier #pto.pipe + return } } diff --git a/test/vpto/cases/micro-op/binary-vector/vadd/launch.cpp b/test/vpto/cases/micro-op/binary-vector/vadd/launch.cpp index 7e1cfc9e0..02c7b1703 100644 --- 
a/test/vpto/cases/micro-op/binary-vector/vadd/launch.cpp +++ b/test/vpto/cases/micro-op/binary-vector/vadd/launch.cpp @@ -6,6 +6,8 @@ // INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. // See LICENSE in the root of the software repository for the full text of the License. +// Merged vadd launch wrappers: 9 variants (f32, f16, bf16, f32-exceptional, +// i16-signed, i16-signed-overflow, i16-unsigned, i16-unsigned-overflow, tail) #ifndef __VEC_SCOPE__ #define __VEC_SCOPE__ #endif @@ -35,12 +37,67 @@ struct MrgSortExecutedNumList { #include "acl/acl.h" #endif +// --- f32 --- extern "C" __global__ [aicore] void add_kernel_2d(__gm__ float *v1, __gm__ float *v2, __gm__ float *v3); -void LaunchAdd_kernel_2d(float *v1, float *v2, float *v3, void *stream) { - add_kernel_2d<<<1, nullptr, stream>>>((__gm__ float *)v1, - (__gm__ float *)v2, - (__gm__ float *)v3); +extern "C" __global__ [aicore] void vadd_deep_merged_kernel( + __gm__ float * arg0, + __gm__ float * arg1, + __gm__ float * arg2, + __gm__ half * arg3, + __gm__ half * arg4, + __gm__ half * arg5, + __gm__ bfloat16_t * arg6, + __gm__ bfloat16_t * arg7, + __gm__ bfloat16_t * arg8, + __gm__ float * arg9, + __gm__ float * arg10, + __gm__ float * arg11, + __gm__ int16_t * arg12, + __gm__ int16_t * arg13, + __gm__ int16_t * arg14, + __gm__ int16_t * arg15, + __gm__ int16_t * arg16, + __gm__ int16_t * arg17, + __gm__ uint16_t * arg18, + __gm__ uint16_t * arg19, + __gm__ uint16_t * arg20, + __gm__ uint16_t * arg21, + __gm__ uint16_t * arg22, + __gm__ uint16_t * arg23, + __gm__ float * arg24, + __gm__ float * arg25, + __gm__ float * arg26); + +void LaunchVaddDeepMerged(float * p0, float * p1, float * p2, uint16_t * p3, uint16_t * p4, uint16_t * p5, uint16_t * p6, uint16_t * p7, uint16_t * p8, float * p9, float * p10, float * p11, int16_t * p12, int16_t * p13, int16_t * p14, int16_t * p15, int16_t * p16, int16_t * p17, uint16_t * p18, uint16_t * p19, uint16_t * p20, uint16_t 
* p21, uint16_t * p22, uint16_t * p23, float * p24, float * p25, float * p26, void *stream) { + vadd_deep_merged_kernel<<<1, nullptr, stream>>>( + (__gm__ float *)p0, + (__gm__ float *)p1, + (__gm__ float *)p2, + (__gm__ half *)p3, + (__gm__ half *)p4, + (__gm__ half *)p5, + (__gm__ bfloat16_t *)p6, + (__gm__ bfloat16_t *)p7, + (__gm__ bfloat16_t *)p8, + (__gm__ float *)p9, + (__gm__ float *)p10, + (__gm__ float *)p11, + (__gm__ int16_t *)p12, + (__gm__ int16_t *)p13, + (__gm__ int16_t *)p14, + (__gm__ int16_t *)p15, + (__gm__ int16_t *)p16, + (__gm__ int16_t *)p17, + (__gm__ uint16_t *)p18, + (__gm__ uint16_t *)p19, + (__gm__ uint16_t *)p20, + (__gm__ uint16_t *)p21, + (__gm__ uint16_t *)p22, + (__gm__ uint16_t *)p23, + (__gm__ float *)p24, + (__gm__ float *)p25, + (__gm__ float *)p26); } diff --git a/test/vpto/cases/micro-op/binary-vector/vadd/main.cpp b/test/vpto/cases/micro-op/binary-vector/vadd/main.cpp index 78517d1a3..6571fa75b 100644 --- a/test/vpto/cases/micro-op/binary-vector/vadd/main.cpp +++ b/test/vpto/cases/micro-op/binary-vector/vadd/main.cpp @@ -6,14 +6,14 @@ // INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. // See LICENSE in the root of the software repository for the full text of the License. -/** -Copyright (c) 2025 Huawei Technologies Co., Ltd. -*/ +// Merged vadd host runner: 9 variants run sequentially. +// Each variant uses uniquely suffixed .bin files to avoid collisions. 
#include "test_common.h" #include "acl/acl.h" #include #include +#include using namespace PtoTestCommon; @@ -28,27 +28,73 @@ using namespace PtoTestCommon; } \ } while (0) -void LaunchAdd_kernel_2d(float *v1, float *v2, float *v3, void *stream); +#define FILE_CHECK(expr, path) \ + do { \ + if (!(expr)) { \ + std::fprintf(stderr, "[ERROR] file operation failed: %s (%s:%d)\n", \ + path, __FILE__, __LINE__); \ + rc = 1; \ + goto cleanup; \ + } \ + } while (0) + +// Launch wrappers +void LaunchVaddDeepMerged(float * p0, float * p1, float * p2, uint16_t * p3, uint16_t * p4, uint16_t * p5, uint16_t * p6, uint16_t * p7, uint16_t * p8, float * p9, float * p10, float * p11, int16_t * p12, int16_t * p13, int16_t * p14, int16_t * p15, int16_t * p16, int16_t * p17, uint16_t * p18, uint16_t * p19, uint16_t * p20, uint16_t * p21, uint16_t * p22, uint16_t * p23, float * p24, float * p25, float * p26, void *stream); int main() { - size_t elemCount_v1 = 1024; - size_t fileSize_v1 = elemCount_v1 * sizeof(float); - size_t elemCount_v2 = 1024; - size_t fileSize_v2 = elemCount_v2 * sizeof(float); - size_t elemCount_v3 = 1024; - size_t fileSize_v3 = elemCount_v3 * sizeof(float); - float *v1Host = nullptr; - float *v1Device = nullptr; - float *v2Host = nullptr; - float *v2Device = nullptr; - float *v3Host = nullptr; - float *v3Device = nullptr; + // ----- sizes (element counts * sizeof per variant) ----- + constexpr size_t ELEM_F32 = 1024; + constexpr size_t ELEM_F16 = 1024; + constexpr size_t ELEM_BF16 = 1024; + constexpr size_t ELEM_X = 1024; + constexpr size_t ELEM_I16S = 1024; + constexpr size_t ELEM_I16U = 1024; + constexpr size_t ELEM_TAIL = 1024; + + constexpr size_t SZ_F32 = ELEM_F32 * sizeof(float); + constexpr size_t SZ_F16 = ELEM_F16 * sizeof(uint16_t); + constexpr size_t SZ_BF16 = ELEM_BF16 * sizeof(uint16_t); + constexpr size_t SZ_X = ELEM_X * sizeof(float); + constexpr size_t SZ_I16S = ELEM_I16S * sizeof(int16_t); + constexpr size_t SZ_I16U = ELEM_I16U * sizeof(uint16_t); + 
constexpr size_t SZ_TAIL = ELEM_TAIL * sizeof(float); + + // ----- host/device pointers (3 buffers per variant) ----- + float *h_f32_v1 = nullptr, *h_f32_v2 = nullptr, *h_f32_v3 = nullptr; + float *d_f32_v1 = nullptr, *d_f32_v2 = nullptr, *d_f32_v3 = nullptr; + + uint16_t *h_f16_v1 = nullptr, *h_f16_v2 = nullptr, *h_f16_v3 = nullptr; + uint16_t *d_f16_v1 = nullptr, *d_f16_v2 = nullptr, *d_f16_v3 = nullptr; + + uint16_t *h_bf16_v1 = nullptr, *h_bf16_v2 = nullptr, *h_bf16_v3 = nullptr; + uint16_t *d_bf16_v1 = nullptr, *d_bf16_v2 = nullptr, *d_bf16_v3 = nullptr; + + float *h_x_v1 = nullptr, *h_x_v2 = nullptr, *h_x_v3 = nullptr; + float *d_x_v1 = nullptr, *d_x_v2 = nullptr, *d_x_v3 = nullptr; + + int16_t *h_i16s_v1 = nullptr, *h_i16s_v2 = nullptr, *h_i16s_v3 = nullptr; + int16_t *d_i16s_v1 = nullptr, *d_i16s_v2 = nullptr, *d_i16s_v3 = nullptr; + + int16_t *h_i16so_v1 = nullptr, *h_i16so_v2 = nullptr, *h_i16so_v3 = nullptr; + int16_t *d_i16so_v1 = nullptr, *d_i16so_v2 = nullptr, *d_i16so_v3 = nullptr; + + uint16_t *h_i16u_v1 = nullptr, *h_i16u_v2 = nullptr, *h_i16u_v3 = nullptr; + uint16_t *d_i16u_v1 = nullptr, *d_i16u_v2 = nullptr, *d_i16u_v3 = nullptr; + + uint16_t *h_i16uo_v1 = nullptr, *h_i16uo_v2 = nullptr, *h_i16uo_v3 = nullptr; + uint16_t *d_i16uo_v1 = nullptr, *d_i16uo_v2 = nullptr, *d_i16uo_v3 = nullptr; + + float *h_tail_v1 = nullptr, *h_tail_v2 = nullptr, *h_tail_v3 = nullptr; + float *d_tail_v1 = nullptr, *d_tail_v2 = nullptr, *d_tail_v3 = nullptr; + int rc = 0; bool aclInited = false; bool deviceSet = false; int deviceId = 0; aclrtStream stream = nullptr; + size_t fsize = 0; + // ----- init ACL ----- ACL_CHECK(aclInit(nullptr)); aclInited = true; if (const char *envDevice = std::getenv("ACL_DEVICE_ID")) @@ -56,34 +102,205 @@ int main() { ACL_CHECK(aclrtSetDevice(deviceId)); deviceSet = true; ACL_CHECK(aclrtCreateStream(&stream)); - ACL_CHECK(aclrtMallocHost((void **)(&v1Host), fileSize_v1)); - ACL_CHECK(aclrtMallocHost((void **)(&v2Host), fileSize_v2)); - 
ACL_CHECK(aclrtMallocHost((void **)(&v3Host), fileSize_v3)); - ACL_CHECK(aclrtMalloc((void **)&v1Device, fileSize_v1, ACL_MEM_MALLOC_HUGE_FIRST)); - ACL_CHECK(aclrtMalloc((void **)&v2Device, fileSize_v2, ACL_MEM_MALLOC_HUGE_FIRST)); - ACL_CHECK(aclrtMalloc((void **)&v3Device, fileSize_v3, ACL_MEM_MALLOC_HUGE_FIRST)); - ReadFile("./v1.bin", fileSize_v1, v1Host, fileSize_v1); - ReadFile("./v2.bin", fileSize_v2, v2Host, fileSize_v2); - ReadFile("./v3.bin", fileSize_v3, v3Host, fileSize_v3); - ACL_CHECK(aclrtMemcpy(v1Device, fileSize_v1, v1Host, fileSize_v1, - ACL_MEMCPY_HOST_TO_DEVICE)); - ACL_CHECK(aclrtMemcpy(v2Device, fileSize_v2, v2Host, fileSize_v2, - ACL_MEMCPY_HOST_TO_DEVICE)); - ACL_CHECK(aclrtMemcpy(v3Device, fileSize_v3, v3Host, fileSize_v3, - ACL_MEMCPY_HOST_TO_DEVICE)); - LaunchAdd_kernel_2d(v1Device, v2Device, v3Device, stream); + + // ----- host malloc (all variants) ----- + ACL_CHECK(aclrtMallocHost((void **)&h_f32_v1, SZ_F32)); + ACL_CHECK(aclrtMallocHost((void **)&h_f32_v2, SZ_F32)); + ACL_CHECK(aclrtMallocHost((void **)&h_f32_v3, SZ_F32)); + ACL_CHECK(aclrtMallocHost((void **)&h_f16_v1, SZ_F16)); + ACL_CHECK(aclrtMallocHost((void **)&h_f16_v2, SZ_F16)); + ACL_CHECK(aclrtMallocHost((void **)&h_f16_v3, SZ_F16)); + ACL_CHECK(aclrtMallocHost((void **)&h_bf16_v1, SZ_BF16)); + ACL_CHECK(aclrtMallocHost((void **)&h_bf16_v2, SZ_BF16)); + ACL_CHECK(aclrtMallocHost((void **)&h_bf16_v3, SZ_BF16)); + ACL_CHECK(aclrtMallocHost((void **)&h_x_v1, SZ_X)); + ACL_CHECK(aclrtMallocHost((void **)&h_x_v2, SZ_X)); + ACL_CHECK(aclrtMallocHost((void **)&h_x_v3, SZ_X)); + ACL_CHECK(aclrtMallocHost((void **)&h_i16s_v1, SZ_I16S)); + ACL_CHECK(aclrtMallocHost((void **)&h_i16s_v2, SZ_I16S)); + ACL_CHECK(aclrtMallocHost((void **)&h_i16s_v3, SZ_I16S)); + ACL_CHECK(aclrtMallocHost((void **)&h_i16so_v1,SZ_I16S)); + ACL_CHECK(aclrtMallocHost((void **)&h_i16so_v2,SZ_I16S)); + ACL_CHECK(aclrtMallocHost((void **)&h_i16so_v3,SZ_I16S)); + ACL_CHECK(aclrtMallocHost((void **)&h_i16u_v1, 
SZ_I16U)); + ACL_CHECK(aclrtMallocHost((void **)&h_i16u_v2, SZ_I16U)); + ACL_CHECK(aclrtMallocHost((void **)&h_i16u_v3, SZ_I16U)); + ACL_CHECK(aclrtMallocHost((void **)&h_i16uo_v1,SZ_I16U)); + ACL_CHECK(aclrtMallocHost((void **)&h_i16uo_v2,SZ_I16U)); + ACL_CHECK(aclrtMallocHost((void **)&h_i16uo_v3,SZ_I16U)); + ACL_CHECK(aclrtMallocHost((void **)&h_tail_v1, SZ_TAIL)); + ACL_CHECK(aclrtMallocHost((void **)&h_tail_v2, SZ_TAIL)); + ACL_CHECK(aclrtMallocHost((void **)&h_tail_v3, SZ_TAIL)); + + // ----- device malloc (all variants) ----- + ACL_CHECK(aclrtMalloc((void **)&d_f32_v1, SZ_F32, ACL_MEM_MALLOC_HUGE_FIRST)); + ACL_CHECK(aclrtMalloc((void **)&d_f32_v2, SZ_F32, ACL_MEM_MALLOC_HUGE_FIRST)); + ACL_CHECK(aclrtMalloc((void **)&d_f32_v3, SZ_F32, ACL_MEM_MALLOC_HUGE_FIRST)); + ACL_CHECK(aclrtMalloc((void **)&d_f16_v1, SZ_F16, ACL_MEM_MALLOC_HUGE_FIRST)); + ACL_CHECK(aclrtMalloc((void **)&d_f16_v2, SZ_F16, ACL_MEM_MALLOC_HUGE_FIRST)); + ACL_CHECK(aclrtMalloc((void **)&d_f16_v3, SZ_F16, ACL_MEM_MALLOC_HUGE_FIRST)); + ACL_CHECK(aclrtMalloc((void **)&d_bf16_v1, SZ_BF16, ACL_MEM_MALLOC_HUGE_FIRST)); + ACL_CHECK(aclrtMalloc((void **)&d_bf16_v2, SZ_BF16, ACL_MEM_MALLOC_HUGE_FIRST)); + ACL_CHECK(aclrtMalloc((void **)&d_bf16_v3, SZ_BF16, ACL_MEM_MALLOC_HUGE_FIRST)); + ACL_CHECK(aclrtMalloc((void **)&d_x_v1, SZ_X, ACL_MEM_MALLOC_HUGE_FIRST)); + ACL_CHECK(aclrtMalloc((void **)&d_x_v2, SZ_X, ACL_MEM_MALLOC_HUGE_FIRST)); + ACL_CHECK(aclrtMalloc((void **)&d_x_v3, SZ_X, ACL_MEM_MALLOC_HUGE_FIRST)); + ACL_CHECK(aclrtMalloc((void **)&d_i16s_v1, SZ_I16S, ACL_MEM_MALLOC_HUGE_FIRST)); + ACL_CHECK(aclrtMalloc((void **)&d_i16s_v2, SZ_I16S, ACL_MEM_MALLOC_HUGE_FIRST)); + ACL_CHECK(aclrtMalloc((void **)&d_i16s_v3, SZ_I16S, ACL_MEM_MALLOC_HUGE_FIRST)); + ACL_CHECK(aclrtMalloc((void **)&d_i16so_v1,SZ_I16S, ACL_MEM_MALLOC_HUGE_FIRST)); + ACL_CHECK(aclrtMalloc((void **)&d_i16so_v2,SZ_I16S, ACL_MEM_MALLOC_HUGE_FIRST)); + ACL_CHECK(aclrtMalloc((void **)&d_i16so_v3,SZ_I16S, 
ACL_MEM_MALLOC_HUGE_FIRST)); + ACL_CHECK(aclrtMalloc((void **)&d_i16u_v1, SZ_I16U, ACL_MEM_MALLOC_HUGE_FIRST)); + ACL_CHECK(aclrtMalloc((void **)&d_i16u_v2, SZ_I16U, ACL_MEM_MALLOC_HUGE_FIRST)); + ACL_CHECK(aclrtMalloc((void **)&d_i16u_v3, SZ_I16U, ACL_MEM_MALLOC_HUGE_FIRST)); + ACL_CHECK(aclrtMalloc((void **)&d_i16uo_v1,SZ_I16U, ACL_MEM_MALLOC_HUGE_FIRST)); + ACL_CHECK(aclrtMalloc((void **)&d_i16uo_v2,SZ_I16U, ACL_MEM_MALLOC_HUGE_FIRST)); + ACL_CHECK(aclrtMalloc((void **)&d_i16uo_v3,SZ_I16U, ACL_MEM_MALLOC_HUGE_FIRST)); + ACL_CHECK(aclrtMalloc((void **)&d_tail_v1, SZ_TAIL, ACL_MEM_MALLOC_HUGE_FIRST)); + ACL_CHECK(aclrtMalloc((void **)&d_tail_v2, SZ_TAIL, ACL_MEM_MALLOC_HUGE_FIRST)); + ACL_CHECK(aclrtMalloc((void **)&d_tail_v3, SZ_TAIL, ACL_MEM_MALLOC_HUGE_FIRST)); + + // ----- read inputs (all variants) ----- + fsize = SZ_F32; FILE_CHECK(ReadFile("./v1.bin", fsize, h_f32_v1, SZ_F32) && fsize == SZ_F32, "./v1.bin"); + fsize = SZ_F32; FILE_CHECK(ReadFile("./v2.bin", fsize, h_f32_v2, SZ_F32) && fsize == SZ_F32, "./v2.bin"); + fsize = SZ_F32; FILE_CHECK(ReadFile("./v3.bin", fsize, h_f32_v3, SZ_F32) && fsize == SZ_F32, "./v3.bin"); + + fsize = SZ_F16; FILE_CHECK(ReadFile("./v1_f16.bin", fsize, h_f16_v1, SZ_F16) && fsize == SZ_F16, "./v1_f16.bin"); + fsize = SZ_F16; FILE_CHECK(ReadFile("./v2_f16.bin", fsize, h_f16_v2, SZ_F16) && fsize == SZ_F16, "./v2_f16.bin"); + fsize = SZ_F16; FILE_CHECK(ReadFile("./v3_f16.bin", fsize, h_f16_v3, SZ_F16) && fsize == SZ_F16, "./v3_f16.bin"); + + fsize = SZ_BF16; FILE_CHECK(ReadFile("./v1_bf16.bin", fsize, h_bf16_v1, SZ_BF16) && fsize == SZ_BF16, "./v1_bf16.bin"); + fsize = SZ_BF16; FILE_CHECK(ReadFile("./v2_bf16.bin", fsize, h_bf16_v2, SZ_BF16) && fsize == SZ_BF16, "./v2_bf16.bin"); + fsize = SZ_BF16; FILE_CHECK(ReadFile("./v3_bf16.bin", fsize, h_bf16_v3, SZ_BF16) && fsize == SZ_BF16, "./v3_bf16.bin"); + + fsize = SZ_X; FILE_CHECK(ReadFile("./v1_x.bin", fsize, h_x_v1, SZ_X) && fsize == SZ_X, "./v1_x.bin"); + fsize = SZ_X; 
FILE_CHECK(ReadFile("./v2_x.bin", fsize, h_x_v2, SZ_X) && fsize == SZ_X, "./v2_x.bin"); + fsize = SZ_X; FILE_CHECK(ReadFile("./v3_x.bin", fsize, h_x_v3, SZ_X) && fsize == SZ_X, "./v3_x.bin"); + + fsize = SZ_I16S; FILE_CHECK(ReadFile("./v1_i16s.bin", fsize, h_i16s_v1, SZ_I16S) && fsize == SZ_I16S, "./v1_i16s.bin"); + fsize = SZ_I16S; FILE_CHECK(ReadFile("./v2_i16s.bin", fsize, h_i16s_v2, SZ_I16S) && fsize == SZ_I16S, "./v2_i16s.bin"); + fsize = SZ_I16S; FILE_CHECK(ReadFile("./v3_i16s.bin", fsize, h_i16s_v3, SZ_I16S) && fsize == SZ_I16S, "./v3_i16s.bin"); + + fsize = SZ_I16S; FILE_CHECK(ReadFile("./v1_i16s_ov.bin", fsize, h_i16so_v1, SZ_I16S) && fsize == SZ_I16S, "./v1_i16s_ov.bin"); + fsize = SZ_I16S; FILE_CHECK(ReadFile("./v2_i16s_ov.bin", fsize, h_i16so_v2, SZ_I16S) && fsize == SZ_I16S, "./v2_i16s_ov.bin"); + fsize = SZ_I16S; FILE_CHECK(ReadFile("./v3_i16s_ov.bin", fsize, h_i16so_v3, SZ_I16S) && fsize == SZ_I16S, "./v3_i16s_ov.bin"); + + fsize = SZ_I16U; FILE_CHECK(ReadFile("./v1_i16u.bin", fsize, h_i16u_v1, SZ_I16U) && fsize == SZ_I16U, "./v1_i16u.bin"); + fsize = SZ_I16U; FILE_CHECK(ReadFile("./v2_i16u.bin", fsize, h_i16u_v2, SZ_I16U) && fsize == SZ_I16U, "./v2_i16u.bin"); + fsize = SZ_I16U; FILE_CHECK(ReadFile("./v3_i16u.bin", fsize, h_i16u_v3, SZ_I16U) && fsize == SZ_I16U, "./v3_i16u.bin"); + + fsize = SZ_I16U; FILE_CHECK(ReadFile("./v1_i16u_ov.bin", fsize, h_i16uo_v1, SZ_I16U) && fsize == SZ_I16U, "./v1_i16u_ov.bin"); + fsize = SZ_I16U; FILE_CHECK(ReadFile("./v2_i16u_ov.bin", fsize, h_i16uo_v2, SZ_I16U) && fsize == SZ_I16U, "./v2_i16u_ov.bin"); + fsize = SZ_I16U; FILE_CHECK(ReadFile("./v3_i16u_ov.bin", fsize, h_i16uo_v3, SZ_I16U) && fsize == SZ_I16U, "./v3_i16u_ov.bin"); + + fsize = SZ_TAIL; FILE_CHECK(ReadFile("./v1_tail.bin", fsize, h_tail_v1, SZ_TAIL) && fsize == SZ_TAIL, "./v1_tail.bin"); + fsize = SZ_TAIL; FILE_CHECK(ReadFile("./v2_tail.bin", fsize, h_tail_v2, SZ_TAIL) && fsize == SZ_TAIL, "./v2_tail.bin"); + fsize = SZ_TAIL; 
FILE_CHECK(ReadFile("./v3_tail.bin", fsize, h_tail_v3, SZ_TAIL) && fsize == SZ_TAIL, "./v3_tail.bin"); + + // ----- H2D copies (all variants) ----- + ACL_CHECK(aclrtMemcpy(d_f32_v1, SZ_F32, h_f32_v1, SZ_F32, ACL_MEMCPY_HOST_TO_DEVICE)); + ACL_CHECK(aclrtMemcpy(d_f32_v2, SZ_F32, h_f32_v2, SZ_F32, ACL_MEMCPY_HOST_TO_DEVICE)); + ACL_CHECK(aclrtMemcpy(d_f32_v3, SZ_F32, h_f32_v3, SZ_F32, ACL_MEMCPY_HOST_TO_DEVICE)); + ACL_CHECK(aclrtMemcpy(d_f16_v1, SZ_F16, h_f16_v1, SZ_F16, ACL_MEMCPY_HOST_TO_DEVICE)); + ACL_CHECK(aclrtMemcpy(d_f16_v2, SZ_F16, h_f16_v2, SZ_F16, ACL_MEMCPY_HOST_TO_DEVICE)); + ACL_CHECK(aclrtMemcpy(d_f16_v3, SZ_F16, h_f16_v3, SZ_F16, ACL_MEMCPY_HOST_TO_DEVICE)); + ACL_CHECK(aclrtMemcpy(d_bf16_v1, SZ_BF16, h_bf16_v1, SZ_BF16, ACL_MEMCPY_HOST_TO_DEVICE)); + ACL_CHECK(aclrtMemcpy(d_bf16_v2, SZ_BF16, h_bf16_v2, SZ_BF16, ACL_MEMCPY_HOST_TO_DEVICE)); + ACL_CHECK(aclrtMemcpy(d_bf16_v3, SZ_BF16, h_bf16_v3, SZ_BF16, ACL_MEMCPY_HOST_TO_DEVICE)); + ACL_CHECK(aclrtMemcpy(d_x_v1, SZ_X, h_x_v1, SZ_X, ACL_MEMCPY_HOST_TO_DEVICE)); + ACL_CHECK(aclrtMemcpy(d_x_v2, SZ_X, h_x_v2, SZ_X, ACL_MEMCPY_HOST_TO_DEVICE)); + ACL_CHECK(aclrtMemcpy(d_x_v3, SZ_X, h_x_v3, SZ_X, ACL_MEMCPY_HOST_TO_DEVICE)); + ACL_CHECK(aclrtMemcpy(d_i16s_v1, SZ_I16S, h_i16s_v1, SZ_I16S, ACL_MEMCPY_HOST_TO_DEVICE)); + ACL_CHECK(aclrtMemcpy(d_i16s_v2, SZ_I16S, h_i16s_v2, SZ_I16S, ACL_MEMCPY_HOST_TO_DEVICE)); + ACL_CHECK(aclrtMemcpy(d_i16s_v3, SZ_I16S, h_i16s_v3, SZ_I16S, ACL_MEMCPY_HOST_TO_DEVICE)); + ACL_CHECK(aclrtMemcpy(d_i16so_v1,SZ_I16S, h_i16so_v1,SZ_I16S, ACL_MEMCPY_HOST_TO_DEVICE)); + ACL_CHECK(aclrtMemcpy(d_i16so_v2,SZ_I16S, h_i16so_v2,SZ_I16S, ACL_MEMCPY_HOST_TO_DEVICE)); + ACL_CHECK(aclrtMemcpy(d_i16so_v3,SZ_I16S, h_i16so_v3,SZ_I16S, ACL_MEMCPY_HOST_TO_DEVICE)); + ACL_CHECK(aclrtMemcpy(d_i16u_v1, SZ_I16U, h_i16u_v1, SZ_I16U, ACL_MEMCPY_HOST_TO_DEVICE)); + ACL_CHECK(aclrtMemcpy(d_i16u_v2, SZ_I16U, h_i16u_v2, SZ_I16U, ACL_MEMCPY_HOST_TO_DEVICE)); + ACL_CHECK(aclrtMemcpy(d_i16u_v3, SZ_I16U, 
h_i16u_v3, SZ_I16U, ACL_MEMCPY_HOST_TO_DEVICE)); + ACL_CHECK(aclrtMemcpy(d_i16uo_v1,SZ_I16U, h_i16uo_v1,SZ_I16U, ACL_MEMCPY_HOST_TO_DEVICE)); + ACL_CHECK(aclrtMemcpy(d_i16uo_v2,SZ_I16U, h_i16uo_v2,SZ_I16U, ACL_MEMCPY_HOST_TO_DEVICE)); + ACL_CHECK(aclrtMemcpy(d_i16uo_v3,SZ_I16U, h_i16uo_v3,SZ_I16U, ACL_MEMCPY_HOST_TO_DEVICE)); + ACL_CHECK(aclrtMemcpy(d_tail_v1, SZ_TAIL, h_tail_v1, SZ_TAIL, ACL_MEMCPY_HOST_TO_DEVICE)); + ACL_CHECK(aclrtMemcpy(d_tail_v2, SZ_TAIL, h_tail_v2, SZ_TAIL, ACL_MEMCPY_HOST_TO_DEVICE)); + ACL_CHECK(aclrtMemcpy(d_tail_v3, SZ_TAIL, h_tail_v3, SZ_TAIL, ACL_MEMCPY_HOST_TO_DEVICE)); + + // ----- launch all 9 kernels ----- + LaunchVaddDeepMerged( + d_f32_v1, + d_f32_v2, + d_f32_v3, + d_f16_v1, + d_f16_v2, + d_f16_v3, + d_bf16_v1, + d_bf16_v2, + d_bf16_v3, + d_x_v1, + d_x_v2, + d_x_v3, + d_i16s_v1, + d_i16s_v2, + d_i16s_v3, + d_i16so_v1, + d_i16so_v2, + d_i16so_v3, + d_i16u_v1, + d_i16u_v2, + d_i16u_v3, + d_i16uo_v1, + d_i16uo_v2, + d_i16uo_v3, + d_tail_v1, + d_tail_v2, + d_tail_v3, + stream + ); ACL_CHECK(aclrtSynchronizeStream(stream)); - ACL_CHECK(aclrtMemcpy(v3Host, fileSize_v3, v3Device, fileSize_v3, - ACL_MEMCPY_DEVICE_TO_HOST)); - WriteFile("./v3.bin", v3Host, fileSize_v3); + + // ----- D2H copies (outputs) ----- + ACL_CHECK(aclrtMemcpy(h_f32_v3, SZ_F32, d_f32_v3, SZ_F32, ACL_MEMCPY_DEVICE_TO_HOST)); + ACL_CHECK(aclrtMemcpy(h_f16_v3, SZ_F16, d_f16_v3, SZ_F16, ACL_MEMCPY_DEVICE_TO_HOST)); + ACL_CHECK(aclrtMemcpy(h_bf16_v3, SZ_BF16, d_bf16_v3, SZ_BF16, ACL_MEMCPY_DEVICE_TO_HOST)); + ACL_CHECK(aclrtMemcpy(h_x_v3, SZ_X, d_x_v3, SZ_X, ACL_MEMCPY_DEVICE_TO_HOST)); + ACL_CHECK(aclrtMemcpy(h_i16s_v3, SZ_I16S, d_i16s_v3, SZ_I16S, ACL_MEMCPY_DEVICE_TO_HOST)); + ACL_CHECK(aclrtMemcpy(h_i16so_v3,SZ_I16S, d_i16so_v3,SZ_I16S, ACL_MEMCPY_DEVICE_TO_HOST)); + ACL_CHECK(aclrtMemcpy(h_i16u_v3, SZ_I16U, d_i16u_v3, SZ_I16U, ACL_MEMCPY_DEVICE_TO_HOST)); + ACL_CHECK(aclrtMemcpy(h_i16uo_v3,SZ_I16U, d_i16uo_v3,SZ_I16U, ACL_MEMCPY_DEVICE_TO_HOST)); + 
ACL_CHECK(aclrtMemcpy(h_tail_v3, SZ_TAIL, d_tail_v3, SZ_TAIL, ACL_MEMCPY_DEVICE_TO_HOST)); + + // ----- write outputs ----- + FILE_CHECK(WriteFile("./v3.bin", h_f32_v3, SZ_F32), "./v3.bin"); + FILE_CHECK(WriteFile("./v3_f16.bin", h_f16_v3, SZ_F16), "./v3_f16.bin"); + FILE_CHECK(WriteFile("./v3_bf16.bin", h_bf16_v3, SZ_BF16), "./v3_bf16.bin"); + FILE_CHECK(WriteFile("./v3_x.bin", h_x_v3, SZ_X), "./v3_x.bin"); + FILE_CHECK(WriteFile("./v3_i16s.bin", h_i16s_v3, SZ_I16S), "./v3_i16s.bin"); + FILE_CHECK(WriteFile("./v3_i16s_ov.bin",h_i16so_v3,SZ_I16S),"./v3_i16s_ov.bin"); + FILE_CHECK(WriteFile("./v3_i16u.bin", h_i16u_v3, SZ_I16U), "./v3_i16u.bin"); + FILE_CHECK(WriteFile("./v3_i16u_ov.bin",h_i16uo_v3,SZ_I16U),"./v3_i16u_ov.bin"); + FILE_CHECK(WriteFile("./v3_tail.bin", h_tail_v3, SZ_TAIL), "./v3_tail.bin"); cleanup: - aclrtFree(v1Device); - aclrtFree(v2Device); - aclrtFree(v3Device); - aclrtFreeHost(v1Host); - aclrtFreeHost(v2Host); - aclrtFreeHost(v3Host); + aclrtFree(d_f32_v1); aclrtFree(d_f32_v2); aclrtFree(d_f32_v3); + aclrtFree(d_f16_v1); aclrtFree(d_f16_v2); aclrtFree(d_f16_v3); + aclrtFree(d_bf16_v1); aclrtFree(d_bf16_v2); aclrtFree(d_bf16_v3); + aclrtFree(d_x_v1); aclrtFree(d_x_v2); aclrtFree(d_x_v3); + aclrtFree(d_i16s_v1); aclrtFree(d_i16s_v2); aclrtFree(d_i16s_v3); + aclrtFree(d_i16so_v1);aclrtFree(d_i16so_v2);aclrtFree(d_i16so_v3); + aclrtFree(d_i16u_v1); aclrtFree(d_i16u_v2); aclrtFree(d_i16u_v3); + aclrtFree(d_i16uo_v1);aclrtFree(d_i16uo_v2);aclrtFree(d_i16uo_v3); + aclrtFree(d_tail_v1); aclrtFree(d_tail_v2); aclrtFree(d_tail_v3); + aclrtFreeHost(h_f32_v1); aclrtFreeHost(h_f32_v2); aclrtFreeHost(h_f32_v3); + aclrtFreeHost(h_f16_v1); aclrtFreeHost(h_f16_v2); aclrtFreeHost(h_f16_v3); + aclrtFreeHost(h_bf16_v1); aclrtFreeHost(h_bf16_v2); aclrtFreeHost(h_bf16_v3); + aclrtFreeHost(h_x_v1); aclrtFreeHost(h_x_v2); aclrtFreeHost(h_x_v3); + aclrtFreeHost(h_i16s_v1); aclrtFreeHost(h_i16s_v2); aclrtFreeHost(h_i16s_v3); + 
aclrtFreeHost(h_i16so_v1);aclrtFreeHost(h_i16so_v2);aclrtFreeHost(h_i16so_v3); + aclrtFreeHost(h_i16u_v1); aclrtFreeHost(h_i16u_v2); aclrtFreeHost(h_i16u_v3); + aclrtFreeHost(h_i16uo_v1);aclrtFreeHost(h_i16uo_v2);aclrtFreeHost(h_i16uo_v3); + aclrtFreeHost(h_tail_v1); aclrtFreeHost(h_tail_v2); aclrtFreeHost(h_tail_v3); if (stream != nullptr) aclrtDestroyStream(stream); if (deviceSet) diff --git a/test/vpto/cases/micro-op/binary-vector/vaddc-carry-boundary/compare.py b/test/vpto/cases/micro-op/binary-vector/vaddc-carry-boundary/compare.py deleted file mode 100755 index df15d65e4..000000000 --- a/test/vpto/cases/micro-op/binary-vector/vaddc-carry-boundary/compare.py +++ /dev/null @@ -1,64 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) 2026 Huawei Technologies Co., Ltd. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. 
- -# case: micro-op/binary-vector/vaddc-carry-boundary -# family: binary-vector -# target_ops: pto.vaddc -# scenarios: core-u32-unsigned, full-mask, carry-chain - -import os -import sys -import numpy as np - - -REPEAT_BYTES = 256 -LOGICAL_ELEMS = 64 -SRC_ELEM_BYTES = 4 - - -def _ceil_div(x, y): - return (x + y - 1) // y - - -def _packed_pred_storage_bytes(logical_elems, src_elem_bytes): - repeat_elems = REPEAT_BYTES // src_elem_bytes - repeat_times = _ceil_div(logical_elems, repeat_elems) + 1 - loop_count = repeat_times // 2 - return loop_count * 16 - - -def compare_result(): - golden = np.fromfile("golden_v3.bin", dtype=np.uint32, count=64) - output = np.fromfile("v3.bin", dtype=np.uint32, count=64) - return golden.shape == output.shape and np.array_equal(golden, output) - - -def compare_carry(): - prefix_bytes = _packed_pred_storage_bytes(LOGICAL_ELEMS, SRC_ELEM_BYTES) - golden = np.fromfile("golden_v4.bin", dtype=np.uint8) - output = np.fromfile("v4.bin", dtype=np.uint8) - if golden.size < prefix_bytes or output.size < prefix_bytes: - return False - return np.array_equal(golden[:prefix_bytes], output[:prefix_bytes]) - - -def main(): - strict = os.getenv("COMPARE_STRICT", "1") != "0" - ok = compare_result() and compare_carry() - if not ok: - if strict: - print("[ERROR] compare failed") - sys.exit(2) - print("[WARN] compare failed (non-gating)") - return - print("[INFO] compare passed") - - -if __name__ == "__main__": - main() diff --git a/test/vpto/cases/micro-op/binary-vector/vaddc-carry-boundary/golden.py b/test/vpto/cases/micro-op/binary-vector/vaddc-carry-boundary/golden.py deleted file mode 100644 index 253c44d2c..000000000 --- a/test/vpto/cases/micro-op/binary-vector/vaddc-carry-boundary/golden.py +++ /dev/null @@ -1,69 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) 2026 Huawei Technologies Co., Ltd. 
-# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. - -# case: micro-op/binary-vector/vaddc-carry-boundary -# family: binary-vector -# target_ops: pto.vaddc -# scenarios: core-u32-unsigned, full-mask, carry-chain - -import argparse -from pathlib import Path - -import numpy as np - - -LANES = 64 -SEED = 19 - - -def pack_mask_nibbles(bits): - out = np.zeros(256, dtype=np.uint8) - for idx, bit in enumerate(bits): - if not bit: - continue - byte = idx // 2 - if idx % 2 == 0: - out[byte] |= np.uint8(0x1) - else: - out[byte] |= np.uint8(0x10) - return out - - -def generate(output_dir: Path, seed: int) -> None: - del seed - v1 = np.zeros(LANES, dtype=np.uint32) - v2 = np.zeros(LANES, dtype=np.uint32) - pattern_lhs = np.array([0xFFFFFFFF, 0xFFFFFFFE, 0x80000000, 0x7FFFFFFF], dtype=np.uint32) - pattern_rhs = np.array([0x00000001, 0x00000002, 0x80000000, 0x00000001], dtype=np.uint32) - reps = LANES // pattern_lhs.size - v1[:] = np.tile(pattern_lhs, reps) - v2[:] = np.tile(pattern_rhs, reps) - total = v1.astype(np.uint64) + v2.astype(np.uint64) - result = (total & np.uint64(0xFFFFFFFF)).astype(np.uint32) - carry = (total >> np.uint64(32)) != 0 - - output_dir.mkdir(parents=True, exist_ok=True) - v1.tofile(output_dir / "v1.bin") - v2.tofile(output_dir / "v2.bin") - np.zeros(LANES, dtype=np.uint32).tofile(output_dir / "v3.bin") - np.zeros(256, dtype=np.uint8).tofile(output_dir / "v4.bin") - result.tofile(output_dir / "golden_v3.bin") - 
pack_mask_nibbles(carry).tofile(output_dir / "golden_v4.bin") - - -def main() -> None: - parser = argparse.ArgumentParser() - parser.add_argument("--output-dir", type=Path, default=Path(".")) - parser.add_argument("--seed", type=int, default=SEED) - args = parser.parse_args() - generate(args.output_dir, args.seed) - - -if __name__ == "__main__": - main() diff --git a/test/vpto/cases/micro-op/binary-vector/vaddc-carry-boundary/kernel.pto b/test/vpto/cases/micro-op/binary-vector/vaddc-carry-boundary/kernel.pto deleted file mode 100644 index 13933a0ba..000000000 --- a/test/vpto/cases/micro-op/binary-vector/vaddc-carry-boundary/kernel.pto +++ /dev/null @@ -1,57 +0,0 @@ -// ----------------------------------------------------------------------------- -// case: micro-op/binary-vector/vaddc-carry-boundary -// family: binary-vector -// target_ops: pto.vaddc -// scenarios: core-u32-unsigned, full-mask, carry-chain -// NOTE: bulk-generated coverage skeleton. Parser/verifier/lowering failure is -// still a valid test conclusion in the current coverage-first phase. 
-// ----------------------------------------------------------------------------- -module attributes {pto.target_arch = "a5", pto.kernel_kind = #pto.kernel_kind} { - func.func @vaddc_carry_boundary_kernel_2d(%arg0: !pto.ptr, %arg1: !pto.ptr, - %arg2: !pto.ptr, %arg3: !pto.ptr) attributes {pto.kernel} { - %c0 = arith.constant 0 : index - %c0_i64 = arith.constant 0 : i64 - %c1_i64 = arith.constant 1 : i64 - %c128_i64 = arith.constant 128 : i64 - %c256_i64 = arith.constant 256 : i64 - %c4096_i64 = arith.constant 4096 : i64 - %c8192_i64 = arith.constant 8192 : i64 - %c12288_i64 = arith.constant 12288 : i64 - - %ub_lhs = pto.castptr %c0_i64 : i64 -> !pto.ptr - %ub_rhs = pto.castptr %c4096_i64 : i64 -> !pto.ptr - %ub_out = pto.castptr %c8192_i64 : i64 -> !pto.ptr - %ub_carry = pto.castptr %c12288_i64 : i64 -> !pto.ptr - - %false = arith.constant false - pto.mte_gm_ub %arg0, %ub_lhs, %c0_i64, %c256_i64 - nburst(%c1_i64, %c256_i64, %c256_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 - pto.mte_gm_ub %arg1, %ub_rhs, %c0_i64, %c256_i64 - nburst(%c1_i64, %c256_i64, %c256_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 - - pto.set_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] - pto.wait_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] - - pto.vecscope { - %mask = pto.pset_b32 "PAT_ALL" : !pto.mask - %lhs = pto.vlds %ub_lhs[%c0] : !pto.ptr -> !pto.vreg<64xui32> - %rhs = pto.vlds %ub_rhs[%c0] : !pto.ptr -> !pto.vreg<64xui32> - %sum, %carry = pto.vaddc %lhs, %rhs, %mask : !pto.vreg<64xui32>, !pto.vreg<64xui32>, !pto.mask -> !pto.vreg<64xui32>, !pto.mask - pto.vsts %sum, %ub_out[%c0], %mask : !pto.vreg<64xui32>, !pto.ptr, !pto.mask - pto.psti %carry, %ub_carry[%c0], "NORM" : !pto.mask, !pto.ptr, index - } - - pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.mte_ub_gm %ub_out, %arg2, %c256_i64 - nburst(%c1_i64, %c256_i64, %c256_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64 - pto.mte_ub_gm %ub_carry, %arg3, %c128_i64 - 
nburst(%c1_i64, %c256_i64, %c256_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64 - pto.barrier #pto.pipe - return - } -} diff --git a/test/vpto/cases/micro-op/binary-vector/vaddc-carry-boundary/launch.cpp b/test/vpto/cases/micro-op/binary-vector/vaddc-carry-boundary/launch.cpp deleted file mode 100644 index 9c3f6d2c9..000000000 --- a/test/vpto/cases/micro-op/binary-vector/vaddc-carry-boundary/launch.cpp +++ /dev/null @@ -1,56 +0,0 @@ -// Copyright (c) 2026 Huawei Technologies Co., Ltd. -// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). -// Please refer to the License for details. You may not use this file except in compliance with the License. -// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -// See LICENSE in the root of the software repository for the full text of the License. - -// ----------------------------------------------------------------------------- -// case: micro-op/binary-vector/vaddc-carry-boundary -// family: binary-vector -// target_ops: pto.vaddc -// scenarios: core-u32-unsigned, full-mask, carry-chain -// NOTE: bulk-generated coverage skeleton. Parser/verifier/lowering failure is -// still a valid test conclusion in the current coverage-first phase. 
-// ----------------------------------------------------------------------------- -#ifndef __VEC_SCOPE__ -#define __VEC_SCOPE__ -#endif - -#if defined(__CCE_AICORE__) && defined(__NPU_ARCH__) && (__NPU_ARCH__ == 2201) -typedef struct { unsigned char v; } hifloat8_t; -typedef struct { unsigned char v; } float8_e4m3_t; -typedef struct { unsigned char v; } float8_e5m2_t; -typedef struct { unsigned char v; } float8_e8m0_t; -typedef struct { unsigned char v; } float4_e1m2x2_t; -typedef struct { unsigned char v; } float4_e2m1x2_t; -#endif -#include -#if defined(__CCE_AICORE__) && defined(PTOAS_ENABLE_CCE_PRINT) -#include -#endif - -#if !defined(__CCE_AICORE__) && !defined(TMRGSORT_HPP) -struct MrgSortExecutedNumList { - uint16_t mrgSortList0; - uint16_t mrgSortList1; - uint16_t mrgSortList2; - uint16_t mrgSortList3; -}; -#endif -#ifndef __CPU_SIM -#include "acl/acl.h" -#endif - -extern "C" __global__ [aicore] void -vaddc_carry_boundary_kernel_2d(__gm__ uint32_t *v1, __gm__ uint32_t *v2, - __gm__ uint32_t *v3, __gm__ uint8_t *v4); - -void LaunchVaddc_carry_boundary_kernel_2d(uint32_t *v1, uint32_t *v2, - uint32_t *v3, uint8_t *v4, - void *stream) { - vaddc_carry_boundary_kernel_2d<<<1, nullptr, stream>>>( - (__gm__ uint32_t *)v1, (__gm__ uint32_t *)v2, (__gm__ uint32_t *)v3, - (__gm__ uint8_t *)v4); -} diff --git a/test/vpto/cases/micro-op/binary-vector/vaddc-carry-boundary/main.cpp b/test/vpto/cases/micro-op/binary-vector/vaddc-carry-boundary/main.cpp deleted file mode 100644 index 486a7314a..000000000 --- a/test/vpto/cases/micro-op/binary-vector/vaddc-carry-boundary/main.cpp +++ /dev/null @@ -1,119 +0,0 @@ -// Copyright (c) 2026 Huawei Technologies Co., Ltd. -// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). -// Please refer to the License for details. You may not use this file except in compliance with the License. 
-// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -// See LICENSE in the root of the software repository for the full text of the License. - -// ----------------------------------------------------------------------------- -// case: micro-op/binary-vector/vaddc-carry-boundary -// family: binary-vector -// target_ops: pto.vaddc -// scenarios: core-u32-unsigned, full-mask, carry-chain -// NOTE: bulk-generated coverage skeleton. Parser/verifier/lowering failure is -// still a valid test conclusion in the current coverage-first phase. -// ----------------------------------------------------------------------------- -/** -Copyright (c) 2025 Huawei Technologies Co., Ltd. -*/ - -#include "test_common.h" -#include "acl/acl.h" -#include -#include - -using namespace PtoTestCommon; - -#define ACL_CHECK(expr) \ - do { \ - const aclError _ret = (expr); \ - if (_ret != ACL_SUCCESS) { \ - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", #expr, \ - (int)_ret, __FILE__, __LINE__); \ - rc = 1; \ - goto cleanup; \ - } \ - } while (0) - -void LaunchVaddc_carry_boundary_kernel_2d(uint32_t *v1, uint32_t *v2, - uint32_t *v3, uint8_t *v4, - void *stream); - -int main() { - size_t elemCount_v1 = 64; - size_t fileSize_v1 = elemCount_v1 * sizeof(uint32_t); - size_t elemCount_v2 = 64; - size_t fileSize_v2 = elemCount_v2 * sizeof(uint32_t); - size_t elemCount_v3 = 64; - size_t fileSize_v3 = elemCount_v3 * sizeof(uint32_t); - size_t elemCount_v4 = 256; - size_t fileSize_v4 = elemCount_v4 * sizeof(uint8_t); - uint32_t *v1Host = nullptr; - uint32_t *v1Device = nullptr; - uint32_t *v2Host = nullptr; - uint32_t *v2Device = nullptr; - uint32_t *v3Host = nullptr; - uint32_t *v3Device = nullptr; - uint8_t *v4Host = nullptr; - uint8_t *v4Device = nullptr; - int rc = 0; - bool aclInited = false; - bool deviceSet = false; - int 
deviceId = 0; - aclrtStream stream = nullptr; - - ACL_CHECK(aclInit(nullptr)); - aclInited = true; - if (const char *envDevice = std::getenv("ACL_DEVICE_ID")) - deviceId = std::atoi(envDevice); - ACL_CHECK(aclrtSetDevice(deviceId)); - deviceSet = true; - ACL_CHECK(aclrtCreateStream(&stream)); - ACL_CHECK(aclrtMallocHost((void **)(&v1Host), fileSize_v1)); - ACL_CHECK(aclrtMallocHost((void **)(&v2Host), fileSize_v2)); - ACL_CHECK(aclrtMallocHost((void **)(&v3Host), fileSize_v3)); - ACL_CHECK(aclrtMallocHost((void **)(&v4Host), fileSize_v4)); - ACL_CHECK(aclrtMalloc((void **)&v1Device, fileSize_v1, ACL_MEM_MALLOC_HUGE_FIRST)); - ACL_CHECK(aclrtMalloc((void **)&v2Device, fileSize_v2, ACL_MEM_MALLOC_HUGE_FIRST)); - ACL_CHECK(aclrtMalloc((void **)&v3Device, fileSize_v3, ACL_MEM_MALLOC_HUGE_FIRST)); - ACL_CHECK(aclrtMalloc((void **)&v4Device, fileSize_v4, ACL_MEM_MALLOC_HUGE_FIRST)); - ReadFile("./v1.bin", fileSize_v1, v1Host, fileSize_v1); - ReadFile("./v2.bin", fileSize_v2, v2Host, fileSize_v2); - ReadFile("./v3.bin", fileSize_v3, v3Host, fileSize_v3); - ReadFile("./v4.bin", fileSize_v4, v4Host, fileSize_v4); - ACL_CHECK(aclrtMemcpy(v1Device, fileSize_v1, v1Host, fileSize_v1, - ACL_MEMCPY_HOST_TO_DEVICE)); - ACL_CHECK(aclrtMemcpy(v2Device, fileSize_v2, v2Host, fileSize_v2, - ACL_MEMCPY_HOST_TO_DEVICE)); - ACL_CHECK(aclrtMemcpy(v3Device, fileSize_v3, v3Host, fileSize_v3, - ACL_MEMCPY_HOST_TO_DEVICE)); - ACL_CHECK(aclrtMemcpy(v4Device, fileSize_v4, v4Host, fileSize_v4, - ACL_MEMCPY_HOST_TO_DEVICE)); - LaunchVaddc_carry_boundary_kernel_2d(v1Device, v2Device, v3Device, v4Device, - stream); - ACL_CHECK(aclrtSynchronizeStream(stream)); - ACL_CHECK(aclrtMemcpy(v3Host, fileSize_v3, v3Device, fileSize_v3, - ACL_MEMCPY_DEVICE_TO_HOST)); - ACL_CHECK(aclrtMemcpy(v4Host, fileSize_v4, v4Device, fileSize_v4, - ACL_MEMCPY_DEVICE_TO_HOST)); - WriteFile("./v3.bin", v3Host, fileSize_v3); - WriteFile("./v4.bin", v4Host, fileSize_v4); - -cleanup: - aclrtFree(v1Device); - 
aclrtFree(v2Device); - aclrtFree(v3Device); - aclrtFree(v4Device); - aclrtFreeHost(v1Host); - aclrtFreeHost(v2Host); - aclrtFreeHost(v3Host); - aclrtFreeHost(v4Host); - if (stream != nullptr) - aclrtDestroyStream(stream); - if (deviceSet) - aclrtResetDevice(deviceId); - if (aclInited) - aclFinalize(); - return rc; -} diff --git a/test/vpto/cases/micro-op/binary-vector/vaddc/kernel.pto b/test/vpto/cases/micro-op/binary-vector/vaddc/kernel.pto index 74554c755..00af45658 100644 --- a/test/vpto/cases/micro-op/binary-vector/vaddc/kernel.pto +++ b/test/vpto/cases/micro-op/binary-vector/vaddc/kernel.pto @@ -1,11 +1,3 @@ -// ----------------------------------------------------------------------------- -// case: micro-op/binary-vector/vaddc -// family: binary-vector -// target_ops: pto.vaddc -// scenarios: core-u32-unsigned, full-mask, carry-chain -// NOTE: bulk-generated coverage skeleton. Parser/verifier/lowering failure is -// still a valid test conclusion in the current coverage-first phase. 
-// ----------------------------------------------------------------------------- module attributes {pto.target_arch = "a5", pto.kernel_kind = #pto.kernel_kind} { func.func @vaddc_kernel_2d(%arg0: !pto.ptr, %arg1: !pto.ptr, %arg2: !pto.ptr, %arg3: !pto.ptr) attributes {pto.kernel} { @@ -51,6 +43,152 @@ module attributes {pto.target_arch = "a5", pto.kernel_kind = #pto.kernel_kind, !pto.ptr, i64, i64, i64, i64 pto.barrier #pto.pipe + %__case_merge_guard = arith.constant false + + // inactive cross-case merge from micro-op/binary-vector/vaddc-carry-boundary + scf.if %__case_merge_guard { + + %c0_cmg0_1 = arith.constant 0 : index + %c0_i64_cmg0_1 = arith.constant 0 : i64 + %c1_i64_cmg0_1 = arith.constant 1 : i64 + %c128_i64_cmg0_1 = arith.constant 128 : i64 + %c256_i64_cmg0_1 = arith.constant 256 : i64 + %c4096_i64_cmg0_1 = arith.constant 4096 : i64 + %c8192_i64_cmg0_1 = arith.constant 8192 : i64 + %c12288_i64_cmg0_1 = arith.constant 12288 : i64 + + %ub_lhs_cmg0_1 = pto.castptr %c0_i64_cmg0_1 : i64 -> !pto.ptr + %ub_rhs_cmg0_1 = pto.castptr %c4096_i64_cmg0_1 : i64 -> !pto.ptr + %ub_out_cmg0_1 = pto.castptr %c8192_i64_cmg0_1 : i64 -> !pto.ptr + %ub_carry_cmg0_1 = pto.castptr %c12288_i64_cmg0_1 : i64 -> !pto.ptr + + %false_cmg0_1 = arith.constant false + pto.mte_gm_ub %arg0, %ub_lhs_cmg0_1, %c0_i64_cmg0_1, %c256_i64_cmg0_1 + nburst(%c1_i64_cmg0_1, %c256_i64_cmg0_1, %c256_i64_cmg0_1) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 + pto.mte_gm_ub %arg1, %ub_rhs_cmg0_1, %c0_i64_cmg0_1, %c256_i64_cmg0_1 + nburst(%c1_i64_cmg0_1, %c256_i64_cmg0_1, %c256_i64_cmg0_1) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 + + pto.set_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] + pto.wait_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] + + pto.vecscope { + %mask_cmg0_1 = pto.pset_b32 "PAT_ALL" : !pto.mask + %lhs_cmg0_1 = pto.vlds %ub_lhs_cmg0_1[%c0_cmg0_1] : !pto.ptr -> !pto.vreg<64xui32> + %rhs_cmg0_1 = pto.vlds %ub_rhs_cmg0_1[%c0_cmg0_1] : !pto.ptr -> !pto.vreg<64xui32> + %sum_cmg0_1, 
%carry_cmg0_1 = pto.vaddc %lhs_cmg0_1, %rhs_cmg0_1, %mask_cmg0_1 : !pto.vreg<64xui32>, !pto.vreg<64xui32>, !pto.mask -> !pto.vreg<64xui32>, !pto.mask + pto.vsts %sum_cmg0_1, %ub_out_cmg0_1[%c0_cmg0_1], %mask_cmg0_1 : !pto.vreg<64xui32>, !pto.ptr, !pto.mask + pto.psti %carry_cmg0_1, %ub_carry_cmg0_1[%c0_cmg0_1], "NORM" : !pto.mask, !pto.ptr, index + } + + pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] + pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] + pto.mte_ub_gm %ub_out_cmg0_1, %arg2, %c256_i64_cmg0_1 + nburst(%c1_i64_cmg0_1, %c256_i64_cmg0_1, %c256_i64_cmg0_1) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64 + pto.mte_ub_gm %ub_carry_cmg0_1, %arg3, %c128_i64_cmg0_1 + nburst(%c1_i64_cmg0_1, %c256_i64_cmg0_1, %c256_i64_cmg0_1) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64 + pto.barrier #pto.pipe + } + + // inactive cross-case merge from micro-op/binary-vector/vsubc + scf.if %__case_merge_guard { + + %c0_cmg0_2 = arith.constant 0 : index + %c0_i64_cmg0_2 = arith.constant 0 : i64 + %c1_i64_cmg0_2 = arith.constant 1 : i64 + %c128_i64_cmg0_2 = arith.constant 128 : i64 + %c256_i64_cmg0_2 = arith.constant 256 : i64 + %c4096_i64_cmg0_2 = arith.constant 4096 : i64 + %c8192_i64_cmg0_2 = arith.constant 8192 : i64 + %c12288_i64_cmg0_2 = arith.constant 12288 : i64 + + %ub_lhs_cmg0_2 = pto.castptr %c0_i64_cmg0_2 : i64 -> !pto.ptr + %ub_rhs_cmg0_2 = pto.castptr %c4096_i64_cmg0_2 : i64 -> !pto.ptr + %ub_out_cmg0_2 = pto.castptr %c8192_i64_cmg0_2 : i64 -> !pto.ptr + %ub_borrow_cmg0_2 = pto.castptr %c12288_i64_cmg0_2 : i64 -> !pto.ptr + + %false_cmg0_2 = arith.constant false + pto.mte_gm_ub %arg0, %ub_lhs_cmg0_2, %c0_i64_cmg0_2, %c256_i64_cmg0_2 + nburst(%c1_i64_cmg0_2, %c256_i64_cmg0_2, %c256_i64_cmg0_2) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 + pto.mte_gm_ub %arg1, %ub_rhs_cmg0_2, %c0_i64_cmg0_2, %c256_i64_cmg0_2 + nburst(%c1_i64_cmg0_2, %c256_i64_cmg0_2, %c256_i64_cmg0_2) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 + + pto.set_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] + 
pto.wait_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] + + pto.vecscope { + %mask_cmg0_2 = pto.pset_b32 "PAT_ALL" : !pto.mask + %lhs_cmg0_2 = pto.vlds %ub_lhs_cmg0_2[%c0_cmg0_2] : !pto.ptr -> !pto.vreg<64xui32> + %rhs_cmg0_2 = pto.vlds %ub_rhs_cmg0_2[%c0_cmg0_2] : !pto.ptr -> !pto.vreg<64xui32> + %diff_cmg0_2, %borrow_cmg0_2 = pto.vsubc %lhs_cmg0_2, %rhs_cmg0_2, %mask_cmg0_2 : !pto.vreg<64xui32>, !pto.vreg<64xui32>, !pto.mask -> !pto.vreg<64xui32>, !pto.mask + pto.vsts %diff_cmg0_2, %ub_out_cmg0_2[%c0_cmg0_2], %mask_cmg0_2 : !pto.vreg<64xui32>, !pto.ptr, !pto.mask + pto.psti %borrow_cmg0_2, %ub_borrow_cmg0_2[%c0_cmg0_2], "NORM" : !pto.mask, !pto.ptr, index + } + + pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] + pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] + pto.mte_ub_gm %ub_out_cmg0_2, %arg2, %c256_i64_cmg0_2 + nburst(%c1_i64_cmg0_2, %c256_i64_cmg0_2, %c256_i64_cmg0_2) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64 + pto.mte_ub_gm %ub_borrow_cmg0_2, %arg3, %c128_i64_cmg0_2 + nburst(%c1_i64_cmg0_2, %c256_i64_cmg0_2, %c256_i64_cmg0_2) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64 + pto.barrier #pto.pipe + } + + // inactive cross-case merge from micro-op/binary-vector/vsubc-borrow-boundary + scf.if %__case_merge_guard { + + %c0_cmg0_3 = arith.constant 0 : index + %c0_i64_cmg0_3 = arith.constant 0 : i64 + %c1_i64_cmg0_3 = arith.constant 1 : i64 + %c128_i64_cmg0_3 = arith.constant 128 : i64 + %c256_i64_cmg0_3 = arith.constant 256 : i64 + %c4096_i64_cmg0_3 = arith.constant 4096 : i64 + %c8192_i64_cmg0_3 = arith.constant 8192 : i64 + %c12288_i64_cmg0_3 = arith.constant 12288 : i64 + + %ub_lhs_cmg0_3 = pto.castptr %c0_i64_cmg0_3 : i64 -> !pto.ptr + %ub_rhs_cmg0_3 = pto.castptr %c4096_i64_cmg0_3 : i64 -> !pto.ptr + %ub_out_cmg0_3 = pto.castptr %c8192_i64_cmg0_3 : i64 -> !pto.ptr + %ub_borrow_cmg0_3 = pto.castptr %c12288_i64_cmg0_3 : i64 -> !pto.ptr + + %false_cmg0_3 = arith.constant false + pto.mte_gm_ub %arg0, %ub_lhs_cmg0_3, %c0_i64_cmg0_3, %c256_i64_cmg0_3 + 
nburst(%c1_i64_cmg0_3, %c256_i64_cmg0_3, %c256_i64_cmg0_3) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 + pto.mte_gm_ub %arg1, %ub_rhs_cmg0_3, %c0_i64_cmg0_3, %c256_i64_cmg0_3 + nburst(%c1_i64_cmg0_3, %c256_i64_cmg0_3, %c256_i64_cmg0_3) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 + + pto.set_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] + pto.wait_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] + + pto.vecscope { + %mask_cmg0_3 = pto.pset_b32 "PAT_ALL" : !pto.mask + %lhs_cmg0_3 = pto.vlds %ub_lhs_cmg0_3[%c0_cmg0_3] : !pto.ptr -> !pto.vreg<64xui32> + %rhs_cmg0_3 = pto.vlds %ub_rhs_cmg0_3[%c0_cmg0_3] : !pto.ptr -> !pto.vreg<64xui32> + %diff_cmg0_3, %borrow_cmg0_3 = pto.vsubc %lhs_cmg0_3, %rhs_cmg0_3, %mask_cmg0_3 : !pto.vreg<64xui32>, !pto.vreg<64xui32>, !pto.mask -> !pto.vreg<64xui32>, !pto.mask + pto.vsts %diff_cmg0_3, %ub_out_cmg0_3[%c0_cmg0_3], %mask_cmg0_3 : !pto.vreg<64xui32>, !pto.ptr, !pto.mask + pto.psti %borrow_cmg0_3, %ub_borrow_cmg0_3[%c0_cmg0_3], "NORM" : !pto.mask, !pto.ptr, index + } + + pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] + pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] + pto.mte_ub_gm %ub_out_cmg0_3, %arg2, %c256_i64_cmg0_3 + nburst(%c1_i64_cmg0_3, %c256_i64_cmg0_3, %c256_i64_cmg0_3) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64 + pto.mte_ub_gm %ub_borrow_cmg0_3, %arg3, %c128_i64_cmg0_3 + nburst(%c1_i64_cmg0_3, %c256_i64_cmg0_3, %c256_i64_cmg0_3) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64 + pto.barrier #pto.pipe + } return } + } diff --git a/test/vpto/cases/micro-op/binary-vector/vand-mask-edge/compare.py b/test/vpto/cases/micro-op/binary-vector/vand-mask-edge/compare.py deleted file mode 100755 index f42233bb4..000000000 --- a/test/vpto/cases/micro-op/binary-vector/vand-mask-edge/compare.py +++ /dev/null @@ -1,44 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) 2026 Huawei Technologies Co., Ltd. 
-# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. - -# case: micro-op/binary-vector/vand-mask-edge -# family: binary-vector -# target_ops: pto.vand -# scenarios: core-i16-unsigned, full-mask -# NOTE: bulk-generated coverage skeleton. - -import os -import sys -import numpy as np - - -def compare_bin_prefix(golden_path, output_path, dtype, eps, count): - if not os.path.exists(golden_path) or not os.path.exists(output_path): - return False - golden = np.fromfile(golden_path, dtype=dtype, count=count) - output = np.fromfile(output_path, dtype=dtype, count=count) - return golden.shape == output.shape and np.allclose( - golden, output, atol=eps, rtol=eps, equal_nan=True - ) - - -def main(): - strict = os.getenv("COMPARE_STRICT", "1") != "0" - ok = compare_bin_prefix("golden_v3.bin", "v3.bin", np.uint16, 0, 1024) - if not ok: - if strict: - print("[ERROR] compare failed") - sys.exit(2) - print("[WARN] compare failed (non-gating)") - return - print("[INFO] compare passed") - - -if __name__ == "__main__": - main() diff --git a/test/vpto/cases/micro-op/binary-vector/vand-mask-edge/golden.py b/test/vpto/cases/micro-op/binary-vector/vand-mask-edge/golden.py deleted file mode 100755 index 27a700901..000000000 --- a/test/vpto/cases/micro-op/binary-vector/vand-mask-edge/golden.py +++ /dev/null @@ -1,50 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) 2026 Huawei Technologies Co., Ltd. 
-# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. - -# case: micro-op/binary-vector/vand-mask-edge -# family: binary-vector -# target_ops: pto.vand -# scenarios: core-i16-unsigned, full-mask -# NOTE: bulk-generated coverage skeleton. -# coding=utf-8 - -import argparse -from pathlib import Path - -import numpy as np - - -ELEMS = 1024 -SEED = 19 - - -def generate(output_dir: Path, seed: int) -> None: - idx = np.arange(ELEMS, dtype=np.uint16) - v1 = np.where((idx & 1) == 0, np.uint16(0xAAAA), np.uint16(0x0F0F)).astype(np.uint16, copy=False) - v2 = np.where((idx & 2) == 0, np.uint16(0x5555), np.uint16(0x3333)).astype(np.uint16, copy=False) - v3 = np.zeros(ELEMS, dtype=np.uint16) - golden_v3 = np.bitwise_and(v1, v2).astype(np.uint16, copy=False) - - output_dir.mkdir(parents=True, exist_ok=True) - v1.tofile(output_dir / "v1.bin") - v2.tofile(output_dir / "v2.bin") - v3.tofile(output_dir / "v3.bin") - golden_v3.tofile(output_dir / "golden_v3.bin") - - -def main() -> None: - parser = argparse.ArgumentParser() - parser.add_argument("--output-dir", type=Path, default=Path(".")) - parser.add_argument("--seed", type=int, default=SEED) - args = parser.parse_args() - generate(args.output_dir, args.seed) - - -if __name__ == "__main__": - main() diff --git a/test/vpto/cases/micro-op/binary-vector/vand-mask-edge/kernel.pto b/test/vpto/cases/micro-op/binary-vector/vand-mask-edge/kernel.pto deleted file mode 100644 index 908bcb94c..000000000 --- 
a/test/vpto/cases/micro-op/binary-vector/vand-mask-edge/kernel.pto +++ /dev/null @@ -1,55 +0,0 @@ -// ----------------------------------------------------------------------------- -// case: micro-op/binary-vector/vand-mask-edge -// family: binary-vector -// target_ops: pto.vand -// scenarios: core-i16-unsigned, full-mask -// NOTE: bulk-generated coverage skeleton. Parser/verifier/lowering failure is -// still a valid test conclusion in the current coverage-first phase. -// ----------------------------------------------------------------------------- -module attributes {pto.target_arch = "a5", pto.kernel_kind = #pto.kernel_kind} { - func.func @vand_mask_edge_kernel(%arg0: !pto.ptr, %arg1: !pto.ptr, - %arg2: !pto.ptr) attributes {pto.kernel} { - %c0 = arith.constant 0 : index - %c0_i64 = arith.constant 0 : i64 - %c1_i64 = arith.constant 1 : i64 - %c32_i64 = arith.constant 32 : i64 - %c64_i64 = arith.constant 64 : i64 - %c1024 = arith.constant 1024 : index - %c128 = arith.constant 128 : index - %c2048_i64 = arith.constant 2048 : i64 - %c4096_i64 = arith.constant 4096 : i64 - - %ub_lhs = pto.castptr %c0_i64 : i64 -> !pto.ptr - %ub_rhs = pto.castptr %c2048_i64 : i64 -> !pto.ptr - %ub_out = pto.castptr %c4096_i64 : i64 -> !pto.ptr - - %false = arith.constant false - pto.mte_gm_ub %arg0, %ub_lhs, %c0_i64, %c64_i64 - nburst(%c32_i64, %c64_i64, %c64_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 - pto.mte_gm_ub %arg1, %ub_rhs, %c0_i64, %c64_i64 - nburst(%c32_i64, %c64_i64, %c64_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 - - pto.set_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] - pto.wait_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] - - pto.vecscope { - %mask = pto.pset_b16 "PAT_ALL" : !pto.mask - scf.for %offset = %c0 to %c1024 step %c128 { - %lhs = pto.vlds %ub_lhs[%offset] : !pto.ptr -> !pto.vreg<128xui16> - %rhs = pto.vlds %ub_rhs[%offset] : !pto.ptr -> !pto.vreg<128xui16> - %out = pto.vand %lhs, %rhs, %mask : !pto.vreg<128xui16>, !pto.vreg<128xui16>, !pto.mask -> 
!pto.vreg<128xui16> - pto.vsts %out, %ub_out[%offset], %mask : !pto.vreg<128xui16>, !pto.ptr, !pto.mask - } - } - - pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.mte_ub_gm %ub_out, %arg2, %c64_i64 - nburst(%c32_i64, %c64_i64, %c64_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64 - pto.barrier #pto.pipe - return - } -} diff --git a/test/vpto/cases/micro-op/binary-vector/vand-mask-edge/launch.cpp b/test/vpto/cases/micro-op/binary-vector/vand-mask-edge/launch.cpp deleted file mode 100644 index 3924e63d3..000000000 --- a/test/vpto/cases/micro-op/binary-vector/vand-mask-edge/launch.cpp +++ /dev/null @@ -1,55 +0,0 @@ -// Copyright (c) 2026 Huawei Technologies Co., Ltd. -// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). -// Please refer to the License for details. You may not use this file except in compliance with the License. -// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -// See LICENSE in the root of the software repository for the full text of the License. - -// ----------------------------------------------------------------------------- -// case: micro-op/binary-vector/vand-mask-edge -// family: binary-vector -// target_ops: pto.vand -// scenarios: core-i16-unsigned, full-mask -// NOTE: bulk-generated coverage skeleton. Parser/verifier/lowering failure is -// still a valid test conclusion in the current coverage-first phase. 
-// ----------------------------------------------------------------------------- -#ifndef __VEC_SCOPE__ -#define __VEC_SCOPE__ -#endif - -#if defined(__CCE_AICORE__) && defined(__NPU_ARCH__) && (__NPU_ARCH__ == 2201) -typedef struct { unsigned char v; } hifloat8_t; -typedef struct { unsigned char v; } float8_e4m3_t; -typedef struct { unsigned char v; } float8_e5m2_t; -typedef struct { unsigned char v; } float8_e8m0_t; -typedef struct { unsigned char v; } float4_e1m2x2_t; -typedef struct { unsigned char v; } float4_e2m1x2_t; -#endif -#include -#if defined(__CCE_AICORE__) && defined(PTOAS_ENABLE_CCE_PRINT) -#include -#endif - -#if !defined(__CCE_AICORE__) && !defined(TMRGSORT_HPP) -struct MrgSortExecutedNumList { - uint16_t mrgSortList0; - uint16_t mrgSortList1; - uint16_t mrgSortList2; - uint16_t mrgSortList3; -}; -#endif -#ifndef __CPU_SIM -#include "acl/acl.h" -#endif - -extern "C" __global__ [aicore] void vand_mask_edge_kernel(__gm__ uint16_t *v1, - __gm__ uint16_t *v2, - __gm__ uint16_t *v3); - -void LaunchVand_mask_edge_kernel(uint16_t *v1, uint16_t *v2, uint16_t *v3, - void *stream) { - vand_mask_edge_kernel<<<1, nullptr, stream>>>((__gm__ uint16_t *)v1, - (__gm__ uint16_t *)v2, - (__gm__ uint16_t *)v3); -} diff --git a/test/vpto/cases/micro-op/binary-vector/vand-mask-edge/main.cpp b/test/vpto/cases/micro-op/binary-vector/vand-mask-edge/main.cpp deleted file mode 100644 index eae6df992..000000000 --- a/test/vpto/cases/micro-op/binary-vector/vand-mask-edge/main.cpp +++ /dev/null @@ -1,103 +0,0 @@ -// Copyright (c) 2026 Huawei Technologies Co., Ltd. -// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). -// Please refer to the License for details. You may not use this file except in compliance with the License. 
-// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -// See LICENSE in the root of the software repository for the full text of the License. - -// ----------------------------------------------------------------------------- -// case: micro-op/binary-vector/vand-mask-edge -// family: binary-vector -// target_ops: pto.vand -// scenarios: core-i16-unsigned, full-mask -// NOTE: bulk-generated coverage skeleton. Parser/verifier/lowering failure is -// still a valid test conclusion in the current coverage-first phase. -// ----------------------------------------------------------------------------- -/** -Copyright (c) 2025 Huawei Technologies Co., Ltd. -*/ - -#include "test_common.h" -#include "acl/acl.h" -#include -#include - -using namespace PtoTestCommon; - -#define ACL_CHECK(expr) \ - do { \ - const aclError _ret = (expr); \ - if (_ret != ACL_SUCCESS) { \ - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", #expr, \ - (int)_ret, __FILE__, __LINE__); \ - rc = 1; \ - goto cleanup; \ - } \ - } while (0) - -void LaunchVand_mask_edge_kernel(uint16_t *v1, uint16_t *v2, uint16_t *v3, - void *stream); - -int main() { - size_t elemCount_v1 = 1024; - size_t fileSize_v1 = elemCount_v1 * sizeof(uint16_t); - size_t elemCount_v2 = 1024; - size_t fileSize_v2 = elemCount_v2 * sizeof(uint16_t); - size_t elemCount_v3 = 1024; - size_t fileSize_v3 = elemCount_v3 * sizeof(uint16_t); - uint16_t *v1Host = nullptr; - uint16_t *v1Device = nullptr; - uint16_t *v2Host = nullptr; - uint16_t *v2Device = nullptr; - uint16_t *v3Host = nullptr; - uint16_t *v3Device = nullptr; - int rc = 0; - bool aclInited = false; - bool deviceSet = false; - int deviceId = 0; - aclrtStream stream = nullptr; - - ACL_CHECK(aclInit(nullptr)); - aclInited = true; - if (const char *envDevice = std::getenv("ACL_DEVICE_ID")) - deviceId = 
std::atoi(envDevice); - ACL_CHECK(aclrtSetDevice(deviceId)); - deviceSet = true; - ACL_CHECK(aclrtCreateStream(&stream)); - ACL_CHECK(aclrtMallocHost((void **)(&v1Host), fileSize_v1)); - ACL_CHECK(aclrtMallocHost((void **)(&v2Host), fileSize_v2)); - ACL_CHECK(aclrtMallocHost((void **)(&v3Host), fileSize_v3)); - ACL_CHECK(aclrtMalloc((void **)&v1Device, fileSize_v1, ACL_MEM_MALLOC_HUGE_FIRST)); - ACL_CHECK(aclrtMalloc((void **)&v2Device, fileSize_v2, ACL_MEM_MALLOC_HUGE_FIRST)); - ACL_CHECK(aclrtMalloc((void **)&v3Device, fileSize_v3, ACL_MEM_MALLOC_HUGE_FIRST)); - ReadFile("./v1.bin", fileSize_v1, v1Host, fileSize_v1); - ReadFile("./v2.bin", fileSize_v2, v2Host, fileSize_v2); - ReadFile("./v3.bin", fileSize_v3, v3Host, fileSize_v3); - ACL_CHECK(aclrtMemcpy(v1Device, fileSize_v1, v1Host, fileSize_v1, - ACL_MEMCPY_HOST_TO_DEVICE)); - ACL_CHECK(aclrtMemcpy(v2Device, fileSize_v2, v2Host, fileSize_v2, - ACL_MEMCPY_HOST_TO_DEVICE)); - ACL_CHECK(aclrtMemcpy(v3Device, fileSize_v3, v3Host, fileSize_v3, - ACL_MEMCPY_HOST_TO_DEVICE)); - LaunchVand_mask_edge_kernel(v1Device, v2Device, v3Device, stream); - ACL_CHECK(aclrtSynchronizeStream(stream)); - ACL_CHECK(aclrtMemcpy(v3Host, fileSize_v3, v3Device, fileSize_v3, - ACL_MEMCPY_DEVICE_TO_HOST)); - WriteFile("./v3.bin", v3Host, fileSize_v3); - -cleanup: - aclrtFree(v1Device); - aclrtFree(v2Device); - aclrtFree(v3Device); - aclrtFreeHost(v1Host); - aclrtFreeHost(v2Host); - aclrtFreeHost(v3Host); - if (stream != nullptr) - aclrtDestroyStream(stream); - if (deviceSet) - aclrtResetDevice(deviceId); - if (aclInited) - aclFinalize(); - return rc; -} diff --git a/test/vpto/cases/micro-op/binary-vector/vand/compare.py b/test/vpto/cases/micro-op/binary-vector/vand/compare.py index 28c2a232c..0da47cbbd 100755 --- a/test/vpto/cases/micro-op/binary-vector/vand/compare.py +++ b/test/vpto/cases/micro-op/binary-vector/vand/compare.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python3 +#!/usr/bin/python3 # Copyright (c) 2026 Huawei Technologies Co., Ltd. 
# This program is free software, you can redistribute it and/or modify it under the terms and conditions of # CANN Open Software License Agreement Version 2.0 (the "License"). @@ -7,36 +7,42 @@ # INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. # See LICENSE in the root of the software repository for the full text of the License. -# case: micro-op/binary-vector/vand -# family: binary-vector -# target_ops: pto.vand -# scenarios: core-i16-unsigned, full-mask -import os -import sys -import numpy as np +# Merged vand test case. +import os,sys +import numpy as np -def compare_bin_prefix(golden_path, output_path, dtype, eps, count): - if not os.path.exists(golden_path) or not os.path.exists(output_path): - return False - golden = np.fromfile(golden_path, dtype=dtype, count=count) - output = np.fromfile(output_path, dtype=dtype, count=count) - return golden.shape == output.shape and np.allclose( - golden, output, atol=eps, rtol=eps, equal_nan=True - ) +def _cmp(golden,output,dtype,eps,count=-1): + if not os.path.exists(golden) or not os.path.exists(output): return False + kw={} if count<0 else {"count":count} + g=np.fromfile(golden,dtype=dtype,**kw) + o=np.fromfile(output,dtype=dtype,**kw) + return g.shape==o.shape and np.allclose(g,o,atol=eps,rtol=eps,equal_nan=True) +def _cmpeq(golden,output,dtype): + if not os.path.exists(golden) or not os.path.exists(output): return False + g=np.fromfile(golden,dtype=dtype) + o=np.fromfile(output,dtype=dtype) + return g.shape==o.shape and np.array_equal(g,o) def main(): - strict = os.getenv("COMPARE_STRICT", "1") != "0" - ok = compare_bin_prefix("golden_v3.bin", "v3.bin", np.uint16, 0, 1024) - if not ok: - if strict: - print("[ERROR] compare failed") - sys.exit(2) - print("[WARN] compare failed (non-gating)") + strict=os.getenv('COMPARE_STRICT','1')!='0' + failed=[] + if not (_cmp("golden_v3.bin","v3.bin",np.uint16,0,1024)): + failed.append('f32') + print('[ERROR] compare failed: 
f32') + else: + print('[INFO] f32: passed') + if not (_cmpeq("golden_v3_mask_edge.bin","v3_mask_edge.bin",np.uint16)): + failed.append('mask_edge') + print('[ERROR] compare failed: mask_edge') + else: + print('[INFO] mask_edge: passed') + if failed: + if strict: print(f"[ERROR] {len(failed)} variant(s) failed"); sys.exit(2) + print(f"[WARN] {len(failed)} variant(s) failed (non-gating)") return - print("[INFO] compare passed") - + print("[INFO] compare passed (all 2 variants)") -if __name__ == "__main__": +if __name__=="__main__": main() diff --git a/test/vpto/cases/micro-op/binary-vector/vand/golden.py b/test/vpto/cases/micro-op/binary-vector/vand/golden.py index a67709b57..ea42a1d23 100755 --- a/test/vpto/cases/micro-op/binary-vector/vand/golden.py +++ b/test/vpto/cases/micro-op/binary-vector/vand/golden.py @@ -7,43 +7,62 @@ # INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. # See LICENSE in the root of the software repository for the full text of the License. -# case: micro-op/binary-vector/vand -# family: binary-vector -# target_ops: pto.vand -# scenarios: core-i16-unsigned, full-mask -# coding=utf-8 +# Merged vand test case. 
import argparse from pathlib import Path - import numpy as np - -ELEMS = 1024 +ROWS = 32 +COLS = 32 SEED = 19 +LOGICAL_ELEMS = 1000 +OUT_SENTINEL = np.float32(-123.25) +def f32_to_bf16_bits(v): + w=v.astype(np.float32,copy=False).view(np.uint32) + r=np.uint32(0x7FFF)+((w>>16)&np.uint32(1)) + return ((w+r)>>16).astype(np.uint16) +def bf16_bits_to_f32(b): + return (b.astype(np.uint32)<<16).view(np.float32) -def generate(output_dir: Path, seed: int) -> None: - rng = np.random.default_rng(seed) - v1 = rng.integers(0, 0x10000, size=ELEMS, dtype=np.uint16) - v2 = rng.integers(0, 0x10000, size=ELEMS, dtype=np.uint16) - v3 = np.zeros(ELEMS, dtype=np.uint16) - golden_v3 = np.bitwise_and(v1, v2).astype(np.uint16, copy=False) - output_dir.mkdir(parents=True, exist_ok=True) - v1.tofile(output_dir / "v1.bin") - v2.tofile(output_dir / "v2.bin") - v3.tofile(output_dir / "v3.bin") - golden_v3.tofile(output_dir / "golden_v3.bin") +# ---- f32 ---- +def gen_f32(out, rng): + v1=rng.integers(0,0x10000,size=(ROWS,COLS),dtype=np.uint16) + v2=rng.integers(0,0x10000,size=(ROWS,COLS),dtype=np.uint16) + g=np.bitwise_and(v1,v2) + v3=np.zeros((ROWS,COLS),dtype=np.uint16) + v1.reshape(-1).tofile(out/"v1.bin") + v2.reshape(-1).tofile(out/"v2.bin") + v3.reshape(-1).tofile(out/"v3.bin") + g.reshape(-1).tofile(out/"golden_v3.bin") +# ---- mask_edge ---- +def gen_mask_edge(out, rng): + v1=rng.integers(0,0x10000,size=(ROWS,COLS),dtype=np.uint16) + v2=rng.integers(0,0x10000,size=(ROWS,COLS),dtype=np.uint16) + g=np.bitwise_and(v1,v2) + v3=np.zeros((ROWS,COLS),dtype=np.uint16) + v1.reshape(-1).tofile(out/"v1_mask_edge.bin") + v2.reshape(-1).tofile(out/"v2_mask_edge.bin") + v3.reshape(-1).tofile(out/"v3_mask_edge.bin") + g.reshape(-1).tofile(out/"golden_v3_mask_edge.bin") -def main() -> None: - parser = argparse.ArgumentParser() - parser.add_argument("--output-dir", type=Path, default=Path(".")) - parser.add_argument("--seed", type=int, default=SEED) - args = parser.parse_args() - 
generate(args.output_dir, args.seed) +GENERATORS = [ + gen_f32, + gen_mask_edge, +] +def main(): + p=argparse.ArgumentParser() + p.add_argument("--output-dir",type=Path,default=Path(".")) + p.add_argument("--seed",type=int,default=SEED) + a=p.parse_args() + rng=np.random.default_rng(a.seed) + out=a.output_dir; out.mkdir(parents=True,exist_ok=True) + for gen in GENERATORS: + gen(out,rng) -if __name__ == "__main__": +if __name__=="__main__": main() diff --git a/test/vpto/cases/micro-op/binary-vector/vand/kernel.pto b/test/vpto/cases/micro-op/binary-vector/vand/kernel.pto index 8042c9550..c67f7f618 100644 --- a/test/vpto/cases/micro-op/binary-vector/vand/kernel.pto +++ b/test/vpto/cases/micro-op/binary-vector/vand/kernel.pto @@ -1,53 +1,186 @@ -// ----------------------------------------------------------------------------- -// case: micro-op/binary-vector/vand -// family: binary-vector -// target_ops: pto.vand -// scenarios: core-i16-unsigned, full-mask -// ----------------------------------------------------------------------------- module attributes {pto.target_arch = "a5", pto.kernel_kind = #pto.kernel_kind} { - func.func @vand_i16_unsigned_kernel(%arg0: !pto.ptr, %arg1: !pto.ptr, - %arg2: !pto.ptr) attributes {pto.kernel} { - %c0 = arith.constant 0 : index - %c1024 = arith.constant 1024 : index - %c128 = arith.constant 128 : index - %c0_i64 = arith.constant 0 : i64 - %c1_i64 = arith.constant 1 : i64 - %c32_i64 = arith.constant 32 : i64 - %c64_i64 = arith.constant 64 : i64 - %c2048_i64 = arith.constant 2048 : i64 - %c4096_i64 = arith.constant 4096 : i64 - - %ub_lhs = pto.castptr %c0_i64 : i64 -> !pto.ptr - %ub_rhs = pto.castptr %c2048_i64 : i64 -> !pto.ptr - %ub_out = pto.castptr %c4096_i64 : i64 -> !pto.ptr - - %false = arith.constant false - pto.mte_gm_ub %arg0, %ub_lhs, %c0_i64, %c64_i64 - nburst(%c32_i64, %c64_i64, %c64_i64) + func.func @vand_deep_merged_kernel(%arg0: !pto.ptr, %arg1: !pto.ptr, %arg2: !pto.ptr, %arg3: !pto.ptr, %arg4: !pto.ptr, %arg5: 
!pto.ptr) attributes {pto.kernel} { + // merged from vand_i16_unsigned_kernel via LaunchVand_i16_unsigned_kernel + + %c0_m0 = arith.constant 0 : index + %c1024_m0 = arith.constant 1024 : index + %c128_m0 = arith.constant 128 : index + %c0_i64_m0 = arith.constant 0 : i64 + %c1_i64_m0 = arith.constant 1 : i64 + %c32_i64_m0 = arith.constant 32 : i64 + %c64_i64_m0 = arith.constant 64 : i64 + %c2048_i64_m0 = arith.constant 2048 : i64 + %c4096_i64_m0 = arith.constant 4096 : i64 + + %ub_lhs_m0 = pto.castptr %c0_i64_m0 : i64 -> !pto.ptr + %ub_rhs_m0 = pto.castptr %c2048_i64_m0 : i64 -> !pto.ptr + %ub_out_m0 = pto.castptr %c4096_i64_m0 : i64 -> !pto.ptr + + %false_m0 = arith.constant false + pto.mte_gm_ub %arg0, %ub_lhs_m0, %c0_i64_m0, %c64_i64_m0 + nburst(%c32_i64_m0, %c64_i64_m0, %c64_i64_m0) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 + pto.mte_gm_ub %arg1, %ub_rhs_m0, %c0_i64_m0, %c64_i64_m0 + nburst(%c32_i64_m0, %c64_i64_m0, %c64_i64_m0) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 + + pto.set_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] + pto.wait_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] + + pto.vecscope { + %mask_m0 = pto.pset_b16 "PAT_ALL" : !pto.mask + scf.for %offset_m0 = %c0_m0 to %c1024_m0 step %c128_m0 { + %lhs_m0 = pto.vlds %ub_lhs_m0[%offset_m0] : !pto.ptr -> !pto.vreg<128xui16> + %rhs_m0 = pto.vlds %ub_rhs_m0[%offset_m0] : !pto.ptr -> !pto.vreg<128xui16> + %out_m0 = pto.vand %lhs_m0, %rhs_m0, %mask_m0 : !pto.vreg<128xui16>, !pto.vreg<128xui16>, !pto.mask -> !pto.vreg<128xui16> + pto.vsts %out_m0, %ub_out_m0[%offset_m0], %mask_m0 : !pto.vreg<128xui16>, !pto.ptr, !pto.mask + } + } + + pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] + pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] + pto.mte_ub_gm %ub_out_m0, %arg2, %c64_i64_m0 + nburst(%c32_i64_m0, %c64_i64_m0, %c64_i64_m0) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64 + pto.barrier #pto.pipe + + // merged from vand_mask_edge_kernel via LaunchVand_mask_edge_kernel + + %c0_m1 = arith.constant 0 : index + 
%c0_i64_m1 = arith.constant 0 : i64 + %c1_i64_m1 = arith.constant 1 : i64 + %c32_i64_m1 = arith.constant 32 : i64 + %c64_i64_m1 = arith.constant 64 : i64 + %c1024_m1 = arith.constant 1024 : index + %c128_m1 = arith.constant 128 : index + %c2048_i64_m1 = arith.constant 2048 : i64 + %c4096_i64_m1 = arith.constant 4096 : i64 + + %ub_lhs_m1 = pto.castptr %c0_i64_m1 : i64 -> !pto.ptr + %ub_rhs_m1 = pto.castptr %c2048_i64_m1 : i64 -> !pto.ptr + %ub_out_m1 = pto.castptr %c4096_i64_m1 : i64 -> !pto.ptr + + %false_m1 = arith.constant false + pto.mte_gm_ub %arg3, %ub_lhs_m1, %c0_i64_m1, %c64_i64_m1 + nburst(%c32_i64_m1, %c64_i64_m1, %c64_i64_m1) : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 - pto.mte_gm_ub %arg1, %ub_rhs, %c0_i64, %c64_i64 - nburst(%c32_i64, %c64_i64, %c64_i64) + pto.mte_gm_ub %arg4, %ub_rhs_m1, %c0_i64_m1, %c64_i64_m1 + nburst(%c32_i64_m1, %c64_i64_m1, %c64_i64_m1) : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 pto.set_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] pto.wait_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] pto.vecscope { - %mask = pto.pset_b16 "PAT_ALL" : !pto.mask - scf.for %offset = %c0 to %c1024 step %c128 { - %lhs = pto.vlds %ub_lhs[%offset] : !pto.ptr -> !pto.vreg<128xui16> - %rhs = pto.vlds %ub_rhs[%offset] : !pto.ptr -> !pto.vreg<128xui16> - %out = pto.vand %lhs, %rhs, %mask : !pto.vreg<128xui16>, !pto.vreg<128xui16>, !pto.mask -> !pto.vreg<128xui16> - pto.vsts %out, %ub_out[%offset], %mask : !pto.vreg<128xui16>, !pto.ptr, !pto.mask + %mask_m1 = pto.pset_b16 "PAT_ALL" : !pto.mask + scf.for %offset_m1 = %c0_m1 to %c1024_m1 step %c128_m1 { + %lhs_m1 = pto.vlds %ub_lhs_m1[%offset_m1] : !pto.ptr -> !pto.vreg<128xui16> + %rhs_m1 = pto.vlds %ub_rhs_m1[%offset_m1] : !pto.ptr -> !pto.vreg<128xui16> + %out_m1 = pto.vand %lhs_m1, %rhs_m1, %mask_m1 : !pto.vreg<128xui16>, !pto.vreg<128xui16>, !pto.mask -> !pto.vreg<128xui16> + pto.vsts %out_m1, %ub_out_m1[%offset_m1], %mask_m1 : !pto.vreg<128xui16>, !pto.ptr, !pto.mask } } pto.set_flag["PIPE_V", "PIPE_MTE3", 
"EVENT_ID0"] pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.mte_ub_gm %ub_out, %arg2, %c64_i64 - nburst(%c32_i64, %c64_i64, %c64_i64) + pto.mte_ub_gm %ub_out_m1, %arg5, %c64_i64_m1 + nburst(%c32_i64_m1, %c64_i64_m1, %c64_i64_m1) : !pto.ptr, !pto.ptr, i64, i64, i64, i64 pto.barrier #pto.pipe + %__case_merge_guard = arith.constant false + + // inactive cross-case merge from micro-op/binary-vector/vxor + scf.if %__case_merge_guard { + + // merged from vxor_i16_unsigned_kernel via LaunchVxor_i16_unsigned_kernel + + %c0_m0_cmg1_1 = arith.constant 0 : index + %c1024_m0_cmg1_1 = arith.constant 1024 : index + %c128_m0_cmg1_1 = arith.constant 128 : index + %c0_i64_m0_cmg1_1 = arith.constant 0 : i64 + %c1_i64_m0_cmg1_1 = arith.constant 1 : i64 + %c32_i64_m0_cmg1_1 = arith.constant 32 : i64 + %c64_i64_m0_cmg1_1 = arith.constant 64 : i64 + %c2048_i64_m0_cmg1_1 = arith.constant 2048 : i64 + %c4096_i64_m0_cmg1_1 = arith.constant 4096 : i64 + + %ub_lhs_m0_cmg1_1 = pto.castptr %c0_i64_m0_cmg1_1 : i64 -> !pto.ptr + %ub_rhs_m0_cmg1_1 = pto.castptr %c2048_i64_m0_cmg1_1 : i64 -> !pto.ptr + %ub_out_m0_cmg1_1 = pto.castptr %c4096_i64_m0_cmg1_1 : i64 -> !pto.ptr + + %false_m0_cmg1_1 = arith.constant false + pto.mte_gm_ub %arg0, %ub_lhs_m0_cmg1_1, %c0_i64_m0_cmg1_1, %c64_i64_m0_cmg1_1 + nburst(%c32_i64_m0_cmg1_1, %c64_i64_m0_cmg1_1, %c64_i64_m0_cmg1_1) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 + pto.mte_gm_ub %arg1, %ub_rhs_m0_cmg1_1, %c0_i64_m0_cmg1_1, %c64_i64_m0_cmg1_1 + nburst(%c32_i64_m0_cmg1_1, %c64_i64_m0_cmg1_1, %c64_i64_m0_cmg1_1) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 + + pto.set_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] + pto.wait_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] + + pto.vecscope { + %mask_m0_cmg1_1 = pto.pset_b16 "PAT_ALL" : !pto.mask + scf.for %offset_m0_cmg1_1 = %c0_m0_cmg1_1 to %c1024_m0_cmg1_1 step %c128_m0_cmg1_1 { + %lhs_m0_cmg1_1 = pto.vlds %ub_lhs_m0_cmg1_1[%offset_m0_cmg1_1] : !pto.ptr -> !pto.vreg<128xui16> + %rhs_m0_cmg1_1 = pto.vlds 
%ub_rhs_m0_cmg1_1[%offset_m0_cmg1_1] : !pto.ptr -> !pto.vreg<128xui16> + %out_m0_cmg1_1 = pto.vxor %lhs_m0_cmg1_1, %rhs_m0_cmg1_1, %mask_m0_cmg1_1 : !pto.vreg<128xui16>, !pto.vreg<128xui16>, !pto.mask -> !pto.vreg<128xui16> + pto.vsts %out_m0_cmg1_1, %ub_out_m0_cmg1_1[%offset_m0_cmg1_1], %mask_m0_cmg1_1 : !pto.vreg<128xui16>, !pto.ptr, !pto.mask + } + } + + pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] + pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] + pto.mte_ub_gm %ub_out_m0_cmg1_1, %arg2, %c64_i64_m0_cmg1_1 + nburst(%c32_i64_m0_cmg1_1, %c64_i64_m0_cmg1_1, %c64_i64_m0_cmg1_1) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64 + pto.barrier #pto.pipe + + // merged from vxor_mask_edge_kernel via LaunchVxor_mask_edge_kernel + + %c0_m1_cmg1_1 = arith.constant 0 : index + %c0_i64_m1_cmg1_1 = arith.constant 0 : i64 + %c1_i64_m1_cmg1_1 = arith.constant 1 : i64 + %c32_i64_m1_cmg1_1 = arith.constant 32 : i64 + %c64_i64_m1_cmg1_1 = arith.constant 64 : i64 + %c1024_m1_cmg1_1 = arith.constant 1024 : index + %c128_m1_cmg1_1 = arith.constant 128 : index + %c2048_i64_m1_cmg1_1 = arith.constant 2048 : i64 + %c4096_i64_m1_cmg1_1 = arith.constant 4096 : i64 + + %ub_lhs_m1_cmg1_1 = pto.castptr %c0_i64_m1_cmg1_1 : i64 -> !pto.ptr + %ub_rhs_m1_cmg1_1 = pto.castptr %c2048_i64_m1_cmg1_1 : i64 -> !pto.ptr + %ub_out_m1_cmg1_1 = pto.castptr %c4096_i64_m1_cmg1_1 : i64 -> !pto.ptr + + %false_m1_cmg1_1 = arith.constant false + pto.mte_gm_ub %arg3, %ub_lhs_m1_cmg1_1, %c0_i64_m1_cmg1_1, %c64_i64_m1_cmg1_1 + nburst(%c32_i64_m1_cmg1_1, %c64_i64_m1_cmg1_1, %c64_i64_m1_cmg1_1) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 + pto.mte_gm_ub %arg4, %ub_rhs_m1_cmg1_1, %c0_i64_m1_cmg1_1, %c64_i64_m1_cmg1_1 + nburst(%c32_i64_m1_cmg1_1, %c64_i64_m1_cmg1_1, %c64_i64_m1_cmg1_1) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 + + pto.set_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] + pto.wait_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] + + pto.vecscope { + %mask_m1_cmg1_1 = pto.pset_b16 "PAT_ALL" : !pto.mask + 
scf.for %offset_m1_cmg1_1 = %c0_m1_cmg1_1 to %c1024_m1_cmg1_1 step %c128_m1_cmg1_1 { + %lhs_m1_cmg1_1 = pto.vlds %ub_lhs_m1_cmg1_1[%offset_m1_cmg1_1] : !pto.ptr -> !pto.vreg<128xui16> + %rhs_m1_cmg1_1 = pto.vlds %ub_rhs_m1_cmg1_1[%offset_m1_cmg1_1] : !pto.ptr -> !pto.vreg<128xui16> + %out_m1_cmg1_1 = pto.vxor %lhs_m1_cmg1_1, %rhs_m1_cmg1_1, %mask_m1_cmg1_1 : !pto.vreg<128xui16>, !pto.vreg<128xui16>, !pto.mask -> !pto.vreg<128xui16> + pto.vsts %out_m1_cmg1_1, %ub_out_m1_cmg1_1[%offset_m1_cmg1_1], %mask_m1_cmg1_1 : !pto.vreg<128xui16>, !pto.ptr, !pto.mask + } + } + + pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] + pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] + pto.mte_ub_gm %ub_out_m1_cmg1_1, %arg5, %c64_i64_m1_cmg1_1 + nburst(%c32_i64_m1_cmg1_1, %c64_i64_m1_cmg1_1, %c64_i64_m1_cmg1_1) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64 + pto.barrier #pto.pipe + } + return } } diff --git a/test/vpto/cases/micro-op/binary-vector/vand/launch.cpp b/test/vpto/cases/micro-op/binary-vector/vand/launch.cpp index 3008d46bc..b5c59f117 100644 --- a/test/vpto/cases/micro-op/binary-vector/vand/launch.cpp +++ b/test/vpto/cases/micro-op/binary-vector/vand/launch.cpp @@ -5,17 +5,10 @@ // THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, // INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. // See LICENSE in the root of the software repository for the full text of the License. 
- -// ----------------------------------------------------------------------------- -// case: micro-op/binary-vector/vand -// family: binary-vector -// target_ops: pto.vand -// scenarios: core-i16-unsigned, full-mask -// ----------------------------------------------------------------------------- +// Merged launch wrappers #ifndef __VEC_SCOPE__ #define __VEC_SCOPE__ #endif - #if defined(__CCE_AICORE__) && defined(__NPU_ARCH__) && (__NPU_ARCH__ == 2201) typedef struct { unsigned char v; } hifloat8_t; typedef struct { unsigned char v; } float8_e4m3_t; @@ -28,14 +21,8 @@ typedef struct { unsigned char v; } float4_e2m1x2_t; #if defined(__CCE_AICORE__) && defined(PTOAS_ENABLE_CCE_PRINT) #include #endif - #if !defined(__CCE_AICORE__) && !defined(TMRGSORT_HPP) -struct MrgSortExecutedNumList { - uint16_t mrgSortList0; - uint16_t mrgSortList1; - uint16_t mrgSortList2; - uint16_t mrgSortList3; -}; +struct MrgSortExecutedNumList { uint16_t mrgSortList0,mrgSortList1,mrgSortList2,mrgSortList3; }; #endif #ifndef __CPU_SIM #include "acl/acl.h" @@ -45,9 +32,20 @@ extern "C" __global__ [aicore] void vand_i16_unsigned_kernel(__gm__ uint16_t *v1 __gm__ uint16_t *v2, __gm__ uint16_t *v3); -void LaunchVand_i16_unsigned_kernel(uint16_t *v1, uint16_t *v2, uint16_t *v3, - void *stream) { - vand_i16_unsigned_kernel<<<1, nullptr, stream>>>((__gm__ uint16_t *)v1, - (__gm__ uint16_t *)v2, - (__gm__ uint16_t *)v3); +extern "C" __global__ [aicore] void vand_deep_merged_kernel( + __gm__ uint16_t * arg0, + __gm__ uint16_t * arg1, + __gm__ uint16_t * arg2, + __gm__ uint16_t * arg3, + __gm__ uint16_t * arg4, + __gm__ uint16_t * arg5); + +void LaunchVandDeepMerged(uint16_t * p0, uint16_t * p1, uint16_t * p2, uint16_t * p3, uint16_t * p4, uint16_t * p5, void *stream) { + vand_deep_merged_kernel<<<1, nullptr, stream>>>( + (__gm__ uint16_t *)p0, + (__gm__ uint16_t *)p1, + (__gm__ uint16_t *)p2, + (__gm__ uint16_t *)p3, + (__gm__ uint16_t *)p4, + (__gm__ uint16_t *)p5); } diff --git 
a/test/vpto/cases/micro-op/binary-vector/vand/main.cpp b/test/vpto/cases/micro-op/binary-vector/vand/main.cpp index 958b4422e..8a93c32d5 100644 --- a/test/vpto/cases/micro-op/binary-vector/vand/main.cpp +++ b/test/vpto/cases/micro-op/binary-vector/vand/main.cpp @@ -5,97 +5,91 @@ // THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, // INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. // See LICENSE in the root of the software repository for the full text of the License. - -// ----------------------------------------------------------------------------- -// case: micro-op/binary-vector/vand -// family: binary-vector -// target_ops: pto.vand -// scenarios: core-i16-unsigned, full-mask -// ----------------------------------------------------------------------------- -/** -Copyright (c) 2025 Huawei Technologies Co., Ltd. -*/ +// Merged vand test case. #include "test_common.h" #include "acl/acl.h" #include #include - +#include using namespace PtoTestCommon; -#define ACL_CHECK(expr) \ - do { \ - const aclError _ret = (expr); \ - if (_ret != ACL_SUCCESS) { \ - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", #expr, \ - (int)_ret, __FILE__, __LINE__); \ - rc = 1; \ - goto cleanup; \ - } \ - } while (0) +#define ACL_CHECK(expr) do { const aclError _r=(expr); if(_r!=ACL_SUCCESS){std::fprintf(stderr,"[ERROR] %s:%d acle=%d\n",#expr,__LINE__,(int)_r);rc=1;goto cleanup;} }while(0) +#define FCK(expr,path) do{if(!(expr)){std::fprintf(stderr,"[ERROR] file:%s\n",path);rc=1;goto cleanup;}}while(0) + + -void LaunchVand_i16_unsigned_kernel(uint16_t *v1, uint16_t *v2, uint16_t *v3, - void *stream); +void LaunchVandDeepMerged(uint16_t * p0, uint16_t * p1, uint16_t * p2, uint16_t * p3, uint16_t * p4, uint16_t * p5, void *stream); int main() { - size_t elemCount_v1 = 1024; - size_t fileSize_v1 = elemCount_v1 * sizeof(uint16_t); - size_t elemCount_v2 = 1024; - size_t fileSize_v2 
= elemCount_v2 * sizeof(uint16_t); - size_t elemCount_v3 = 1024; - size_t fileSize_v3 = elemCount_v3 * sizeof(uint16_t); - uint16_t *v1Host = nullptr; - uint16_t *v1Device = nullptr; - uint16_t *v2Host = nullptr; - uint16_t *v2Device = nullptr; - uint16_t *v3Host = nullptr; - uint16_t *v3Device = nullptr; - int rc = 0; - bool aclInited = false; - bool deviceSet = false; - int deviceId = 0; - aclrtStream stream = nullptr; + constexpr size_t SZ_f32 = 2048; + constexpr size_t SZ_mask_edge = 2048; - ACL_CHECK(aclInit(nullptr)); - aclInited = true; - if (const char *envDevice = std::getenv("ACL_DEVICE_ID")) - deviceId = std::atoi(envDevice); - ACL_CHECK(aclrtSetDevice(deviceId)); - deviceSet = true; + uint16_t *h_f32_v1=nullptr, *d_f32_v1=nullptr; + uint16_t *h_f32_v2=nullptr, *d_f32_v2=nullptr; + uint16_t *h_f32_v3=nullptr, *d_f32_v3=nullptr; + uint16_t *h_mask_edge_v1=nullptr, *d_mask_edge_v1=nullptr; + uint16_t *h_mask_edge_v2=nullptr, *d_mask_edge_v2=nullptr; + uint16_t *h_mask_edge_v3=nullptr, *d_mask_edge_v3=nullptr; + int rc=0; bool aclInited=false,deviceSet=false; int deviceId=0; aclrtStream stream=nullptr; size_t fsz=0; + ACL_CHECK(aclInit(nullptr)); aclInited=true; + if(const char*e=std::getenv("ACL_DEVICE_ID")) deviceId=std::atoi(e); + ACL_CHECK(aclrtSetDevice(deviceId)); deviceSet=true; ACL_CHECK(aclrtCreateStream(&stream)); - ACL_CHECK(aclrtMallocHost((void **)(&v1Host), fileSize_v1)); - ACL_CHECK(aclrtMallocHost((void **)(&v2Host), fileSize_v2)); - ACL_CHECK(aclrtMallocHost((void **)(&v3Host), fileSize_v3)); - ACL_CHECK(aclrtMalloc((void **)&v1Device, fileSize_v1, ACL_MEM_MALLOC_HUGE_FIRST)); - ACL_CHECK(aclrtMalloc((void **)&v2Device, fileSize_v2, ACL_MEM_MALLOC_HUGE_FIRST)); - ACL_CHECK(aclrtMalloc((void **)&v3Device, fileSize_v3, ACL_MEM_MALLOC_HUGE_FIRST)); - ReadFile("./v1.bin", fileSize_v1, v1Host, fileSize_v1); - ReadFile("./v2.bin", fileSize_v2, v2Host, fileSize_v2); - ReadFile("./v3.bin", fileSize_v3, v3Host, fileSize_v3); - 
ACL_CHECK(aclrtMemcpy(v1Device, fileSize_v1, v1Host, fileSize_v1, - ACL_MEMCPY_HOST_TO_DEVICE)); - ACL_CHECK(aclrtMemcpy(v2Device, fileSize_v2, v2Host, fileSize_v2, - ACL_MEMCPY_HOST_TO_DEVICE)); - ACL_CHECK(aclrtMemcpy(v3Device, fileSize_v3, v3Host, fileSize_v3, - ACL_MEMCPY_HOST_TO_DEVICE)); - LaunchVand_i16_unsigned_kernel(v1Device, v2Device, v3Device, stream); + ACL_CHECK(aclrtMallocHost((void**)&h_f32_v1,SZ_f32)); + ACL_CHECK(aclrtMallocHost((void**)&h_f32_v2,SZ_f32)); + ACL_CHECK(aclrtMallocHost((void**)&h_f32_v3,SZ_f32)); + ACL_CHECK(aclrtMallocHost((void**)&h_mask_edge_v1,SZ_mask_edge)); + ACL_CHECK(aclrtMallocHost((void**)&h_mask_edge_v2,SZ_mask_edge)); + ACL_CHECK(aclrtMallocHost((void**)&h_mask_edge_v3,SZ_mask_edge)); + ACL_CHECK(aclrtMalloc((void**)&d_f32_v1,SZ_f32,ACL_MEM_MALLOC_HUGE_FIRST)); + ACL_CHECK(aclrtMalloc((void**)&d_f32_v2,SZ_f32,ACL_MEM_MALLOC_HUGE_FIRST)); + ACL_CHECK(aclrtMalloc((void**)&d_f32_v3,SZ_f32,ACL_MEM_MALLOC_HUGE_FIRST)); + ACL_CHECK(aclrtMalloc((void**)&d_mask_edge_v1,SZ_mask_edge,ACL_MEM_MALLOC_HUGE_FIRST)); + ACL_CHECK(aclrtMalloc((void**)&d_mask_edge_v2,SZ_mask_edge,ACL_MEM_MALLOC_HUGE_FIRST)); + ACL_CHECK(aclrtMalloc((void**)&d_mask_edge_v3,SZ_mask_edge,ACL_MEM_MALLOC_HUGE_FIRST)); + fsz=SZ_f32; FCK(ReadFile("v1.bin",fsz,h_f32_v1,SZ_f32)&&fsz==SZ_f32,"v1.bin"); + fsz=SZ_f32; FCK(ReadFile("v2.bin",fsz,h_f32_v2,SZ_f32)&&fsz==SZ_f32,"v2.bin"); + fsz=SZ_f32; FCK(ReadFile("v3.bin",fsz,h_f32_v3,SZ_f32)&&fsz==SZ_f32,"v3.bin"); + fsz=SZ_mask_edge; FCK(ReadFile("v1_mask_edge.bin",fsz,h_mask_edge_v1,SZ_mask_edge)&&fsz==SZ_mask_edge,"v1_mask_edge.bin"); + fsz=SZ_mask_edge; FCK(ReadFile("v2_mask_edge.bin",fsz,h_mask_edge_v2,SZ_mask_edge)&&fsz==SZ_mask_edge,"v2_mask_edge.bin"); + fsz=SZ_mask_edge; FCK(ReadFile("v3_mask_edge.bin",fsz,h_mask_edge_v3,SZ_mask_edge)&&fsz==SZ_mask_edge,"v3_mask_edge.bin"); + ACL_CHECK(aclrtMemcpy(d_f32_v1,SZ_f32,h_f32_v1,SZ_f32,ACL_MEMCPY_HOST_TO_DEVICE)); + 
ACL_CHECK(aclrtMemcpy(d_f32_v2,SZ_f32,h_f32_v2,SZ_f32,ACL_MEMCPY_HOST_TO_DEVICE)); + ACL_CHECK(aclrtMemcpy(d_f32_v3,SZ_f32,h_f32_v3,SZ_f32,ACL_MEMCPY_HOST_TO_DEVICE)); + ACL_CHECK(aclrtMemcpy(d_mask_edge_v1,SZ_mask_edge,h_mask_edge_v1,SZ_mask_edge,ACL_MEMCPY_HOST_TO_DEVICE)); + ACL_CHECK(aclrtMemcpy(d_mask_edge_v2,SZ_mask_edge,h_mask_edge_v2,SZ_mask_edge,ACL_MEMCPY_HOST_TO_DEVICE)); + ACL_CHECK(aclrtMemcpy(d_mask_edge_v3,SZ_mask_edge,h_mask_edge_v3,SZ_mask_edge,ACL_MEMCPY_HOST_TO_DEVICE)); + LaunchVandDeepMerged( + d_f32_v1, + d_f32_v2, + d_f32_v3, + d_mask_edge_v1, + d_mask_edge_v2, + d_mask_edge_v3, + stream + ); ACL_CHECK(aclrtSynchronizeStream(stream)); - ACL_CHECK(aclrtMemcpy(v3Host, fileSize_v3, v3Device, fileSize_v3, - ACL_MEMCPY_DEVICE_TO_HOST)); - WriteFile("./v3.bin", v3Host, fileSize_v3); + ACL_CHECK(aclrtMemcpy(h_f32_v3,SZ_f32,d_f32_v3,SZ_f32,ACL_MEMCPY_DEVICE_TO_HOST)); + ACL_CHECK(aclrtMemcpy(h_mask_edge_v3,SZ_mask_edge,d_mask_edge_v3,SZ_mask_edge,ACL_MEMCPY_DEVICE_TO_HOST)); + FCK(WriteFile("v3.bin",h_f32_v3,SZ_f32),"v3.bin"); + FCK(WriteFile("v3_mask_edge.bin",h_mask_edge_v3,SZ_mask_edge),"v3_mask_edge.bin"); cleanup: - aclrtFree(v1Device); - aclrtFree(v2Device); - aclrtFree(v3Device); - aclrtFreeHost(v1Host); - aclrtFreeHost(v2Host); - aclrtFreeHost(v3Host); - if (stream != nullptr) - aclrtDestroyStream(stream); - if (deviceSet) - aclrtResetDevice(deviceId); - if (aclInited) - aclFinalize(); + aclrtFree(d_f32_v1); + aclrtFree(d_f32_v2); + aclrtFree(d_f32_v3); + aclrtFree(d_mask_edge_v1); + aclrtFree(d_mask_edge_v2); + aclrtFree(d_mask_edge_v3); + aclrtFreeHost(h_f32_v1); + aclrtFreeHost(h_f32_v2); + aclrtFreeHost(h_f32_v3); + aclrtFreeHost(h_mask_edge_v1); + aclrtFreeHost(h_mask_edge_v2); + aclrtFreeHost(h_mask_edge_v3); + if(stream) aclrtDestroyStream(stream); + if(deviceSet) aclrtResetDevice(deviceId); + if(aclInited) aclFinalize(); return rc; } diff --git a/test/vpto/cases/micro-op/binary-vector/vdiv-f16/compare.py 
b/test/vpto/cases/micro-op/binary-vector/vdiv-f16/compare.py deleted file mode 100755 index 1de4f17b7..000000000 --- a/test/vpto/cases/micro-op/binary-vector/vdiv-f16/compare.py +++ /dev/null @@ -1,42 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) 2026 Huawei Technologies Co., Ltd. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. - -# case: micro-op/binary-vector/vdiv-f16 -# family: binary-vector -# target_ops: pto.vdiv -# scenarios: core-f16, full-mask -import os -import sys -import numpy as np - - -def compare_bin_prefix(golden_path, output_path, dtype, eps, count): - if not os.path.exists(golden_path) or not os.path.exists(output_path): - return False - golden = np.fromfile(golden_path, dtype=dtype, count=count) - output = np.fromfile(output_path, dtype=dtype, count=count) - return golden.shape == output.shape and np.allclose( - golden, output, atol=eps, rtol=eps, equal_nan=True - ) - - -def main(): - strict = os.getenv("COMPARE_STRICT", "1") != "0" - ok = compare_bin_prefix("golden_v3.bin", "v3.bin", np.float16, 5e-3, 1000) - if not ok: - if strict: - print("[ERROR] compare failed") - sys.exit(2) - print("[WARN] compare failed (non-gating)") - return - print("[INFO] compare passed") - - -if __name__ == "__main__": - main() diff --git a/test/vpto/cases/micro-op/binary-vector/vdiv-f16/golden.py b/test/vpto/cases/micro-op/binary-vector/vdiv-f16/golden.py deleted file mode 100755 index 627221d7a..000000000 --- 
a/test/vpto/cases/micro-op/binary-vector/vdiv-f16/golden.py +++ /dev/null @@ -1,56 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) 2026 Huawei Technologies Co., Ltd. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. - -# case: micro-op/binary-vector/vdiv-f16 -# family: binary-vector -# target_ops: pto.vdiv -# scenarios: core-f16, full-mask -import argparse -from pathlib import Path - -import numpy as np - - -ROWS = 32 -COLS = 32 -SEED = 19 -LOGICAL_ELEMS = 1000 - - -def generate(output_dir: Path, seed: int) -> None: - rng = np.random.default_rng(seed) - v1 = rng.uniform(-8.0, 8.0, size=(ROWS, COLS)).astype(np.float16) - v2_mag = rng.uniform(0.5, 4.0, size=(ROWS, COLS)).astype(np.float32) - v2_sign = np.where(rng.integers(0, 2, size=(ROWS, COLS), dtype=np.int32) == 0, - np.float32(-1.0), np.float32(1.0)) - v2 = (v2_mag * v2_sign).astype(np.float16) - v3 = np.zeros((ROWS, COLS), dtype=np.float16) - golden_v3 = np.zeros((ROWS, COLS), dtype=np.float16) - golden_v3.reshape(-1)[:LOGICAL_ELEMS] = ( - v1.reshape(-1)[:LOGICAL_ELEMS].astype(np.float32) - / v2.reshape(-1)[:LOGICAL_ELEMS].astype(np.float32) - ).astype(np.float16) - - output_dir.mkdir(parents=True, exist_ok=True) - v1.reshape(-1).tofile(output_dir / "v1.bin") - v2.reshape(-1).tofile(output_dir / "v2.bin") - v3.reshape(-1).tofile(output_dir / "v3.bin") - golden_v3.reshape(-1).tofile(output_dir / "golden_v3.bin") - - -def main() -> None: - parser = argparse.ArgumentParser() - 
parser.add_argument("--output-dir", type=Path, default=Path(".")) - parser.add_argument("--seed", type=int, default=SEED) - args = parser.parse_args() - generate(args.output_dir, args.seed) - - -if __name__ == "__main__": - main() diff --git a/test/vpto/cases/micro-op/binary-vector/vdiv-f16/kernel.pto b/test/vpto/cases/micro-op/binary-vector/vdiv-f16/kernel.pto deleted file mode 100644 index ffca6d1e3..000000000 --- a/test/vpto/cases/micro-op/binary-vector/vdiv-f16/kernel.pto +++ /dev/null @@ -1,55 +0,0 @@ -// ----------------------------------------------------------------------------- -// case: micro-op/binary-vector/vdiv-f16 -// family: binary-vector -// target_ops: pto.vdiv -// scenarios: core-f16, full-mask -// ----------------------------------------------------------------------------- -module attributes {pto.target_arch = "a5", pto.kernel_kind = #pto.kernel_kind} { - func.func @vdiv_f16_kernel(%arg0: !pto.ptr, %arg1: !pto.ptr, - %arg2: !pto.ptr) attributes {pto.kernel} { - %c0 = arith.constant 0 : index - %c128 = arith.constant 128 : index - %c1024 = arith.constant 1024 : index - %c0_i64 = arith.constant 0 : i64 - %c1_i64 = arith.constant 1 : i64 - %c32_i64 = arith.constant 32 : i64 - %c64_i64 = arith.constant 64 : i64 - %c2048_i64 = arith.constant 2048 : i64 - %c4096_i64 = arith.constant 4096 : i64 - %c1024_i32 = arith.constant 1024 : i32 - - %ub_lhs = pto.castptr %c0_i64 : i64 -> !pto.ptr - %ub_rhs = pto.castptr %c2048_i64 : i64 -> !pto.ptr - %ub_out = pto.castptr %c4096_i64 : i64 -> !pto.ptr - - %false = arith.constant false - pto.mte_gm_ub %arg0, %ub_lhs, %c0_i64, %c64_i64 - nburst(%c32_i64, %c64_i64, %c64_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 - pto.mte_gm_ub %arg1, %ub_rhs, %c0_i64, %c64_i64 - nburst(%c32_i64, %c64_i64, %c64_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 - - pto.set_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] - pto.wait_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] - - pto.vecscope { - %_:1 = scf.for %offset = %c0 to 
%c1024 step %c128 iter_args(%remaining = %c1024_i32) -> (i32) { - %mask, %next_remaining = pto.plt_b16 %remaining : i32 -> !pto.mask, i32 - %lhs = pto.vlds %ub_lhs[%offset] : !pto.ptr -> !pto.vreg<128xf16> - %rhs = pto.vlds %ub_rhs[%offset] : !pto.ptr -> !pto.vreg<128xf16> - %quot = pto.vdiv %lhs, %rhs, %mask : !pto.vreg<128xf16>, !pto.vreg<128xf16>, !pto.mask -> !pto.vreg<128xf16> - pto.vsts %quot, %ub_out[%offset], %mask : !pto.vreg<128xf16>, !pto.ptr, !pto.mask - scf.yield %next_remaining : i32 - } - } - - pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.mte_ub_gm %ub_out, %arg2, %c64_i64 - nburst(%c32_i64, %c64_i64, %c64_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64 - pto.barrier #pto.pipe - return - } -} diff --git a/test/vpto/cases/micro-op/binary-vector/vdiv-f16/launch.cpp b/test/vpto/cases/micro-op/binary-vector/vdiv-f16/launch.cpp deleted file mode 100644 index 1abdc6f6d..000000000 --- a/test/vpto/cases/micro-op/binary-vector/vdiv-f16/launch.cpp +++ /dev/null @@ -1,52 +0,0 @@ -// Copyright (c) 2026 Huawei Technologies Co., Ltd. -// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). -// Please refer to the License for details. You may not use this file except in compliance with the License. -// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -// See LICENSE in the root of the software repository for the full text of the License. 
- -// ----------------------------------------------------------------------------- -// case: micro-op/binary-vector/vdiv-f16 -// family: binary-vector -// target_ops: pto.vdiv -// scenarios: core-f16, full-mask -// ----------------------------------------------------------------------------- -#ifndef __VEC_SCOPE__ -#define __VEC_SCOPE__ -#endif - -#if defined(__CCE_AICORE__) && defined(__NPU_ARCH__) && (__NPU_ARCH__ == 2201) -typedef struct { unsigned char v; } hifloat8_t; -typedef struct { unsigned char v; } float8_e4m3_t; -typedef struct { unsigned char v; } float8_e5m2_t; -typedef struct { unsigned char v; } float8_e8m0_t; -typedef struct { unsigned char v; } float4_e1m2x2_t; -typedef struct { unsigned char v; } float4_e2m1x2_t; -#endif -#include -#if defined(__CCE_AICORE__) && defined(PTOAS_ENABLE_CCE_PRINT) -#include -#endif - -#if !defined(__CCE_AICORE__) && !defined(TMRGSORT_HPP) -struct MrgSortExecutedNumList { - uint16_t mrgSortList0; - uint16_t mrgSortList1; - uint16_t mrgSortList2; - uint16_t mrgSortList3; -}; -#endif -#ifndef __CPU_SIM -#include "acl/acl.h" -#endif - -extern "C" __global__ [aicore] void vdiv_f16_kernel(__gm__ half *v1, - __gm__ half *v2, - __gm__ half *v3); - -void LaunchVdiv_f16_kernel(uint16_t *v1, uint16_t *v2, uint16_t *v3, - void *stream) { - vdiv_f16_kernel<<<1, nullptr, stream>>>((__gm__ half *)v1, (__gm__ half *)v2, - (__gm__ half *)v3); -} diff --git a/test/vpto/cases/micro-op/binary-vector/vdiv-f16/main.cpp b/test/vpto/cases/micro-op/binary-vector/vdiv-f16/main.cpp deleted file mode 100644 index d0b9cff9a..000000000 --- a/test/vpto/cases/micro-op/binary-vector/vdiv-f16/main.cpp +++ /dev/null @@ -1,101 +0,0 @@ -// Copyright (c) 2026 Huawei Technologies Co., Ltd. -// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). -// Please refer to the License for details. 
You may not use this file except in compliance with the License. -// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -// See LICENSE in the root of the software repository for the full text of the License. - -// ----------------------------------------------------------------------------- -// case: micro-op/binary-vector/vdiv-f16 -// family: binary-vector -// target_ops: pto.vdiv -// scenarios: core-f16, full-mask -// ----------------------------------------------------------------------------- -/** -Copyright (c) 2025 Huawei Technologies Co., Ltd. -*/ - -#include "test_common.h" -#include "acl/acl.h" -#include -#include - -using namespace PtoTestCommon; - -#define ACL_CHECK(expr) \ - do { \ - const aclError _ret = (expr); \ - if (_ret != ACL_SUCCESS) { \ - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", #expr, \ - (int)_ret, __FILE__, __LINE__); \ - rc = 1; \ - goto cleanup; \ - } \ - } while (0) - -void LaunchVdiv_f16_kernel(uint16_t *v1, uint16_t *v2, uint16_t *v3, - void *stream); - -int main() { - size_t elemCount_v1 = 1024; - size_t fileSize_v1 = elemCount_v1 * sizeof(uint16_t); - size_t elemCount_v2 = 1024; - size_t fileSize_v2 = elemCount_v2 * sizeof(uint16_t); - size_t elemCount_v3 = 1024; - size_t fileSize_v3 = elemCount_v3 * sizeof(uint16_t); - uint16_t *v1Host = nullptr; - uint16_t *v1Device = nullptr; - uint16_t *v2Host = nullptr; - uint16_t *v2Device = nullptr; - uint16_t *v3Host = nullptr; - uint16_t *v3Device = nullptr; - int rc = 0; - bool aclInited = false; - bool deviceSet = false; - int deviceId = 0; - aclrtStream stream = nullptr; - - ACL_CHECK(aclInit(nullptr)); - aclInited = true; - if (const char *envDevice = std::getenv("ACL_DEVICE_ID")) - deviceId = std::atoi(envDevice); - ACL_CHECK(aclrtSetDevice(deviceId)); - deviceSet = true; - ACL_CHECK(aclrtCreateStream(&stream)); - 
ACL_CHECK(aclrtMallocHost((void **)(&v1Host), fileSize_v1)); - ACL_CHECK(aclrtMallocHost((void **)(&v2Host), fileSize_v2)); - ACL_CHECK(aclrtMallocHost((void **)(&v3Host), fileSize_v3)); - ACL_CHECK(aclrtMalloc((void **)&v1Device, fileSize_v1, ACL_MEM_MALLOC_HUGE_FIRST)); - ACL_CHECK(aclrtMalloc((void **)&v2Device, fileSize_v2, ACL_MEM_MALLOC_HUGE_FIRST)); - ACL_CHECK(aclrtMalloc((void **)&v3Device, fileSize_v3, ACL_MEM_MALLOC_HUGE_FIRST)); - ReadFile("./v1.bin", fileSize_v1, v1Host, fileSize_v1); - ReadFile("./v2.bin", fileSize_v2, v2Host, fileSize_v2); - ReadFile("./v3.bin", fileSize_v3, v3Host, fileSize_v3); - ACL_CHECK(aclrtMemcpy(v1Device, fileSize_v1, v1Host, fileSize_v1, - ACL_MEMCPY_HOST_TO_DEVICE)); - ACL_CHECK(aclrtMemcpy(v2Device, fileSize_v2, v2Host, fileSize_v2, - ACL_MEMCPY_HOST_TO_DEVICE)); - ACL_CHECK(aclrtMemcpy(v3Device, fileSize_v3, v3Host, fileSize_v3, - ACL_MEMCPY_HOST_TO_DEVICE)); - LaunchVdiv_f16_kernel(v1Device, v2Device, v3Device, stream); - ACL_CHECK(aclrtSynchronizeStream(stream)); - ACL_CHECK(aclrtMemcpy(v3Host, fileSize_v3, v3Device, fileSize_v3, - ACL_MEMCPY_DEVICE_TO_HOST)); - WriteFile("./v3.bin", v3Host, fileSize_v3); - -cleanup: - aclrtFree(v1Device); - aclrtFree(v2Device); - aclrtFree(v3Device); - aclrtFreeHost(v1Host); - aclrtFreeHost(v2Host); - aclrtFreeHost(v3Host); - if (stream != nullptr) - aclrtDestroyStream(stream); - if (deviceSet) - aclrtResetDevice(deviceId); - if (aclInited) - aclFinalize(); - return rc; -} diff --git a/test/vpto/cases/micro-op/binary-vector/vdiv-f32-exceptional/compare.py b/test/vpto/cases/micro-op/binary-vector/vdiv-f32-exceptional/compare.py deleted file mode 100644 index a5f14dabc..000000000 --- a/test/vpto/cases/micro-op/binary-vector/vdiv-f32-exceptional/compare.py +++ /dev/null @@ -1,37 +0,0 @@ -#!/usr/bin/python3 -# Copyright (c) 2026 Huawei Technologies Co., Ltd. 
-# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. - - -import os -import sys -import numpy as np - - -def compare_bin(golden_path, output_path, dtype, eps): - if not os.path.exists(golden_path) or not os.path.exists(output_path): - return False - golden = np.fromfile(golden_path, dtype=dtype) - output = np.fromfile(output_path, dtype=dtype) - return golden.shape == output.shape and np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True) - - -def main(): - strict = os.getenv("COMPARE_STRICT", "1") != "0" - ok = compare_bin("golden_v3.bin", "v3.bin", np.float32, 1e-4) - if not ok: - if strict: - print("[ERROR] compare failed") - sys.exit(2) - print("[WARN] compare failed (non-gating)") - return - print("[INFO] compare passed") - - -if __name__ == "__main__": - main() diff --git a/test/vpto/cases/micro-op/binary-vector/vdiv-f32-exceptional/golden.py b/test/vpto/cases/micro-op/binary-vector/vdiv-f32-exceptional/golden.py deleted file mode 100644 index 9caa514c7..000000000 --- a/test/vpto/cases/micro-op/binary-vector/vdiv-f32-exceptional/golden.py +++ /dev/null @@ -1,54 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) 2026 Huawei Technologies Co., Ltd. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. 
-# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. - -# coding=utf-8 - -import argparse -from pathlib import Path - -import numpy as np - - -ROWS = 32 -COLS = 32 -SEED = 19 - - -def generate(output_dir: Path, seed: int) -> None: - del seed - numer = np.array( - [-np.inf, -7.5, -0.0, 0.0, 1.0, np.inf, np.nan, 3.5], - dtype=np.float32, - ) - denom = np.array( - [2.0, -2.0, 0.0, -0.0, np.inf, 1.0, 1.0, np.nan], - dtype=np.float32, - ) - v1 = np.resize(numer, ROWS * COLS).reshape(ROWS, COLS).astype(np.float32) - v2 = np.resize(denom, ROWS * COLS).reshape(ROWS, COLS).astype(np.float32) - v3 = np.zeros((ROWS, COLS), dtype=np.float32) - golden_v3 = np.divide(v1, v2).astype(np.float32, copy=False) - - output_dir.mkdir(parents=True, exist_ok=True) - v1.reshape(-1).tofile(output_dir / "v1.bin") - v2.reshape(-1).tofile(output_dir / "v2.bin") - v3.reshape(-1).tofile(output_dir / "v3.bin") - golden_v3.reshape(-1).tofile(output_dir / "golden_v3.bin") - - -def main() -> None: - parser = argparse.ArgumentParser() - parser.add_argument("--output-dir", type=Path, default=Path(".")) - parser.add_argument("--seed", type=int, default=SEED) - args = parser.parse_args() - generate(args.output_dir, args.seed) - - -if __name__ == "__main__": - main() diff --git a/test/vpto/cases/micro-op/binary-vector/vdiv-f32-exceptional/kernel.pto b/test/vpto/cases/micro-op/binary-vector/vdiv-f32-exceptional/kernel.pto deleted file mode 100644 index ddb95efd3..000000000 --- a/test/vpto/cases/micro-op/binary-vector/vdiv-f32-exceptional/kernel.pto +++ /dev/null @@ -1,49 +0,0 @@ -module attributes {pto.target_arch = "a5", pto.kernel_kind = #pto.kernel_kind} { - func.func @vdiv_f32_exceptional_kernel_2d(%arg0: !pto.ptr, %arg1: !pto.ptr, - %arg2: !pto.ptr) 
attributes {pto.kernel} { - %c0 = arith.constant 0 : index - %c64 = arith.constant 64 : index - %c1024 = arith.constant 1024 : index - %c0_i64 = arith.constant 0 : i64 - %c1_i64 = arith.constant 1 : i64 - %c32_i64 = arith.constant 32 : i64 - %c128_i64 = arith.constant 128 : i64 - %c4096_i64 = arith.constant 4096 : i64 - %c8192_i64 = arith.constant 8192 : i64 - %c1024_i32 = arith.constant 1024 : i32 - - %ub_lhs = pto.castptr %c0_i64 : i64 -> !pto.ptr - %ub_rhs = pto.castptr %c4096_i64 : i64 -> !pto.ptr - %ub_out = pto.castptr %c8192_i64 : i64 -> !pto.ptr - - %false = arith.constant false - pto.mte_gm_ub %arg0, %ub_lhs, %c0_i64, %c128_i64 - nburst(%c32_i64, %c128_i64, %c128_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 - pto.mte_gm_ub %arg1, %ub_rhs, %c0_i64, %c128_i64 - nburst(%c32_i64, %c128_i64, %c128_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 - - pto.set_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] - pto.wait_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] - - pto.vecscope { - %_:1 = scf.for %offset = %c0 to %c1024 step %c64 iter_args(%remaining = %c1024_i32) -> (i32) { - %mask, %next_remaining = pto.plt_b32 %remaining : i32 -> !pto.mask, i32 - %lhs = pto.vlds %ub_lhs[%offset] : !pto.ptr -> !pto.vreg<64xf32> - %rhs = pto.vlds %ub_rhs[%offset] : !pto.ptr -> !pto.vreg<64xf32> - %quot = pto.vdiv %lhs, %rhs, %mask : !pto.vreg<64xf32>, !pto.vreg<64xf32>, !pto.mask -> !pto.vreg<64xf32> - pto.vsts %quot, %ub_out[%offset], %mask : !pto.vreg<64xf32>, !pto.ptr, !pto.mask - scf.yield %next_remaining : i32 - } - } - - pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.mte_ub_gm %ub_out, %arg2, %c128_i64 - nburst(%c32_i64, %c128_i64, %c128_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64 - pto.barrier #pto.pipe - return - } -} diff --git a/test/vpto/cases/micro-op/binary-vector/vdiv-f32-exceptional/launch.cpp b/test/vpto/cases/micro-op/binary-vector/vdiv-f32-exceptional/launch.cpp deleted file mode 100644 index 
a68d4e95b..000000000 --- a/test/vpto/cases/micro-op/binary-vector/vdiv-f32-exceptional/launch.cpp +++ /dev/null @@ -1,46 +0,0 @@ -// Copyright (c) 2026 Huawei Technologies Co., Ltd. -// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). -// Please refer to the License for details. You may not use this file except in compliance with the License. -// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -// See LICENSE in the root of the software repository for the full text of the License. - -#ifndef __VEC_SCOPE__ -#define __VEC_SCOPE__ -#endif - -#if defined(__CCE_AICORE__) && defined(__NPU_ARCH__) && (__NPU_ARCH__ == 2201) -typedef struct { unsigned char v; } hifloat8_t; -typedef struct { unsigned char v; } float8_e4m3_t; -typedef struct { unsigned char v; } float8_e5m2_t; -typedef struct { unsigned char v; } float8_e8m0_t; -typedef struct { unsigned char v; } float4_e1m2x2_t; -typedef struct { unsigned char v; } float4_e2m1x2_t; -#endif -#include -#if defined(__CCE_AICORE__) && defined(PTOAS_ENABLE_CCE_PRINT) -#include -#endif - -#if !defined(__CCE_AICORE__) && !defined(TMRGSORT_HPP) -struct MrgSortExecutedNumList { - uint16_t mrgSortList0; - uint16_t mrgSortList1; - uint16_t mrgSortList2; - uint16_t mrgSortList3; -}; -#endif -#ifndef __CPU_SIM -#include "acl/acl.h" -#endif - -extern "C" __global__ [aicore] void vdiv_f32_exceptional_kernel_2d(__gm__ float *v1, - __gm__ float *v2, - __gm__ float *v3); - -void LaunchVdiv_f32_exceptional_kernel_2d(float *v1, float *v2, float *v3, void *stream) { - vdiv_f32_exceptional_kernel_2d<<<1, nullptr, stream>>>((__gm__ float *)v1, - (__gm__ float *)v2, - (__gm__ float *)v3); -} diff --git 
a/test/vpto/cases/micro-op/binary-vector/vdiv-f32-exceptional/main.cpp b/test/vpto/cases/micro-op/binary-vector/vdiv-f32-exceptional/main.cpp deleted file mode 100644 index d048e2faf..000000000 --- a/test/vpto/cases/micro-op/binary-vector/vdiv-f32-exceptional/main.cpp +++ /dev/null @@ -1,95 +0,0 @@ -// Copyright (c) 2026 Huawei Technologies Co., Ltd. -// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). -// Please refer to the License for details. You may not use this file except in compliance with the License. -// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -// See LICENSE in the root of the software repository for the full text of the License. - -/** -Copyright (c) 2025 Huawei Technologies Co., Ltd. 
-*/ - -#include "test_common.h" -#include "acl/acl.h" -#include -#include - -using namespace PtoTestCommon; - -#define ACL_CHECK(expr) \ - do { \ - const aclError _ret = (expr); \ - if (_ret != ACL_SUCCESS) { \ - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", #expr, \ - (int)_ret, __FILE__, __LINE__); \ - rc = 1; \ - goto cleanup; \ - } \ - } while (0) - -void LaunchVdiv_f32_exceptional_kernel_2d(float *v1, float *v2, float *v3, - void *stream); - -int main() { - size_t elemCount_v1 = 1024; - size_t fileSize_v1 = elemCount_v1 * sizeof(float); - size_t elemCount_v2 = 1024; - size_t fileSize_v2 = elemCount_v2 * sizeof(float); - size_t elemCount_v3 = 1024; - size_t fileSize_v3 = elemCount_v3 * sizeof(float); - float *v1Host = nullptr; - float *v1Device = nullptr; - float *v2Host = nullptr; - float *v2Device = nullptr; - float *v3Host = nullptr; - float *v3Device = nullptr; - int rc = 0; - bool aclInited = false; - bool deviceSet = false; - int deviceId = 0; - aclrtStream stream = nullptr; - - ACL_CHECK(aclInit(nullptr)); - aclInited = true; - if (const char *envDevice = std::getenv("ACL_DEVICE_ID")) - deviceId = std::atoi(envDevice); - ACL_CHECK(aclrtSetDevice(deviceId)); - deviceSet = true; - ACL_CHECK(aclrtCreateStream(&stream)); - ACL_CHECK(aclrtMallocHost((void **)(&v1Host), fileSize_v1)); - ACL_CHECK(aclrtMallocHost((void **)(&v2Host), fileSize_v2)); - ACL_CHECK(aclrtMallocHost((void **)(&v3Host), fileSize_v3)); - ACL_CHECK(aclrtMalloc((void **)&v1Device, fileSize_v1, ACL_MEM_MALLOC_HUGE_FIRST)); - ACL_CHECK(aclrtMalloc((void **)&v2Device, fileSize_v2, ACL_MEM_MALLOC_HUGE_FIRST)); - ACL_CHECK(aclrtMalloc((void **)&v3Device, fileSize_v3, ACL_MEM_MALLOC_HUGE_FIRST)); - ReadFile("./v1.bin", fileSize_v1, v1Host, fileSize_v1); - ReadFile("./v2.bin", fileSize_v2, v2Host, fileSize_v2); - ReadFile("./v3.bin", fileSize_v3, v3Host, fileSize_v3); - ACL_CHECK(aclrtMemcpy(v1Device, fileSize_v1, v1Host, fileSize_v1, - ACL_MEMCPY_HOST_TO_DEVICE)); - 
ACL_CHECK(aclrtMemcpy(v2Device, fileSize_v2, v2Host, fileSize_v2, - ACL_MEMCPY_HOST_TO_DEVICE)); - ACL_CHECK(aclrtMemcpy(v3Device, fileSize_v3, v3Host, fileSize_v3, - ACL_MEMCPY_HOST_TO_DEVICE)); - LaunchVdiv_f32_exceptional_kernel_2d(v1Device, v2Device, v3Device, stream); - ACL_CHECK(aclrtSynchronizeStream(stream)); - ACL_CHECK(aclrtMemcpy(v3Host, fileSize_v3, v3Device, fileSize_v3, - ACL_MEMCPY_DEVICE_TO_HOST)); - WriteFile("./v3.bin", v3Host, fileSize_v3); - -cleanup: - aclrtFree(v1Device); - aclrtFree(v2Device); - aclrtFree(v3Device); - aclrtFreeHost(v1Host); - aclrtFreeHost(v2Host); - aclrtFreeHost(v3Host); - if (stream != nullptr) - aclrtDestroyStream(stream); - if (deviceSet) - aclrtResetDevice(deviceId); - if (aclInited) - aclFinalize(); - return rc; -} diff --git a/test/vpto/cases/micro-op/binary-vector/vdiv-tail/compare.py b/test/vpto/cases/micro-op/binary-vector/vdiv-tail/compare.py deleted file mode 100644 index c95419953..000000000 --- a/test/vpto/cases/micro-op/binary-vector/vdiv-tail/compare.py +++ /dev/null @@ -1,39 +0,0 @@ -#!/usr/bin/python3 -# Copyright (c) 2026 Huawei Technologies Co., Ltd. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. 
- - -import os -import sys -import numpy as np - - -def compare_bin_prefix(golden_path, output_path, dtype, eps, count): - if not os.path.exists(golden_path) or not os.path.exists(output_path): - return False - golden = np.fromfile(golden_path, dtype=dtype, count=count) - output = np.fromfile(output_path, dtype=dtype, count=count) - return golden.shape == output.shape and np.allclose( - golden, output, atol=eps, rtol=eps, equal_nan=True - ) - - -def main(): - strict = os.getenv("COMPARE_STRICT", "1") != "0" - ok = compare_bin_prefix("golden_v3.bin", "v3.bin", np.float32, 1e-4, 1000) - if not ok: - if strict: - print("[ERROR] compare failed") - sys.exit(2) - print("[WARN] compare failed (non-gating)") - return - print("[INFO] compare passed") - - -if __name__ == "__main__": - main() diff --git a/test/vpto/cases/micro-op/binary-vector/vdiv-tail/golden.py b/test/vpto/cases/micro-op/binary-vector/vdiv-tail/golden.py deleted file mode 100644 index c010ada1f..000000000 --- a/test/vpto/cases/micro-op/binary-vector/vdiv-tail/golden.py +++ /dev/null @@ -1,51 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) 2026 Huawei Technologies Co., Ltd. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. 
- -# coding=utf-8 - -import argparse -from pathlib import Path - -import numpy as np - - -ROWS = 32 -COLS = 32 -SEED = 19 -LOGICAL_ELEMS = 1000 -OUT_SENTINEL = np.float32(-123.25) - - -def generate(output_dir: Path, seed: int) -> None: - rng = np.random.default_rng(seed) - v1 = rng.random((ROWS, COLS), dtype=np.float32) + np.float32(0.5) - v2 = rng.random((ROWS, COLS), dtype=np.float32) + np.float32(0.5) - v3 = np.full((ROWS, COLS), OUT_SENTINEL, dtype=np.float32) - golden_v3 = np.full((ROWS, COLS), OUT_SENTINEL, dtype=np.float32) - golden_v3.reshape(-1)[:LOGICAL_ELEMS] = ( - v1.reshape(-1)[:LOGICAL_ELEMS] / v2.reshape(-1)[:LOGICAL_ELEMS] - ).astype(np.float32, copy=False) - - output_dir.mkdir(parents=True, exist_ok=True) - v1.reshape(-1).tofile(output_dir / "v1.bin") - v2.reshape(-1).tofile(output_dir / "v2.bin") - v3.reshape(-1).tofile(output_dir / "v3.bin") - golden_v3.reshape(-1).tofile(output_dir / "golden_v3.bin") - - -def main() -> None: - parser = argparse.ArgumentParser() - parser.add_argument("--output-dir", type=Path, default=Path(".")) - parser.add_argument("--seed", type=int, default=SEED) - args = parser.parse_args() - generate(args.output_dir, args.seed) - - -if __name__ == "__main__": - main() diff --git a/test/vpto/cases/micro-op/binary-vector/vdiv-tail/kernel.pto b/test/vpto/cases/micro-op/binary-vector/vdiv-tail/kernel.pto deleted file mode 100644 index 774b7f31f..000000000 --- a/test/vpto/cases/micro-op/binary-vector/vdiv-tail/kernel.pto +++ /dev/null @@ -1,49 +0,0 @@ -module attributes {pto.target_arch = "a5", pto.kernel_kind = #pto.kernel_kind} { - func.func @vdiv_tail_kernel_2d(%arg0: !pto.ptr, %arg1: !pto.ptr, - %arg2: !pto.ptr) attributes {pto.kernel} { - %c0 = arith.constant 0 : index - %c64 = arith.constant 64 : index - %c1024 = arith.constant 1024 : index - %c0_i64 = arith.constant 0 : i64 - %c1_i64 = arith.constant 1 : i64 - %c32_i64 = arith.constant 32 : i64 - %c128_i64 = arith.constant 128 : i64 - %c4096_i64 = arith.constant 4096 : 
i64 - %c8192_i64 = arith.constant 8192 : i64 - %c1000_i32 = arith.constant 1000 : i32 - - %ub_lhs = pto.castptr %c0_i64 : i64 -> !pto.ptr - %ub_rhs = pto.castptr %c4096_i64 : i64 -> !pto.ptr - %ub_out = pto.castptr %c8192_i64 : i64 -> !pto.ptr - - %false = arith.constant false - pto.mte_gm_ub %arg0, %ub_lhs, %c0_i64, %c128_i64 - nburst(%c32_i64, %c128_i64, %c128_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 - pto.mte_gm_ub %arg1, %ub_rhs, %c0_i64, %c128_i64 - nburst(%c32_i64, %c128_i64, %c128_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 - - pto.set_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] - pto.wait_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] - - pto.vecscope { - %_:1 = scf.for %offset = %c0 to %c1024 step %c64 iter_args(%remaining = %c1000_i32) -> (i32) { - %mask, %next_remaining = pto.plt_b32 %remaining : i32 -> !pto.mask, i32 - %lhs = pto.vlds %ub_lhs[%offset] : !pto.ptr -> !pto.vreg<64xf32> - %rhs = pto.vlds %ub_rhs[%offset] : !pto.ptr -> !pto.vreg<64xf32> - %quot = pto.vdiv %lhs, %rhs, %mask : !pto.vreg<64xf32>, !pto.vreg<64xf32>, !pto.mask -> !pto.vreg<64xf32> - pto.vsts %quot, %ub_out[%offset], %mask : !pto.vreg<64xf32>, !pto.ptr, !pto.mask - scf.yield %next_remaining : i32 - } - } - - pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.mte_ub_gm %ub_out, %arg2, %c128_i64 - nburst(%c32_i64, %c128_i64, %c128_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64 - pto.barrier #pto.pipe - return - } -} diff --git a/test/vpto/cases/micro-op/binary-vector/vdiv-tail/launch.cpp b/test/vpto/cases/micro-op/binary-vector/vdiv-tail/launch.cpp deleted file mode 100644 index 85826b5f8..000000000 --- a/test/vpto/cases/micro-op/binary-vector/vdiv-tail/launch.cpp +++ /dev/null @@ -1,46 +0,0 @@ -// Copyright (c) 2026 Huawei Technologies Co., Ltd. 
-// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). -// Please refer to the License for details. You may not use this file except in compliance with the License. -// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -// See LICENSE in the root of the software repository for the full text of the License. - -#ifndef __VEC_SCOPE__ -#define __VEC_SCOPE__ -#endif - -#if defined(__CCE_AICORE__) && defined(__NPU_ARCH__) && (__NPU_ARCH__ == 2201) -typedef struct { unsigned char v; } hifloat8_t; -typedef struct { unsigned char v; } float8_e4m3_t; -typedef struct { unsigned char v; } float8_e5m2_t; -typedef struct { unsigned char v; } float8_e8m0_t; -typedef struct { unsigned char v; } float4_e1m2x2_t; -typedef struct { unsigned char v; } float4_e2m1x2_t; -#endif -#include -#if defined(__CCE_AICORE__) && defined(PTOAS_ENABLE_CCE_PRINT) -#include -#endif - -#if !defined(__CCE_AICORE__) && !defined(TMRGSORT_HPP) -struct MrgSortExecutedNumList { - uint16_t mrgSortList0; - uint16_t mrgSortList1; - uint16_t mrgSortList2; - uint16_t mrgSortList3; -}; -#endif -#ifndef __CPU_SIM -#include "acl/acl.h" -#endif - -extern "C" __global__ [aicore] void vdiv_tail_kernel_2d(__gm__ float *v1, - __gm__ float *v2, - __gm__ float *v3); - -void LaunchVdiv_tail_kernel_2d(float *v1, float *v2, float *v3, void *stream) { - vdiv_tail_kernel_2d<<<1, nullptr, stream>>>((__gm__ float *)v1, - (__gm__ float *)v2, - (__gm__ float *)v3); -} diff --git a/test/vpto/cases/micro-op/binary-vector/vdiv-tail/main.cpp b/test/vpto/cases/micro-op/binary-vector/vdiv-tail/main.cpp deleted file mode 100644 index 3f3dcf515..000000000 --- a/test/vpto/cases/micro-op/binary-vector/vdiv-tail/main.cpp +++ /dev/null @@ -1,94 +0,0 @@ -// 
Copyright (c) 2026 Huawei Technologies Co., Ltd. -// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). -// Please refer to the License for details. You may not use this file except in compliance with the License. -// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -// See LICENSE in the root of the software repository for the full text of the License. - -/** -Copyright (c) 2025 Huawei Technologies Co., Ltd. -*/ - -#include "test_common.h" -#include "acl/acl.h" -#include -#include - -using namespace PtoTestCommon; - -#define ACL_CHECK(expr) \ - do { \ - const aclError _ret = (expr); \ - if (_ret != ACL_SUCCESS) { \ - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", #expr, \ - (int)_ret, __FILE__, __LINE__); \ - rc = 1; \ - goto cleanup; \ - } \ - } while (0) - -void LaunchVdiv_tail_kernel_2d(float *v1, float *v2, float *v3, void *stream); - -int main() { - size_t elemCount_v1 = 1024; - size_t fileSize_v1 = elemCount_v1 * sizeof(float); - size_t elemCount_v2 = 1024; - size_t fileSize_v2 = elemCount_v2 * sizeof(float); - size_t elemCount_v3 = 1024; - size_t fileSize_v3 = elemCount_v3 * sizeof(float); - float *v1Host = nullptr; - float *v1Device = nullptr; - float *v2Host = nullptr; - float *v2Device = nullptr; - float *v3Host = nullptr; - float *v3Device = nullptr; - int rc = 0; - bool aclInited = false; - bool deviceSet = false; - int deviceId = 0; - aclrtStream stream = nullptr; - - ACL_CHECK(aclInit(nullptr)); - aclInited = true; - if (const char *envDevice = std::getenv("ACL_DEVICE_ID")) - deviceId = std::atoi(envDevice); - ACL_CHECK(aclrtSetDevice(deviceId)); - deviceSet = true; - ACL_CHECK(aclrtCreateStream(&stream)); - ACL_CHECK(aclrtMallocHost((void **)(&v1Host), 
fileSize_v1)); - ACL_CHECK(aclrtMallocHost((void **)(&v2Host), fileSize_v2)); - ACL_CHECK(aclrtMallocHost((void **)(&v3Host), fileSize_v3)); - ACL_CHECK(aclrtMalloc((void **)&v1Device, fileSize_v1, ACL_MEM_MALLOC_HUGE_FIRST)); - ACL_CHECK(aclrtMalloc((void **)&v2Device, fileSize_v2, ACL_MEM_MALLOC_HUGE_FIRST)); - ACL_CHECK(aclrtMalloc((void **)&v3Device, fileSize_v3, ACL_MEM_MALLOC_HUGE_FIRST)); - ReadFile("./v1.bin", fileSize_v1, v1Host, fileSize_v1); - ReadFile("./v2.bin", fileSize_v2, v2Host, fileSize_v2); - ReadFile("./v3.bin", fileSize_v3, v3Host, fileSize_v3); - ACL_CHECK(aclrtMemcpy(v1Device, fileSize_v1, v1Host, fileSize_v1, - ACL_MEMCPY_HOST_TO_DEVICE)); - ACL_CHECK(aclrtMemcpy(v2Device, fileSize_v2, v2Host, fileSize_v2, - ACL_MEMCPY_HOST_TO_DEVICE)); - ACL_CHECK(aclrtMemcpy(v3Device, fileSize_v3, v3Host, fileSize_v3, - ACL_MEMCPY_HOST_TO_DEVICE)); - LaunchVdiv_tail_kernel_2d(v1Device, v2Device, v3Device, stream); - ACL_CHECK(aclrtSynchronizeStream(stream)); - ACL_CHECK(aclrtMemcpy(v3Host, fileSize_v3, v3Device, fileSize_v3, - ACL_MEMCPY_DEVICE_TO_HOST)); - WriteFile("./v3.bin", v3Host, fileSize_v3); - -cleanup: - aclrtFree(v1Device); - aclrtFree(v2Device); - aclrtFree(v3Device); - aclrtFreeHost(v1Host); - aclrtFreeHost(v2Host); - aclrtFreeHost(v3Host); - if (stream != nullptr) - aclrtDestroyStream(stream); - if (deviceSet) - aclrtResetDevice(deviceId); - if (aclInited) - aclFinalize(); - return rc; -} diff --git a/test/vpto/cases/micro-op/binary-vector/vdiv/compare.py b/test/vpto/cases/micro-op/binary-vector/vdiv/compare.py index a5f14dabc..86a92e12c 100644 --- a/test/vpto/cases/micro-op/binary-vector/vdiv/compare.py +++ b/test/vpto/cases/micro-op/binary-vector/vdiv/compare.py @@ -7,31 +7,55 @@ # INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. # See LICENSE in the root of the software repository for the full text of the License. - -import os -import sys +# Merged vdiv compare. 
+import os, sys import numpy as np - -def compare_bin(golden_path, output_path, dtype, eps): - if not os.path.exists(golden_path) or not os.path.exists(output_path): +def _cmp(golden, output, dtype, eps, count=-1): + if not os.path.exists(golden) or not os.path.exists(output): return False - golden = np.fromfile(golden_path, dtype=dtype) - output = np.fromfile(output_path, dtype=dtype) - return golden.shape == output.shape and np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True) + kw = {} if count < 0 else {"count": count} + g = np.fromfile(golden, dtype=dtype, **kw) + o = np.fromfile(output, dtype=dtype, **kw) + return g.shape == o.shape and np.allclose(g, o, atol=eps, rtol=eps, equal_nan=True) +def _cmpeq(golden, output, dtype): + if not os.path.exists(golden) or not os.path.exists(output): + return False + g = np.fromfile(golden, dtype=dtype) + o = np.fromfile(output, dtype=dtype) + return g.shape == o.shape and np.array_equal(g, o) def main(): strict = os.getenv("COMPARE_STRICT", "1") != "0" - ok = compare_bin("golden_v3.bin", "v3.bin", np.float32, 1e-4) - if not ok: + failed = [] + if not (_cmp("golden_v3.bin","v3.bin",np.float32,1e-4,-1)): + failed.append('f32') + print('[ERROR] compare failed: f32') + else: + print('[INFO] f32: passed') + if not (_cmp("golden_v3_f16.bin","v3_f16.bin",np.float16,5e-3,1024)): + failed.append('f16') + print('[ERROR] compare failed: f16') + else: + print('[INFO] f16: passed') + if not (_cmp("golden_v3_f32_exceptional.bin","v3_f32_exceptional.bin",np.float32,1e-4,-1)): + failed.append('f32_exceptional') + print('[ERROR] compare failed: f32_exceptional') + else: + print('[INFO] f32_exceptional: passed') + if not (_cmp("golden_v3_tail.bin","v3_tail.bin",np.float32,1e-4,1000)): + failed.append('tail') + print('[ERROR] compare failed: tail') + else: + print('[INFO] tail: passed') + if failed: if strict: - print("[ERROR] compare failed") + print(f"[ERROR] {len(failed)} variant(s) failed: {','.join(failed)}") sys.exit(2) - 
print("[WARN] compare failed (non-gating)") + print(f"[WARN] {len(failed)} variant(s) failed (non-gating): {','.join(failed)}") return - print("[INFO] compare passed") - + print("[INFO] compare passed (all 4 variants)") if __name__ == "__main__": main() diff --git a/test/vpto/cases/micro-op/binary-vector/vdiv/golden.py b/test/vpto/cases/micro-op/binary-vector/vdiv/golden.py index ea43aa613..6b485b897 100644 --- a/test/vpto/cases/micro-op/binary-vector/vdiv/golden.py +++ b/test/vpto/cases/micro-op/binary-vector/vdiv/golden.py @@ -7,43 +7,85 @@ # INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. # See LICENSE in the root of the software repository for the full text of the License. -# coding=utf-8 - +# Merged vdiv golden data generator. import argparse from pathlib import Path - import numpy as np - ROWS = 32 COLS = 32 SEED = 19 +LOGICAL_ELEMS = 1000 +OUT_SENTINEL = np.float32(-123.25) +def f32_to_bf16_bits(values): + wide = values.astype(np.float32, copy=False).view(np.uint32) + rounding = np.uint32(0x7FFF) + ((wide >> 16) & np.uint32(1)) + return ((wide + rounding) >> 16).astype(np.uint16) -def generate(output_dir: Path, seed: int) -> None: - rng = np.random.default_rng(seed) +def bf16_bits_to_f32(bits): + return (bits.astype(np.uint32) << 16).view(np.float32) + +def gen_f32(out, rng): v1 = rng.uniform(-8.0, 8.0, size=(ROWS, COLS)).astype(np.float32) - v2_mag = rng.uniform(0.5, 8.0, size=(ROWS, COLS)).astype(np.float32) - v2_sign = np.where(rng.integers(0, 2, size=(ROWS, COLS), dtype=np.int32) == 0, - np.float32(-1.0), np.float32(1.0)) - v2 = (v2_mag * v2_sign).astype(np.float32, copy=False) - golden_v3 = (v1 / v2).astype(np.float32, copy=False) + v2 = rng.uniform(-8.0, 8.0, size=(ROWS, COLS)).astype(np.float32) + g = (np.divide(v1, v2, where=v2!=0, out=np.full_like(v1, np.nan))).astype(np.float32, copy=False) v3 = np.zeros((ROWS, COLS), dtype=np.float32) + v1.reshape(-1).tofile(out / "v1.bin") + 
v2.reshape(-1).tofile(out / "v2.bin") + v3.reshape(-1).tofile(out / "v3.bin") + g.reshape(-1).tofile(out / "golden_v3.bin") + +def gen_f16(out, rng): + v1 = rng.uniform(-8.0, 8.0, size=(ROWS, COLS)).astype(np.float16) + v2 = rng.uniform(-8.0, 8.0, size=(ROWS, COLS)).astype(np.float16) + g = ((v1.astype(np.float32) / np.maximum(np.abs(v2.astype(np.float32)), 1e-8) * np.sign(v2.astype(np.float32))).astype(np.float16)).astype(np.float16) + v3 = np.zeros((ROWS, COLS), dtype=np.float16) + v1.reshape(-1).tofile(out / "v1_f16.bin") + v2.reshape(-1).tofile(out / "v2_f16.bin") + v3.reshape(-1).tofile(out / "v3_f16.bin") + g.reshape(-1).tofile(out / "golden_v3_f16.bin") - output_dir.mkdir(parents=True, exist_ok=True) - v1.reshape(-1).tofile(output_dir / "v1.bin") - v2.reshape(-1).tofile(output_dir / "v2.bin") - v3.reshape(-1).tofile(output_dir / "v3.bin") - golden_v3.reshape(-1).tofile(output_dir / "golden_v3.bin") +def gen_f32_exceptional(out, rng): + specials_a = np.array([-np.inf, -7.5, -0.0, 0.0, 1.0, np.inf, np.nan, 3.5], dtype=np.float32) + specials_b = np.array([np.inf, 2.5, 0.0, -0.0, -1.0, -np.inf, 1.0, np.nan], dtype=np.float32) + v1 = np.resize(specials_a, ROWS * COLS).reshape(ROWS, COLS).astype(np.float32) + v2 = np.resize(specials_b, ROWS * COLS).reshape(ROWS, COLS).astype(np.float32) + g = (np.divide(v1, v2, where=v2!=0, out=np.full_like(v1, np.nan))).astype(np.float32, copy=False) + v3 = np.zeros((ROWS, COLS), dtype=np.float32) + v1.reshape(-1).tofile(out / "v1_f32_exceptional.bin") + v2.reshape(-1).tofile(out / "v2_f32_exceptional.bin") + v3.reshape(-1).tofile(out / "v3_f32_exceptional.bin") + g.reshape(-1).tofile(out / "golden_v3_f32_exceptional.bin") +def gen_tail(out, rng): + v1 = rng.random((ROWS, COLS), dtype=np.float32) + v2 = rng.random((ROWS, COLS), dtype=np.float32) + v3 = np.full((ROWS, COLS), OUT_SENTINEL, dtype=np.float32) + g = np.full((ROWS, COLS), OUT_SENTINEL, dtype=np.float32) + g.reshape(-1)[:LOGICAL_ELEMS] = 
(np.divide(v1.reshape(-1)[:LOGICAL_ELEMS], v2.reshape(-1)[:LOGICAL_ELEMS], where=v2.reshape(-1)[:LOGICAL_ELEMS]!=0, out=np.full(LOGICAL_ELEMS, np.nan))).astype(np.float32, copy=False) + v1.reshape(-1).tofile(out / "v1_tail.bin") + v2.reshape(-1).tofile(out / "v2_tail.bin") + v3.reshape(-1).tofile(out / "v3_tail.bin") + g.reshape(-1).tofile(out / "golden_v3_tail.bin") -def main() -> None: - parser = argparse.ArgumentParser() - parser.add_argument("--output-dir", type=Path, default=Path(".")) - parser.add_argument("--seed", type=int, default=SEED) - args = parser.parse_args() - generate(args.output_dir, args.seed) +GENERATORS = [ + gen_f32, + gen_f16, + gen_f32_exceptional, + gen_tail, +] +def main(): + p = argparse.ArgumentParser() + p.add_argument("--output-dir", type=Path, default=Path(".")) + p.add_argument("--seed", type=int, default=SEED) + a = p.parse_args() + rng = np.random.default_rng(a.seed) + out = a.output_dir + out.mkdir(parents=True, exist_ok=True) + for gen in GENERATORS: + gen(out, rng) if __name__ == "__main__": main() diff --git a/test/vpto/cases/micro-op/binary-vector/vdiv/kernel.pto b/test/vpto/cases/micro-op/binary-vector/vdiv/kernel.pto index 2b6849a8b..9246b182c 100644 --- a/test/vpto/cases/micro-op/binary-vector/vdiv/kernel.pto +++ b/test/vpto/cases/micro-op/binary-vector/vdiv/kernel.pto @@ -1,50 +1,190 @@ module attributes {pto.target_arch = "a5", pto.kernel_kind = #pto.kernel_kind} { - func.func @div_kernel_2d(%arg0: !pto.ptr, %arg1: !pto.ptr, - %arg2: !pto.ptr) attributes {pto.kernel} { - %c0 = arith.constant 0 : index - %c1 = arith.constant 1 : index - %c64 = arith.constant 64 : index - %c1024 = arith.constant 1024 : index - %c0_i64 = arith.constant 0 : i64 - %c1_i64 = arith.constant 1 : i64 - %c32_i64 = arith.constant 32 : i64 - %c128_i64 = arith.constant 128 : i64 - %c4096_i64 = arith.constant 4096 : i64 - %c8192_i64 = arith.constant 8192 : i64 - %c1024_i32 = arith.constant 1024 : i32 - - %ub_lhs = pto.castptr %c0_i64 : i64 -> !pto.ptr 
- %ub_rhs = pto.castptr %c4096_i64 : i64 -> !pto.ptr - %ub_out = pto.castptr %c8192_i64 : i64 -> !pto.ptr - - %false = arith.constant false - pto.mte_gm_ub %arg0, %ub_lhs, %c0_i64, %c128_i64 - nburst(%c32_i64, %c128_i64, %c128_i64) + func.func @vdiv_deep_merged_kernel(%arg0: !pto.ptr, %arg1: !pto.ptr, %arg2: !pto.ptr, %arg3: !pto.ptr, %arg4: !pto.ptr, %arg5: !pto.ptr, %arg6: !pto.ptr, %arg7: !pto.ptr, %arg8: !pto.ptr, %arg9: !pto.ptr, %arg10: !pto.ptr, %arg11: !pto.ptr) attributes {pto.kernel} { + // merged from div_kernel_2d via LaunchDiv_kernel_2d + + %c0_m0 = arith.constant 0 : index + %c1_m0 = arith.constant 1 : index + %c64_m0 = arith.constant 64 : index + %c1024_m0 = arith.constant 1024 : index + %c0_i64_m0 = arith.constant 0 : i64 + %c1_i64_m0 = arith.constant 1 : i64 + %c32_i64_m0 = arith.constant 32 : i64 + %c128_i64_m0 = arith.constant 128 : i64 + %c4096_i64_m0 = arith.constant 4096 : i64 + %c8192_i64_m0 = arith.constant 8192 : i64 + %c1024_i32_m0 = arith.constant 1024 : i32 + + %ub_lhs_m0 = pto.castptr %c0_i64_m0 : i64 -> !pto.ptr + %ub_rhs_m0 = pto.castptr %c4096_i64_m0 : i64 -> !pto.ptr + %ub_out_m0 = pto.castptr %c8192_i64_m0 : i64 -> !pto.ptr + + %false_m0 = arith.constant false + pto.mte_gm_ub %arg0, %ub_lhs_m0, %c0_i64_m0, %c128_i64_m0 + nburst(%c32_i64_m0, %c128_i64_m0, %c128_i64_m0) : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 - pto.mte_gm_ub %arg1, %ub_rhs, %c0_i64, %c128_i64 - nburst(%c32_i64, %c128_i64, %c128_i64) + pto.mte_gm_ub %arg1, %ub_rhs_m0, %c0_i64_m0, %c128_i64_m0 + nburst(%c32_i64_m0, %c128_i64_m0, %c128_i64_m0) : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 pto.set_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] pto.wait_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] pto.vecscope { - %_:1 = scf.for %offset = %c0 to %c1024 step %c64 iter_args(%remaining = %c1024_i32) -> (i32) { - %mask, %next_remaining = pto.plt_b32 %remaining : i32 -> !pto.mask, i32 - %lhs = pto.vlds %ub_lhs[%offset] : !pto.ptr -> !pto.vreg<64xf32> - %rhs = pto.vlds 
%ub_rhs[%offset] : !pto.ptr -> !pto.vreg<64xf32> - %quot = pto.vdiv %lhs, %rhs, %mask : !pto.vreg<64xf32>, !pto.vreg<64xf32>, !pto.mask -> !pto.vreg<64xf32> - pto.vsts %quot, %ub_out[%offset], %mask : !pto.vreg<64xf32>, !pto.ptr, !pto.mask - scf.yield %next_remaining : i32 + %__m0:1 = scf.for %offset_m0 = %c0_m0 to %c1024_m0 step %c64_m0 iter_args(%remaining_m0 = %c1024_i32_m0) -> (i32) { + %mask_m0, %next_remaining_m0 = pto.plt_b32 %remaining_m0 : i32 -> !pto.mask, i32 + %lhs_m0 = pto.vlds %ub_lhs_m0[%offset_m0] : !pto.ptr -> !pto.vreg<64xf32> + %rhs_m0 = pto.vlds %ub_rhs_m0[%offset_m0] : !pto.ptr -> !pto.vreg<64xf32> + %quot_m0 = pto.vdiv %lhs_m0, %rhs_m0, %mask_m0 : !pto.vreg<64xf32>, !pto.vreg<64xf32>, !pto.mask -> !pto.vreg<64xf32> + pto.vsts %quot_m0, %ub_out_m0[%offset_m0], %mask_m0 : !pto.vreg<64xf32>, !pto.ptr, !pto.mask + scf.yield %next_remaining_m0 : i32 } } pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.mte_ub_gm %ub_out, %arg2, %c128_i64 - nburst(%c32_i64, %c128_i64, %c128_i64) + pto.mte_ub_gm %ub_out_m0, %arg2, %c128_i64_m0 + nburst(%c32_i64_m0, %c128_i64_m0, %c128_i64_m0) : !pto.ptr, !pto.ptr, i64, i64, i64, i64 pto.barrier #pto.pipe + + // merged from vdiv_f16_kernel via LaunchVdiv_f16_kernel + + %c0_m1 = arith.constant 0 : index + %c128_m1 = arith.constant 128 : index + %c1024_m1 = arith.constant 1024 : index + %c0_i64_m1 = arith.constant 0 : i64 + %c1_i64_m1 = arith.constant 1 : i64 + %c32_i64_m1 = arith.constant 32 : i64 + %c64_i64_m1 = arith.constant 64 : i64 + %c2048_i64_m1 = arith.constant 2048 : i64 + %c4096_i64_m1 = arith.constant 4096 : i64 + %c1024_i32_m1 = arith.constant 1024 : i32 + + %ub_lhs_m1 = pto.castptr %c0_i64_m1 : i64 -> !pto.ptr + %ub_rhs_m1 = pto.castptr %c2048_i64_m1 : i64 -> !pto.ptr + %ub_out_m1 = pto.castptr %c4096_i64_m1 : i64 -> !pto.ptr + + %false_m1 = arith.constant false + pto.mte_gm_ub %arg3, %ub_lhs_m1, %c0_i64_m1, %c64_i64_m1 + nburst(%c32_i64_m1, 
%c64_i64_m1, %c64_i64_m1) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 + pto.mte_gm_ub %arg4, %ub_rhs_m1, %c0_i64_m1, %c64_i64_m1 + nburst(%c32_i64_m1, %c64_i64_m1, %c64_i64_m1) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 + + pto.set_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] + pto.wait_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] + + pto.vecscope { + %__m1:1 = scf.for %offset_m1 = %c0_m1 to %c1024_m1 step %c128_m1 iter_args(%remaining_m1 = %c1024_i32_m1) -> (i32) { + %mask_m1, %next_remaining_m1 = pto.plt_b16 %remaining_m1 : i32 -> !pto.mask, i32 + %lhs_m1 = pto.vlds %ub_lhs_m1[%offset_m1] : !pto.ptr -> !pto.vreg<128xf16> + %rhs_m1 = pto.vlds %ub_rhs_m1[%offset_m1] : !pto.ptr -> !pto.vreg<128xf16> + %quot_m1 = pto.vdiv %lhs_m1, %rhs_m1, %mask_m1 : !pto.vreg<128xf16>, !pto.vreg<128xf16>, !pto.mask -> !pto.vreg<128xf16> + pto.vsts %quot_m1, %ub_out_m1[%offset_m1], %mask_m1 : !pto.vreg<128xf16>, !pto.ptr, !pto.mask + scf.yield %next_remaining_m1 : i32 + } + } + + pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] + pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] + pto.mte_ub_gm %ub_out_m1, %arg5, %c64_i64_m1 + nburst(%c32_i64_m1, %c64_i64_m1, %c64_i64_m1) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64 + pto.barrier #pto.pipe + + // merged from vdiv_f32_exceptional_kernel_2d via LaunchVdiv_f32_exceptional_kernel_2d + + %c0_m2 = arith.constant 0 : index + %c64_m2 = arith.constant 64 : index + %c1024_m2 = arith.constant 1024 : index + %c0_i64_m2 = arith.constant 0 : i64 + %c1_i64_m2 = arith.constant 1 : i64 + %c32_i64_m2 = arith.constant 32 : i64 + %c128_i64_m2 = arith.constant 128 : i64 + %c4096_i64_m2 = arith.constant 4096 : i64 + %c8192_i64_m2 = arith.constant 8192 : i64 + %c1024_i32_m2 = arith.constant 1024 : i32 + + %ub_lhs_m2 = pto.castptr %c0_i64_m2 : i64 -> !pto.ptr + %ub_rhs_m2 = pto.castptr %c4096_i64_m2 : i64 -> !pto.ptr + %ub_out_m2 = pto.castptr %c8192_i64_m2 : i64 -> !pto.ptr + + %false_m2 = arith.constant false + pto.mte_gm_ub %arg6, %ub_lhs_m2, %c0_i64_m2, 
%c128_i64_m2 + nburst(%c32_i64_m2, %c128_i64_m2, %c128_i64_m2) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 + pto.mte_gm_ub %arg7, %ub_rhs_m2, %c0_i64_m2, %c128_i64_m2 + nburst(%c32_i64_m2, %c128_i64_m2, %c128_i64_m2) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 + + pto.set_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] + pto.wait_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] + + pto.vecscope { + %__m2:1 = scf.for %offset_m2 = %c0_m2 to %c1024_m2 step %c64_m2 iter_args(%remaining_m2 = %c1024_i32_m2) -> (i32) { + %mask_m2, %next_remaining_m2 = pto.plt_b32 %remaining_m2 : i32 -> !pto.mask, i32 + %lhs_m2 = pto.vlds %ub_lhs_m2[%offset_m2] : !pto.ptr -> !pto.vreg<64xf32> + %rhs_m2 = pto.vlds %ub_rhs_m2[%offset_m2] : !pto.ptr -> !pto.vreg<64xf32> + %quot_m2 = pto.vdiv %lhs_m2, %rhs_m2, %mask_m2 : !pto.vreg<64xf32>, !pto.vreg<64xf32>, !pto.mask -> !pto.vreg<64xf32> + pto.vsts %quot_m2, %ub_out_m2[%offset_m2], %mask_m2 : !pto.vreg<64xf32>, !pto.ptr, !pto.mask + scf.yield %next_remaining_m2 : i32 + } + } + + pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] + pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] + pto.mte_ub_gm %ub_out_m2, %arg8, %c128_i64_m2 + nburst(%c32_i64_m2, %c128_i64_m2, %c128_i64_m2) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64 + pto.barrier #pto.pipe + + // merged from vdiv_tail_kernel_2d via LaunchVdiv_tail_kernel_2d + + %c0_m3 = arith.constant 0 : index + %c64_m3 = arith.constant 64 : index + %c1024_m3 = arith.constant 1024 : index + %c0_i64_m3 = arith.constant 0 : i64 + %c1_i64_m3 = arith.constant 1 : i64 + %c32_i64_m3 = arith.constant 32 : i64 + %c128_i64_m3 = arith.constant 128 : i64 + %c4096_i64_m3 = arith.constant 4096 : i64 + %c8192_i64_m3 = arith.constant 8192 : i64 + %c1000_i32_m3 = arith.constant 1000 : i32 + + %ub_lhs_m3 = pto.castptr %c0_i64_m3 : i64 -> !pto.ptr + %ub_rhs_m3 = pto.castptr %c4096_i64_m3 : i64 -> !pto.ptr + %ub_out_m3 = pto.castptr %c8192_i64_m3 : i64 -> !pto.ptr + + %false_m3 = arith.constant false + pto.mte_gm_ub %arg9, 
%ub_lhs_m3, %c0_i64_m3, %c128_i64_m3 + nburst(%c32_i64_m3, %c128_i64_m3, %c128_i64_m3) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 + pto.mte_gm_ub %arg10, %ub_rhs_m3, %c0_i64_m3, %c128_i64_m3 + nburst(%c32_i64_m3, %c128_i64_m3, %c128_i64_m3) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 + + pto.set_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] + pto.wait_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] + + pto.vecscope { + %__m3:1 = scf.for %offset_m3 = %c0_m3 to %c1024_m3 step %c64_m3 iter_args(%remaining_m3 = %c1000_i32_m3) -> (i32) { + %mask_m3, %next_remaining_m3 = pto.plt_b32 %remaining_m3 : i32 -> !pto.mask, i32 + %lhs_m3 = pto.vlds %ub_lhs_m3[%offset_m3] : !pto.ptr -> !pto.vreg<64xf32> + %rhs_m3 = pto.vlds %ub_rhs_m3[%offset_m3] : !pto.ptr -> !pto.vreg<64xf32> + %quot_m3 = pto.vdiv %lhs_m3, %rhs_m3, %mask_m3 : !pto.vreg<64xf32>, !pto.vreg<64xf32>, !pto.mask -> !pto.vreg<64xf32> + pto.vsts %quot_m3, %ub_out_m3[%offset_m3], %mask_m3 : !pto.vreg<64xf32>, !pto.ptr, !pto.mask + scf.yield %next_remaining_m3 : i32 + } + } + + pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] + pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] + pto.mte_ub_gm %ub_out_m3, %arg11, %c128_i64_m3 + nburst(%c32_i64_m3, %c128_i64_m3, %c128_i64_m3) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64 + pto.barrier #pto.pipe + return } } diff --git a/test/vpto/cases/micro-op/binary-vector/vdiv/launch.cpp b/test/vpto/cases/micro-op/binary-vector/vdiv/launch.cpp index fd82d2921..63a93ddee 100644 --- a/test/vpto/cases/micro-op/binary-vector/vdiv/launch.cpp +++ b/test/vpto/cases/micro-op/binary-vector/vdiv/launch.cpp @@ -5,11 +5,10 @@ // THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, // INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. // See LICENSE in the root of the software repository for the full text of the License. 
- +// Merged launch wrappers #ifndef __VEC_SCOPE__ #define __VEC_SCOPE__ #endif - #if defined(__CCE_AICORE__) && defined(__NPU_ARCH__) && (__NPU_ARCH__ == 2201) typedef struct { unsigned char v; } hifloat8_t; typedef struct { unsigned char v; } float8_e4m3_t; @@ -22,14 +21,8 @@ typedef struct { unsigned char v; } float4_e2m1x2_t; #if defined(__CCE_AICORE__) && defined(PTOAS_ENABLE_CCE_PRINT) #include #endif - #if !defined(__CCE_AICORE__) && !defined(TMRGSORT_HPP) -struct MrgSortExecutedNumList { - uint16_t mrgSortList0; - uint16_t mrgSortList1; - uint16_t mrgSortList2; - uint16_t mrgSortList3; -}; +struct MrgSortExecutedNumList { uint16_t mrgSortList0,mrgSortList1,mrgSortList2,mrgSortList3; }; #endif #ifndef __CPU_SIM #include "acl/acl.h" @@ -39,8 +32,32 @@ extern "C" __global__ [aicore] void div_kernel_2d(__gm__ float *v1, __gm__ float *v2, __gm__ float *v3); -void LaunchDiv_kernel_2d(float *v1, float *v2, float *v3, void *stream) { - div_kernel_2d<<<1, nullptr, stream>>>((__gm__ float *)v1, - (__gm__ float *)v2, - (__gm__ float *)v3); +extern "C" __global__ [aicore] void vdiv_deep_merged_kernel( + __gm__ float * arg0, + __gm__ float * arg1, + __gm__ float * arg2, + __gm__ half * arg3, + __gm__ half * arg4, + __gm__ half * arg5, + __gm__ float * arg6, + __gm__ float * arg7, + __gm__ float * arg8, + __gm__ float * arg9, + __gm__ float * arg10, + __gm__ float * arg11); + +void LaunchVdivDeepMerged(float * p0, float * p1, float * p2, uint16_t * p3, uint16_t * p4, uint16_t * p5, float * p6, float * p7, float * p8, float * p9, float * p10, float * p11, void *stream) { + vdiv_deep_merged_kernel<<<1, nullptr, stream>>>( + (__gm__ float *)p0, + (__gm__ float *)p1, + (__gm__ float *)p2, + (__gm__ half *)p3, + (__gm__ half *)p4, + (__gm__ half *)p5, + (__gm__ float *)p6, + (__gm__ float *)p7, + (__gm__ float *)p8, + (__gm__ float *)p9, + (__gm__ float *)p10, + (__gm__ float *)p11); } diff --git a/test/vpto/cases/micro-op/binary-vector/vdiv/main.cpp 
b/test/vpto/cases/micro-op/binary-vector/vdiv/main.cpp index 3972f99b3..45b2027ee 100644 --- a/test/vpto/cases/micro-op/binary-vector/vdiv/main.cpp +++ b/test/vpto/cases/micro-op/binary-vector/vdiv/main.cpp @@ -5,90 +5,149 @@ // THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, // INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. // See LICENSE in the root of the software repository for the full text of the License. - -/** -Copyright (c) 2025 Huawei Technologies Co., Ltd. -*/ +// Merged vdiv host runner. #include "test_common.h" #include "acl/acl.h" #include #include - +#include using namespace PtoTestCommon; -#define ACL_CHECK(expr) \ - do { \ - const aclError _ret = (expr); \ - if (_ret != ACL_SUCCESS) { \ - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", #expr, \ - (int)_ret, __FILE__, __LINE__); \ - rc = 1; \ - goto cleanup; \ - } \ - } while (0) +#define ACL_CHECK(expr) do { \ + const aclError _ret = (expr); \ + if (_ret != ACL_SUCCESS) { std::fprintf(stderr,"[ERROR] %s:%d acle=%d\n",#expr,__LINE__,(int)_ret); rc=1; goto cleanup; } \ +} while(0) + +#define FCK(expr,path) do { if(!(expr)){std::fprintf(stderr,"[ERROR] file:%s\n",path);rc=1;goto cleanup;} } while(0) -void LaunchDiv_kernel_2d(float *v1, float *v2, float *v3, void *stream); + +void LaunchVdivDeepMerged(float * p0, float * p1, float * p2, uint16_t * p3, uint16_t * p4, uint16_t * p5, float * p6, float * p7, float * p8, float * p9, float * p10, float * p11, void *stream); int main() { - size_t elemCount_v1 = 1024; - size_t fileSize_v1 = elemCount_v1 * sizeof(float); - size_t elemCount_v2 = 1024; - size_t fileSize_v2 = elemCount_v2 * sizeof(float); - size_t elemCount_v3 = 1024; - size_t fileSize_v3 = elemCount_v3 * sizeof(float); - float *v1Host = nullptr; - float *v1Device = nullptr; - float *v2Host = nullptr; - float *v2Device = nullptr; - float *v3Host = nullptr; - float *v3Device = 
nullptr; - int rc = 0; - bool aclInited = false; - bool deviceSet = false; - int deviceId = 0; - aclrtStream stream = nullptr; + constexpr size_t SZ_f32 = 4096; + constexpr size_t SZ_f16 = 2048; + constexpr size_t SZ_f32_exceptional = 4096; + constexpr size_t SZ_tail = 4096; + + float *h_f32_v1=nullptr, *d_f32_v1=nullptr; + float *h_f32_v2=nullptr, *d_f32_v2=nullptr; + float *h_f32_v3=nullptr, *d_f32_v3=nullptr; + uint16_t *h_f16_v1=nullptr, *d_f16_v1=nullptr; + uint16_t *h_f16_v2=nullptr, *d_f16_v2=nullptr; + uint16_t *h_f16_v3=nullptr, *d_f16_v3=nullptr; + float *h_f32_exceptional_v1=nullptr, *d_f32_exceptional_v1=nullptr; + float *h_f32_exceptional_v2=nullptr, *d_f32_exceptional_v2=nullptr; + float *h_f32_exceptional_v3=nullptr, *d_f32_exceptional_v3=nullptr; + float *h_tail_v1=nullptr, *d_tail_v1=nullptr; + float *h_tail_v2=nullptr, *d_tail_v2=nullptr; + float *h_tail_v3=nullptr, *d_tail_v3=nullptr; - ACL_CHECK(aclInit(nullptr)); - aclInited = true; - if (const char *envDevice = std::getenv("ACL_DEVICE_ID")) - deviceId = std::atoi(envDevice); - ACL_CHECK(aclrtSetDevice(deviceId)); - deviceSet = true; + int rc=0; bool aclInited=false,deviceSet=false; int deviceId=0; aclrtStream stream=nullptr; size_t fsz=0; + ACL_CHECK(aclInit(nullptr)); aclInited=true; + if(const char*e=std::getenv("ACL_DEVICE_ID")) deviceId=std::atoi(e); + ACL_CHECK(aclrtSetDevice(deviceId)); deviceSet=true; ACL_CHECK(aclrtCreateStream(&stream)); - ACL_CHECK(aclrtMallocHost((void **)(&v1Host), fileSize_v1)); - ACL_CHECK(aclrtMallocHost((void **)(&v2Host), fileSize_v2)); - ACL_CHECK(aclrtMallocHost((void **)(&v3Host), fileSize_v3)); - ACL_CHECK(aclrtMalloc((void **)&v1Device, fileSize_v1, ACL_MEM_MALLOC_HUGE_FIRST)); - ACL_CHECK(aclrtMalloc((void **)&v2Device, fileSize_v2, ACL_MEM_MALLOC_HUGE_FIRST)); - ACL_CHECK(aclrtMalloc((void **)&v3Device, fileSize_v3, ACL_MEM_MALLOC_HUGE_FIRST)); - ReadFile("./v1.bin", fileSize_v1, v1Host, fileSize_v1); - ReadFile("./v2.bin", fileSize_v2, v2Host, 
fileSize_v2); - ReadFile("./v3.bin", fileSize_v3, v3Host, fileSize_v3); - ACL_CHECK(aclrtMemcpy(v1Device, fileSize_v1, v1Host, fileSize_v1, - ACL_MEMCPY_HOST_TO_DEVICE)); - ACL_CHECK(aclrtMemcpy(v2Device, fileSize_v2, v2Host, fileSize_v2, - ACL_MEMCPY_HOST_TO_DEVICE)); - ACL_CHECK(aclrtMemcpy(v3Device, fileSize_v3, v3Host, fileSize_v3, - ACL_MEMCPY_HOST_TO_DEVICE)); - LaunchDiv_kernel_2d(v1Device, v2Device, v3Device, stream); + ACL_CHECK(aclrtMallocHost((void**)&h_f32_v1,SZ_f32)); + ACL_CHECK(aclrtMallocHost((void**)&h_f32_v2,SZ_f32)); + ACL_CHECK(aclrtMallocHost((void**)&h_f32_v3,SZ_f32)); + ACL_CHECK(aclrtMallocHost((void**)&h_f16_v1,SZ_f16)); + ACL_CHECK(aclrtMallocHost((void**)&h_f16_v2,SZ_f16)); + ACL_CHECK(aclrtMallocHost((void**)&h_f16_v3,SZ_f16)); + ACL_CHECK(aclrtMallocHost((void**)&h_f32_exceptional_v1,SZ_f32_exceptional)); + ACL_CHECK(aclrtMallocHost((void**)&h_f32_exceptional_v2,SZ_f32_exceptional)); + ACL_CHECK(aclrtMallocHost((void**)&h_f32_exceptional_v3,SZ_f32_exceptional)); + ACL_CHECK(aclrtMallocHost((void**)&h_tail_v1,SZ_tail)); + ACL_CHECK(aclrtMallocHost((void**)&h_tail_v2,SZ_tail)); + ACL_CHECK(aclrtMallocHost((void**)&h_tail_v3,SZ_tail)); + ACL_CHECK(aclrtMalloc((void**)&d_f32_v1,SZ_f32,ACL_MEM_MALLOC_HUGE_FIRST)); + ACL_CHECK(aclrtMalloc((void**)&d_f32_v2,SZ_f32,ACL_MEM_MALLOC_HUGE_FIRST)); + ACL_CHECK(aclrtMalloc((void**)&d_f32_v3,SZ_f32,ACL_MEM_MALLOC_HUGE_FIRST)); + ACL_CHECK(aclrtMalloc((void**)&d_f16_v1,SZ_f16,ACL_MEM_MALLOC_HUGE_FIRST)); + ACL_CHECK(aclrtMalloc((void**)&d_f16_v2,SZ_f16,ACL_MEM_MALLOC_HUGE_FIRST)); + ACL_CHECK(aclrtMalloc((void**)&d_f16_v3,SZ_f16,ACL_MEM_MALLOC_HUGE_FIRST)); + ACL_CHECK(aclrtMalloc((void**)&d_f32_exceptional_v1,SZ_f32_exceptional,ACL_MEM_MALLOC_HUGE_FIRST)); + ACL_CHECK(aclrtMalloc((void**)&d_f32_exceptional_v2,SZ_f32_exceptional,ACL_MEM_MALLOC_HUGE_FIRST)); + ACL_CHECK(aclrtMalloc((void**)&d_f32_exceptional_v3,SZ_f32_exceptional,ACL_MEM_MALLOC_HUGE_FIRST)); + 
ACL_CHECK(aclrtMalloc((void**)&d_tail_v1,SZ_tail,ACL_MEM_MALLOC_HUGE_FIRST)); + ACL_CHECK(aclrtMalloc((void**)&d_tail_v2,SZ_tail,ACL_MEM_MALLOC_HUGE_FIRST)); + ACL_CHECK(aclrtMalloc((void**)&d_tail_v3,SZ_tail,ACL_MEM_MALLOC_HUGE_FIRST)); + fsz=SZ_f32; FCK(ReadFile("v1.bin",fsz,h_f32_v1,SZ_f32)&&fsz==SZ_f32,"v1.bin"); + fsz=SZ_f32; FCK(ReadFile("v2.bin",fsz,h_f32_v2,SZ_f32)&&fsz==SZ_f32,"v2.bin"); + fsz=SZ_f32; FCK(ReadFile("v3.bin",fsz,h_f32_v3,SZ_f32)&&fsz==SZ_f32,"v3.bin"); + fsz=SZ_f16; FCK(ReadFile("v1_f16.bin",fsz,h_f16_v1,SZ_f16)&&fsz==SZ_f16,"v1_f16.bin"); + fsz=SZ_f16; FCK(ReadFile("v2_f16.bin",fsz,h_f16_v2,SZ_f16)&&fsz==SZ_f16,"v2_f16.bin"); + fsz=SZ_f16; FCK(ReadFile("v3_f16.bin",fsz,h_f16_v3,SZ_f16)&&fsz==SZ_f16,"v3_f16.bin"); + fsz=SZ_f32_exceptional; FCK(ReadFile("v1_f32_exceptional.bin",fsz,h_f32_exceptional_v1,SZ_f32_exceptional)&&fsz==SZ_f32_exceptional,"v1_f32_exceptional.bin"); + fsz=SZ_f32_exceptional; FCK(ReadFile("v2_f32_exceptional.bin",fsz,h_f32_exceptional_v2,SZ_f32_exceptional)&&fsz==SZ_f32_exceptional,"v2_f32_exceptional.bin"); + fsz=SZ_f32_exceptional; FCK(ReadFile("v3_f32_exceptional.bin",fsz,h_f32_exceptional_v3,SZ_f32_exceptional)&&fsz==SZ_f32_exceptional,"v3_f32_exceptional.bin"); + fsz=SZ_tail; FCK(ReadFile("v1_tail.bin",fsz,h_tail_v1,SZ_tail)&&fsz==SZ_tail,"v1_tail.bin"); + fsz=SZ_tail; FCK(ReadFile("v2_tail.bin",fsz,h_tail_v2,SZ_tail)&&fsz==SZ_tail,"v2_tail.bin"); + fsz=SZ_tail; FCK(ReadFile("v3_tail.bin",fsz,h_tail_v3,SZ_tail)&&fsz==SZ_tail,"v3_tail.bin"); + ACL_CHECK(aclrtMemcpy(d_f32_v1,SZ_f32,h_f32_v1,SZ_f32,ACL_MEMCPY_HOST_TO_DEVICE)); + ACL_CHECK(aclrtMemcpy(d_f32_v2,SZ_f32,h_f32_v2,SZ_f32,ACL_MEMCPY_HOST_TO_DEVICE)); + ACL_CHECK(aclrtMemcpy(d_f32_v3,SZ_f32,h_f32_v3,SZ_f32,ACL_MEMCPY_HOST_TO_DEVICE)); + ACL_CHECK(aclrtMemcpy(d_f16_v1,SZ_f16,h_f16_v1,SZ_f16,ACL_MEMCPY_HOST_TO_DEVICE)); + ACL_CHECK(aclrtMemcpy(d_f16_v2,SZ_f16,h_f16_v2,SZ_f16,ACL_MEMCPY_HOST_TO_DEVICE)); + 
ACL_CHECK(aclrtMemcpy(d_f16_v3,SZ_f16,h_f16_v3,SZ_f16,ACL_MEMCPY_HOST_TO_DEVICE)); + ACL_CHECK(aclrtMemcpy(d_f32_exceptional_v1,SZ_f32_exceptional,h_f32_exceptional_v1,SZ_f32_exceptional,ACL_MEMCPY_HOST_TO_DEVICE)); + ACL_CHECK(aclrtMemcpy(d_f32_exceptional_v2,SZ_f32_exceptional,h_f32_exceptional_v2,SZ_f32_exceptional,ACL_MEMCPY_HOST_TO_DEVICE)); + ACL_CHECK(aclrtMemcpy(d_f32_exceptional_v3,SZ_f32_exceptional,h_f32_exceptional_v3,SZ_f32_exceptional,ACL_MEMCPY_HOST_TO_DEVICE)); + ACL_CHECK(aclrtMemcpy(d_tail_v1,SZ_tail,h_tail_v1,SZ_tail,ACL_MEMCPY_HOST_TO_DEVICE)); + ACL_CHECK(aclrtMemcpy(d_tail_v2,SZ_tail,h_tail_v2,SZ_tail,ACL_MEMCPY_HOST_TO_DEVICE)); + ACL_CHECK(aclrtMemcpy(d_tail_v3,SZ_tail,h_tail_v3,SZ_tail,ACL_MEMCPY_HOST_TO_DEVICE)); + LaunchVdivDeepMerged( + d_f32_v1, + d_f32_v2, + d_f32_v3, + d_f16_v1, + d_f16_v2, + d_f16_v3, + d_f32_exceptional_v1, + d_f32_exceptional_v2, + d_f32_exceptional_v3, + d_tail_v1, + d_tail_v2, + d_tail_v3, + stream + ); ACL_CHECK(aclrtSynchronizeStream(stream)); - ACL_CHECK(aclrtMemcpy(v3Host, fileSize_v3, v3Device, fileSize_v3, - ACL_MEMCPY_DEVICE_TO_HOST)); - WriteFile("./v3.bin", v3Host, fileSize_v3); + ACL_CHECK(aclrtMemcpy(h_f32_v3,SZ_f32,d_f32_v3,SZ_f32,ACL_MEMCPY_DEVICE_TO_HOST)); + ACL_CHECK(aclrtMemcpy(h_f16_v3,SZ_f16,d_f16_v3,SZ_f16,ACL_MEMCPY_DEVICE_TO_HOST)); + ACL_CHECK(aclrtMemcpy(h_f32_exceptional_v3,SZ_f32_exceptional,d_f32_exceptional_v3,SZ_f32_exceptional,ACL_MEMCPY_DEVICE_TO_HOST)); + ACL_CHECK(aclrtMemcpy(h_tail_v3,SZ_tail,d_tail_v3,SZ_tail,ACL_MEMCPY_DEVICE_TO_HOST)); + FCK(WriteFile("v3.bin",h_f32_v3,SZ_f32),"v3.bin"); + FCK(WriteFile("v3_f16.bin",h_f16_v3,SZ_f16),"v3_f16.bin"); + FCK(WriteFile("v3_f32_exceptional.bin",h_f32_exceptional_v3,SZ_f32_exceptional),"v3_f32_exceptional.bin"); + FCK(WriteFile("v3_tail.bin",h_tail_v3,SZ_tail),"v3_tail.bin"); cleanup: - aclrtFree(v1Device); - aclrtFree(v2Device); - aclrtFree(v3Device); - aclrtFreeHost(v1Host); - aclrtFreeHost(v2Host); - aclrtFreeHost(v3Host); - if 
(stream != nullptr) - aclrtDestroyStream(stream); - if (deviceSet) - aclrtResetDevice(deviceId); - if (aclInited) - aclFinalize(); + aclrtFree(d_f32_v1); + aclrtFree(d_f32_v2); + aclrtFree(d_f32_v3); + aclrtFree(d_f16_v1); + aclrtFree(d_f16_v2); + aclrtFree(d_f16_v3); + aclrtFree(d_f32_exceptional_v1); + aclrtFree(d_f32_exceptional_v2); + aclrtFree(d_f32_exceptional_v3); + aclrtFree(d_tail_v1); + aclrtFree(d_tail_v2); + aclrtFree(d_tail_v3); + aclrtFreeHost(h_f32_v1); + aclrtFreeHost(h_f32_v2); + aclrtFreeHost(h_f32_v3); + aclrtFreeHost(h_f16_v1); + aclrtFreeHost(h_f16_v2); + aclrtFreeHost(h_f16_v3); + aclrtFreeHost(h_f32_exceptional_v1); + aclrtFreeHost(h_f32_exceptional_v2); + aclrtFreeHost(h_f32_exceptional_v3); + aclrtFreeHost(h_tail_v1); + aclrtFreeHost(h_tail_v2); + aclrtFreeHost(h_tail_v3); + if(stream) aclrtDestroyStream(stream); + if(deviceSet) aclrtResetDevice(deviceId); + if(aclInited) aclFinalize(); return rc; } diff --git a/test/vpto/cases/micro-op/binary-vector/vmax-tail/compare.py b/test/vpto/cases/micro-op/binary-vector/vmax-tail/compare.py deleted file mode 100644 index c95419953..000000000 --- a/test/vpto/cases/micro-op/binary-vector/vmax-tail/compare.py +++ /dev/null @@ -1,39 +0,0 @@ -#!/usr/bin/python3 -# Copyright (c) 2026 Huawei Technologies Co., Ltd. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. 
- - -import os -import sys -import numpy as np - - -def compare_bin_prefix(golden_path, output_path, dtype, eps, count): - if not os.path.exists(golden_path) or not os.path.exists(output_path): - return False - golden = np.fromfile(golden_path, dtype=dtype, count=count) - output = np.fromfile(output_path, dtype=dtype, count=count) - return golden.shape == output.shape and np.allclose( - golden, output, atol=eps, rtol=eps, equal_nan=True - ) - - -def main(): - strict = os.getenv("COMPARE_STRICT", "1") != "0" - ok = compare_bin_prefix("golden_v3.bin", "v3.bin", np.float32, 1e-4, 1000) - if not ok: - if strict: - print("[ERROR] compare failed") - sys.exit(2) - print("[WARN] compare failed (non-gating)") - return - print("[INFO] compare passed") - - -if __name__ == "__main__": - main() diff --git a/test/vpto/cases/micro-op/binary-vector/vmax-tail/golden.py b/test/vpto/cases/micro-op/binary-vector/vmax-tail/golden.py deleted file mode 100644 index 82d3beb41..000000000 --- a/test/vpto/cases/micro-op/binary-vector/vmax-tail/golden.py +++ /dev/null @@ -1,51 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) 2026 Huawei Technologies Co., Ltd. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. 
- -# coding=utf-8 - -import argparse -from pathlib import Path - -import numpy as np - - -ROWS = 32 -COLS = 32 -SEED = 19 -LOGICAL_ELEMS = 1000 -OUT_SENTINEL = np.float32(-123.25) - - -def generate(output_dir: Path, seed: int) -> None: - rng = np.random.default_rng(seed) - v1 = rng.random((ROWS, COLS), dtype=np.float32) - v2 = rng.random((ROWS, COLS), dtype=np.float32) - v3 = np.full((ROWS, COLS), OUT_SENTINEL, dtype=np.float32) - golden_v3 = np.full((ROWS, COLS), OUT_SENTINEL, dtype=np.float32) - golden_v3.reshape(-1)[:LOGICAL_ELEMS] = np.maximum( - v1.reshape(-1)[:LOGICAL_ELEMS], v2.reshape(-1)[:LOGICAL_ELEMS] - ).astype(np.float32, copy=False) - - output_dir.mkdir(parents=True, exist_ok=True) - v1.reshape(-1).tofile(output_dir / "v1.bin") - v2.reshape(-1).tofile(output_dir / "v2.bin") - v3.reshape(-1).tofile(output_dir / "v3.bin") - golden_v3.reshape(-1).tofile(output_dir / "golden_v3.bin") - - -def main() -> None: - parser = argparse.ArgumentParser() - parser.add_argument("--output-dir", type=Path, default=Path(".")) - parser.add_argument("--seed", type=int, default=SEED) - args = parser.parse_args() - generate(args.output_dir, args.seed) - - -if __name__ == "__main__": - main() diff --git a/test/vpto/cases/micro-op/binary-vector/vmax-tail/kernel.pto b/test/vpto/cases/micro-op/binary-vector/vmax-tail/kernel.pto deleted file mode 100644 index 8bfae2fd0..000000000 --- a/test/vpto/cases/micro-op/binary-vector/vmax-tail/kernel.pto +++ /dev/null @@ -1,49 +0,0 @@ -module attributes {pto.target_arch = "a5", pto.kernel_kind = #pto.kernel_kind} { - func.func @vmax_tail_kernel_2d(%arg0: !pto.ptr, %arg1: !pto.ptr, - %arg2: !pto.ptr) attributes {pto.kernel} { - %c0 = arith.constant 0 : index - %c64 = arith.constant 64 : index - %c1024 = arith.constant 1024 : index - %c0_i64 = arith.constant 0 : i64 - %c1_i64 = arith.constant 1 : i64 - %c32_i64 = arith.constant 32 : i64 - %c128_i64 = arith.constant 128 : i64 - %c4096_i64 = arith.constant 4096 : i64 - %c8192_i64 = 
arith.constant 8192 : i64 - %c1000_i32 = arith.constant 1000 : i32 - - %ub_lhs = pto.castptr %c0_i64 : i64 -> !pto.ptr - %ub_rhs = pto.castptr %c4096_i64 : i64 -> !pto.ptr - %ub_out = pto.castptr %c8192_i64 : i64 -> !pto.ptr - - %false = arith.constant false - pto.mte_gm_ub %arg0, %ub_lhs, %c0_i64, %c128_i64 - nburst(%c32_i64, %c128_i64, %c128_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 - pto.mte_gm_ub %arg1, %ub_rhs, %c0_i64, %c128_i64 - nburst(%c32_i64, %c128_i64, %c128_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 - - pto.set_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] - pto.wait_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] - - pto.vecscope { - %_:1 = scf.for %offset = %c0 to %c1024 step %c64 iter_args(%remaining = %c1000_i32) -> (i32) { - %mask, %next_remaining = pto.plt_b32 %remaining : i32 -> !pto.mask, i32 - %lhs = pto.vlds %ub_lhs[%offset] : !pto.ptr -> !pto.vreg<64xf32> - %rhs = pto.vlds %ub_rhs[%offset] : !pto.ptr -> !pto.vreg<64xf32> - %maxv = pto.vmax %lhs, %rhs, %mask : !pto.vreg<64xf32>, !pto.vreg<64xf32>, !pto.mask -> !pto.vreg<64xf32> - pto.vsts %maxv, %ub_out[%offset], %mask : !pto.vreg<64xf32>, !pto.ptr, !pto.mask - scf.yield %next_remaining : i32 - } - } - - pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.mte_ub_gm %ub_out, %arg2, %c128_i64 - nburst(%c32_i64, %c128_i64, %c128_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64 - pto.barrier #pto.pipe - return - } -} diff --git a/test/vpto/cases/micro-op/binary-vector/vmax-tail/launch.cpp b/test/vpto/cases/micro-op/binary-vector/vmax-tail/launch.cpp deleted file mode 100644 index bab607bfa..000000000 --- a/test/vpto/cases/micro-op/binary-vector/vmax-tail/launch.cpp +++ /dev/null @@ -1,46 +0,0 @@ -// Copyright (c) 2026 Huawei Technologies Co., Ltd. -// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). 
-// Please refer to the License for details. You may not use this file except in compliance with the License. -// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -// See LICENSE in the root of the software repository for the full text of the License. - -#ifndef __VEC_SCOPE__ -#define __VEC_SCOPE__ -#endif - -#if defined(__CCE_AICORE__) && defined(__NPU_ARCH__) && (__NPU_ARCH__ == 2201) -typedef struct { unsigned char v; } hifloat8_t; -typedef struct { unsigned char v; } float8_e4m3_t; -typedef struct { unsigned char v; } float8_e5m2_t; -typedef struct { unsigned char v; } float8_e8m0_t; -typedef struct { unsigned char v; } float4_e1m2x2_t; -typedef struct { unsigned char v; } float4_e2m1x2_t; -#endif -#include -#if defined(__CCE_AICORE__) && defined(PTOAS_ENABLE_CCE_PRINT) -#include -#endif - -#if !defined(__CCE_AICORE__) && !defined(TMRGSORT_HPP) -struct MrgSortExecutedNumList { - uint16_t mrgSortList0; - uint16_t mrgSortList1; - uint16_t mrgSortList2; - uint16_t mrgSortList3; -}; -#endif -#ifndef __CPU_SIM -#include "acl/acl.h" -#endif - -extern "C" __global__ [aicore] void vmax_tail_kernel_2d(__gm__ float *v1, - __gm__ float *v2, - __gm__ float *v3); - -void LaunchVadd_tail_kernel_2d(float *v1, float *v2, float *v3, void *stream) { - vmax_tail_kernel_2d<<<1, nullptr, stream>>>((__gm__ float *)v1, - (__gm__ float *)v2, - (__gm__ float *)v3); -} diff --git a/test/vpto/cases/micro-op/binary-vector/vmax-tail/main.cpp b/test/vpto/cases/micro-op/binary-vector/vmax-tail/main.cpp deleted file mode 100644 index 40a9881d6..000000000 --- a/test/vpto/cases/micro-op/binary-vector/vmax-tail/main.cpp +++ /dev/null @@ -1,94 +0,0 @@ -// Copyright (c) 2026 Huawei Technologies Co., Ltd. 
-// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). -// Please refer to the License for details. You may not use this file except in compliance with the License. -// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -// See LICENSE in the root of the software repository for the full text of the License. - -/** -Copyright (c) 2025 Huawei Technologies Co., Ltd. -*/ - -#include "test_common.h" -#include "acl/acl.h" -#include -#include - -using namespace PtoTestCommon; - -#define ACL_CHECK(expr) \ - do { \ - const aclError _ret = (expr); \ - if (_ret != ACL_SUCCESS) { \ - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", #expr, \ - (int)_ret, __FILE__, __LINE__); \ - rc = 1; \ - goto cleanup; \ - } \ - } while (0) - -void LaunchVadd_tail_kernel_2d(float *v1, float *v2, float *v3, void *stream); - -int main() { - size_t elemCount_v1 = 1024; - size_t fileSize_v1 = elemCount_v1 * sizeof(float); - size_t elemCount_v2 = 1024; - size_t fileSize_v2 = elemCount_v2 * sizeof(float); - size_t elemCount_v3 = 1024; - size_t fileSize_v3 = elemCount_v3 * sizeof(float); - float *v1Host = nullptr; - float *v1Device = nullptr; - float *v2Host = nullptr; - float *v2Device = nullptr; - float *v3Host = nullptr; - float *v3Device = nullptr; - int rc = 0; - bool aclInited = false; - bool deviceSet = false; - int deviceId = 0; - aclrtStream stream = nullptr; - - ACL_CHECK(aclInit(nullptr)); - aclInited = true; - if (const char *envDevice = std::getenv("ACL_DEVICE_ID")) - deviceId = std::atoi(envDevice); - ACL_CHECK(aclrtSetDevice(deviceId)); - deviceSet = true; - ACL_CHECK(aclrtCreateStream(&stream)); - ACL_CHECK(aclrtMallocHost((void **)(&v1Host), fileSize_v1)); - ACL_CHECK(aclrtMallocHost((void 
**)(&v2Host), fileSize_v2)); - ACL_CHECK(aclrtMallocHost((void **)(&v3Host), fileSize_v3)); - ACL_CHECK(aclrtMalloc((void **)&v1Device, fileSize_v1, ACL_MEM_MALLOC_HUGE_FIRST)); - ACL_CHECK(aclrtMalloc((void **)&v2Device, fileSize_v2, ACL_MEM_MALLOC_HUGE_FIRST)); - ACL_CHECK(aclrtMalloc((void **)&v3Device, fileSize_v3, ACL_MEM_MALLOC_HUGE_FIRST)); - ReadFile("./v1.bin", fileSize_v1, v1Host, fileSize_v1); - ReadFile("./v2.bin", fileSize_v2, v2Host, fileSize_v2); - ReadFile("./v3.bin", fileSize_v3, v3Host, fileSize_v3); - ACL_CHECK(aclrtMemcpy(v1Device, fileSize_v1, v1Host, fileSize_v1, - ACL_MEMCPY_HOST_TO_DEVICE)); - ACL_CHECK(aclrtMemcpy(v2Device, fileSize_v2, v2Host, fileSize_v2, - ACL_MEMCPY_HOST_TO_DEVICE)); - ACL_CHECK(aclrtMemcpy(v3Device, fileSize_v3, v3Host, fileSize_v3, - ACL_MEMCPY_HOST_TO_DEVICE)); - LaunchVadd_tail_kernel_2d(v1Device, v2Device, v3Device, stream); - ACL_CHECK(aclrtSynchronizeStream(stream)); - ACL_CHECK(aclrtMemcpy(v3Host, fileSize_v3, v3Device, fileSize_v3, - ACL_MEMCPY_DEVICE_TO_HOST)); - WriteFile("./v3.bin", v3Host, fileSize_v3); - -cleanup: - aclrtFree(v1Device); - aclrtFree(v2Device); - aclrtFree(v3Device); - aclrtFreeHost(v1Host); - aclrtFreeHost(v2Host); - aclrtFreeHost(v3Host); - if (stream != nullptr) - aclrtDestroyStream(stream); - if (deviceSet) - aclrtResetDevice(deviceId); - if (aclInited) - aclFinalize(); - return rc; -} diff --git a/test/vpto/cases/micro-op/binary-vector/vmax/compare.py b/test/vpto/cases/micro-op/binary-vector/vmax/compare.py index a5f14dabc..9c988222c 100644 --- a/test/vpto/cases/micro-op/binary-vector/vmax/compare.py +++ b/test/vpto/cases/micro-op/binary-vector/vmax/compare.py @@ -7,31 +7,42 @@ # INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. # See LICENSE in the root of the software repository for the full text of the License. +# Merged vmax test case. 
-import os -import sys +import os,sys import numpy as np +def _cmp(golden,output,dtype,eps,count=-1): + if not os.path.exists(golden) or not os.path.exists(output): return False + kw={} if count<0 else {"count":count} + g=np.fromfile(golden,dtype=dtype,**kw) + o=np.fromfile(output,dtype=dtype,**kw) + return g.shape==o.shape and np.allclose(g,o,atol=eps,rtol=eps,equal_nan=True) -def compare_bin(golden_path, output_path, dtype, eps): - if not os.path.exists(golden_path) or not os.path.exists(output_path): - return False - golden = np.fromfile(golden_path, dtype=dtype) - output = np.fromfile(output_path, dtype=dtype) - return golden.shape == output.shape and np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True) - +def _cmpeq(golden,output,dtype): + if not os.path.exists(golden) or not os.path.exists(output): return False + g=np.fromfile(golden,dtype=dtype) + o=np.fromfile(output,dtype=dtype) + return g.shape==o.shape and np.array_equal(g,o) def main(): - strict = os.getenv("COMPARE_STRICT", "1") != "0" - ok = compare_bin("golden_v3.bin", "v3.bin", np.float32, 1e-4) - if not ok: - if strict: - print("[ERROR] compare failed") - sys.exit(2) - print("[WARN] compare failed (non-gating)") + strict=os.getenv('COMPARE_STRICT','1')!='0' + failed=[] + if not (_cmp("golden_v3.bin","v3.bin",np.float32,1e-4,-1)): + failed.append('f32') + print('[ERROR] compare failed: f32') + else: + print('[INFO] f32: passed') + if not (_cmp("golden_v3_tail.bin","v3_tail.bin",np.float32,1e-4,1000)): + failed.append('tail') + print('[ERROR] compare failed: tail') + else: + print('[INFO] tail: passed') + if failed: + if strict: print(f"[ERROR] {len(failed)} variant(s) failed"); sys.exit(2) + print(f"[WARN] {len(failed)} variant(s) failed (non-gating)") return - print("[INFO] compare passed") - + print("[INFO] compare passed (all 2 variants)") -if __name__ == "__main__": +if __name__=="__main__": main() diff --git a/test/vpto/cases/micro-op/binary-vector/vmax/golden.py 
b/test/vpto/cases/micro-op/binary-vector/vmax/golden.py index aca780439..26bc3b329 100644 --- a/test/vpto/cases/micro-op/binary-vector/vmax/golden.py +++ b/test/vpto/cases/micro-op/binary-vector/vmax/golden.py @@ -7,40 +7,63 @@ # INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. # See LICENSE in the root of the software repository for the full text of the License. -# coding=utf-8 +# Merged vmax test case. import argparse from pathlib import Path - import numpy as np - ROWS = 32 COLS = 32 SEED = 19 +LOGICAL_ELEMS = 1000 +OUT_SENTINEL = np.float32(-123.25) +def f32_to_bf16_bits(v): + w=v.astype(np.float32,copy=False).view(np.uint32) + r=np.uint32(0x7FFF)+((w>>16)&np.uint32(1)) + return ((w+r)>>16).astype(np.uint16) +def bf16_bits_to_f32(b): + return (b.astype(np.uint32)<<16).view(np.float32) -def generate(output_dir: Path, seed: int) -> None: - rng = np.random.default_rng(seed) - v1 = rng.uniform(-8.0, 8.0, size=(ROWS, COLS)).astype(np.float32) - v2 = rng.uniform(-8.0, 8.0, size=(ROWS, COLS)).astype(np.float32) - golden_v3 = np.maximum(v1, v2).astype(np.float32, copy=False) - v3 = np.zeros((ROWS, COLS), dtype=np.float32) - output_dir.mkdir(parents=True, exist_ok=True) - v1.reshape(-1).tofile(output_dir / "v1.bin") - v2.reshape(-1).tofile(output_dir / "v2.bin") - v3.reshape(-1).tofile(output_dir / "v3.bin") - golden_v3.reshape(-1).tofile(output_dir / "golden_v3.bin") +# ---- f32 ---- +def gen_f32(out, rng): + v1=rng.uniform(-8.0,8.0,size=(ROWS,COLS)).astype(np.float32) + v2=rng.uniform(-8.0,8.0,size=(ROWS,COLS)).astype(np.float32) + g=np.maximum(v1,v2).astype(np.float32,copy=False) + v3=np.zeros((ROWS,COLS),dtype=np.float32) + v1.reshape(-1).tofile(out/"v1.bin") + v2.reshape(-1).tofile(out/"v2.bin") + v3.reshape(-1).tofile(out/"v3.bin") + g.reshape(-1).tofile(out/"golden_v3.bin") +# ---- tail ---- +def gen_tail(out, rng): + v1=rng.random((ROWS,COLS),dtype=np.float32) + v2=rng.random((ROWS,COLS),dtype=np.float32) + 
v3=np.full((ROWS,COLS),OUT_SENTINEL,dtype=np.float32) + g=np.full((ROWS,COLS),OUT_SENTINEL,dtype=np.float32) + g.reshape(-1)[:LOGICAL_ELEMS]=np.maximum(v1.reshape(-1)[:LOGICAL_ELEMS],v2.reshape(-1)[:LOGICAL_ELEMS]).astype(np.float32,copy=False) + v1.reshape(-1).tofile(out/"v1_tail.bin") + v2.reshape(-1).tofile(out/"v2_tail.bin") + v3.reshape(-1).tofile(out/"v3_tail.bin") + g.reshape(-1).tofile(out/"golden_v3_tail.bin") -def main() -> None: - parser = argparse.ArgumentParser() - parser.add_argument("--output-dir", type=Path, default=Path(".")) - parser.add_argument("--seed", type=int, default=SEED) - args = parser.parse_args() - generate(args.output_dir, args.seed) +GENERATORS = [ + gen_f32, + gen_tail, +] +def main(): + p=argparse.ArgumentParser() + p.add_argument("--output-dir",type=Path,default=Path(".")) + p.add_argument("--seed",type=int,default=SEED) + a=p.parse_args() + rng=np.random.default_rng(a.seed) + out=a.output_dir; out.mkdir(parents=True,exist_ok=True) + for gen in GENERATORS: + gen(out,rng) -if __name__ == "__main__": +if __name__=="__main__": main() diff --git a/test/vpto/cases/micro-op/binary-vector/vmax/kernel.pto b/test/vpto/cases/micro-op/binary-vector/vmax/kernel.pto index 8c7835c64..1d6193257 100644 --- a/test/vpto/cases/micro-op/binary-vector/vmax/kernel.pto +++ b/test/vpto/cases/micro-op/binary-vector/vmax/kernel.pto @@ -1,50 +1,293 @@ module attributes {pto.target_arch = "a5", pto.kernel_kind = #pto.kernel_kind} { - func.func @max_kernel_2d(%arg0: !pto.ptr, %arg1: !pto.ptr, - %arg2: !pto.ptr) attributes {pto.kernel} { - %c0 = arith.constant 0 : index - %c1 = arith.constant 1 : index - %c64 = arith.constant 64 : index - %c1024 = arith.constant 1024 : index - %c0_i64 = arith.constant 0 : i64 - %c1_i64 = arith.constant 1 : i64 - %c32_i64 = arith.constant 32 : i64 - %c128_i64 = arith.constant 128 : i64 - %c4096_i64 = arith.constant 4096 : i64 - %c8192_i64 = arith.constant 8192 : i64 - %c1024_i32 = arith.constant 1024 : i32 - - %ub_lhs = 
pto.castptr %c0_i64 : i64 -> !pto.ptr - %ub_rhs = pto.castptr %c4096_i64 : i64 -> !pto.ptr - %ub_out = pto.castptr %c8192_i64 : i64 -> !pto.ptr - - %false = arith.constant false - pto.mte_gm_ub %arg0, %ub_lhs, %c0_i64, %c128_i64 - nburst(%c32_i64, %c128_i64, %c128_i64) + func.func @vmax_deep_merged_kernel(%arg0: !pto.ptr, %arg1: !pto.ptr, %arg2: !pto.ptr, %arg3: !pto.ptr, %arg4: !pto.ptr, %arg5: !pto.ptr) attributes {pto.kernel} { + // merged from max_kernel_2d via LaunchMax_kernel_2d + + %c0_m0 = arith.constant 0 : index + %c1_m0 = arith.constant 1 : index + %c64_m0 = arith.constant 64 : index + %c1024_m0 = arith.constant 1024 : index + %c0_i64_m0 = arith.constant 0 : i64 + %c1_i64_m0 = arith.constant 1 : i64 + %c32_i64_m0 = arith.constant 32 : i64 + %c128_i64_m0 = arith.constant 128 : i64 + %c4096_i64_m0 = arith.constant 4096 : i64 + %c8192_i64_m0 = arith.constant 8192 : i64 + %c1024_i32_m0 = arith.constant 1024 : i32 + + %ub_lhs_m0 = pto.castptr %c0_i64_m0 : i64 -> !pto.ptr + %ub_rhs_m0 = pto.castptr %c4096_i64_m0 : i64 -> !pto.ptr + %ub_out_m0 = pto.castptr %c8192_i64_m0 : i64 -> !pto.ptr + + %false_m0 = arith.constant false + pto.mte_gm_ub %arg0, %ub_lhs_m0, %c0_i64_m0, %c128_i64_m0 + nburst(%c32_i64_m0, %c128_i64_m0, %c128_i64_m0) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 + pto.mte_gm_ub %arg1, %ub_rhs_m0, %c0_i64_m0, %c128_i64_m0 + nburst(%c32_i64_m0, %c128_i64_m0, %c128_i64_m0) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 + + pto.set_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] + pto.wait_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] + + pto.vecscope { + %__m0:1 = scf.for %offset_m0 = %c0_m0 to %c1024_m0 step %c64_m0 iter_args(%remaining_m0 = %c1024_i32_m0) -> (i32) { + %mask_m0, %next_remaining_m0 = pto.plt_b32 %remaining_m0 : i32 -> !pto.mask, i32 + %lhs_m0 = pto.vlds %ub_lhs_m0[%offset_m0] : !pto.ptr -> !pto.vreg<64xf32> + %rhs_m0 = pto.vlds %ub_rhs_m0[%offset_m0] : !pto.ptr -> !pto.vreg<64xf32> + %maxv_m0 = pto.vmax %lhs_m0, %rhs_m0, %mask_m0 : 
!pto.vreg<64xf32>, !pto.vreg<64xf32>, !pto.mask -> !pto.vreg<64xf32> + pto.vsts %maxv_m0, %ub_out_m0[%offset_m0], %mask_m0 : !pto.vreg<64xf32>, !pto.ptr, !pto.mask + scf.yield %next_remaining_m0 : i32 + } + } + + pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] + pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] + pto.mte_ub_gm %ub_out_m0, %arg2, %c128_i64_m0 + nburst(%c32_i64_m0, %c128_i64_m0, %c128_i64_m0) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64 + pto.barrier #pto.pipe + + // merged from vmax_tail_kernel_2d via LaunchVmax_tail_kernel_2d + + %c0_m1 = arith.constant 0 : index + %c64_m1 = arith.constant 64 : index + %c1024_m1 = arith.constant 1024 : index + %c0_i64_m1 = arith.constant 0 : i64 + %c1_i64_m1 = arith.constant 1 : i64 + %c32_i64_m1 = arith.constant 32 : i64 + %c128_i64_m1 = arith.constant 128 : i64 + %c4096_i64_m1 = arith.constant 4096 : i64 + %c8192_i64_m1 = arith.constant 8192 : i64 + %c1000_i32_m1 = arith.constant 1000 : i32 + + %ub_lhs_m1 = pto.castptr %c0_i64_m1 : i64 -> !pto.ptr + %ub_rhs_m1 = pto.castptr %c4096_i64_m1 : i64 -> !pto.ptr + %ub_out_m1 = pto.castptr %c8192_i64_m1 : i64 -> !pto.ptr + + %false_m1 = arith.constant false + pto.mte_gm_ub %arg3, %ub_lhs_m1, %c0_i64_m1, %c128_i64_m1 + nburst(%c32_i64_m1, %c128_i64_m1, %c128_i64_m1) : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 - pto.mte_gm_ub %arg1, %ub_rhs, %c0_i64, %c128_i64 - nburst(%c32_i64, %c128_i64, %c128_i64) + pto.mte_gm_ub %arg4, %ub_rhs_m1, %c0_i64_m1, %c128_i64_m1 + nburst(%c32_i64_m1, %c128_i64_m1, %c128_i64_m1) : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 pto.set_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] pto.wait_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] pto.vecscope { - %_:1 = scf.for %offset = %c0 to %c1024 step %c64 iter_args(%remaining = %c1024_i32) -> (i32) { - %mask, %next_remaining = pto.plt_b32 %remaining : i32 -> !pto.mask, i32 - %lhs = pto.vlds %ub_lhs[%offset] : !pto.ptr -> !pto.vreg<64xf32> - %rhs = pto.vlds %ub_rhs[%offset] : !pto.ptr -> !pto.vreg<64xf32> - %maxv 
= pto.vmax %lhs, %rhs, %mask : !pto.vreg<64xf32>, !pto.vreg<64xf32>, !pto.mask -> !pto.vreg<64xf32> - pto.vsts %maxv, %ub_out[%offset], %mask : !pto.vreg<64xf32>, !pto.ptr, !pto.mask - scf.yield %next_remaining : i32 + %__m1:1 = scf.for %offset_m1 = %c0_m1 to %c1024_m1 step %c64_m1 iter_args(%remaining_m1 = %c1000_i32_m1) -> (i32) { + %mask_m1, %next_remaining_m1 = pto.plt_b32 %remaining_m1 : i32 -> !pto.mask, i32 + %lhs_m1 = pto.vlds %ub_lhs_m1[%offset_m1] : !pto.ptr -> !pto.vreg<64xf32> + %rhs_m1 = pto.vlds %ub_rhs_m1[%offset_m1] : !pto.ptr -> !pto.vreg<64xf32> + %maxv_m1 = pto.vmax %lhs_m1, %rhs_m1, %mask_m1 : !pto.vreg<64xf32>, !pto.vreg<64xf32>, !pto.mask -> !pto.vreg<64xf32> + pto.vsts %maxv_m1, %ub_out_m1[%offset_m1], %mask_m1 : !pto.vreg<64xf32>, !pto.ptr, !pto.mask + scf.yield %next_remaining_m1 : i32 } } pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.mte_ub_gm %ub_out, %arg2, %c128_i64 - nburst(%c32_i64, %c128_i64, %c128_i64) + pto.mte_ub_gm %ub_out_m1, %arg5, %c128_i64_m1 + nburst(%c32_i64_m1, %c128_i64_m1, %c128_i64_m1) : !pto.ptr, !pto.ptr, i64, i64, i64, i64 pto.barrier #pto.pipe + %__case_merge_guard = arith.constant false + + // inactive cross-case merge from micro-op/binary-vector/vmul + scf.if %__case_merge_guard { + + // merged from mul_kernel_2d via LaunchMul_kernel_2d + + %c0_m0_cmg2_1 = arith.constant 0 : index + %c1_m0_cmg2_1 = arith.constant 1 : index + %c64_m0_cmg2_1 = arith.constant 64 : index + %c1024_m0_cmg2_1 = arith.constant 1024 : index + %c0_i64_m0_cmg2_1 = arith.constant 0 : i64 + %c1_i64_m0_cmg2_1 = arith.constant 1 : i64 + %c32_i64_m0_cmg2_1 = arith.constant 32 : i64 + %c128_i64_m0_cmg2_1 = arith.constant 128 : i64 + %c4096_i64_m0_cmg2_1 = arith.constant 4096 : i64 + %c8192_i64_m0_cmg2_1 = arith.constant 8192 : i64 + %c1024_i32_m0_cmg2_1 = arith.constant 1024 : i32 + + %ub_lhs_m0_cmg2_1 = pto.castptr %c0_i64_m0_cmg2_1 : i64 -> !pto.ptr + %ub_rhs_m0_cmg2_1 = pto.castptr 
%c4096_i64_m0_cmg2_1 : i64 -> !pto.ptr + %ub_out_m0_cmg2_1 = pto.castptr %c8192_i64_m0_cmg2_1 : i64 -> !pto.ptr + + %false_m0_cmg2_1 = arith.constant false + pto.mte_gm_ub %arg0, %ub_lhs_m0_cmg2_1, %c0_i64_m0_cmg2_1, %c128_i64_m0_cmg2_1 + nburst(%c32_i64_m0_cmg2_1, %c128_i64_m0_cmg2_1, %c128_i64_m0_cmg2_1) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 + pto.mte_gm_ub %arg1, %ub_rhs_m0_cmg2_1, %c0_i64_m0_cmg2_1, %c128_i64_m0_cmg2_1 + nburst(%c32_i64_m0_cmg2_1, %c128_i64_m0_cmg2_1, %c128_i64_m0_cmg2_1) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 + + pto.set_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] + pto.wait_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] + + pto.vecscope { + %__m0_cmg2_1:1 = scf.for %offset_m0_cmg2_1 = %c0_m0_cmg2_1 to %c1024_m0_cmg2_1 step %c64_m0_cmg2_1 iter_args(%remaining_m0_cmg2_1 = %c1024_i32_m0_cmg2_1) -> (i32) { + %mask_m0_cmg2_1, %next_remaining_m0_cmg2_1 = pto.plt_b32 %remaining_m0_cmg2_1 : i32 -> !pto.mask, i32 + %lhs_m0_cmg2_1 = pto.vlds %ub_lhs_m0_cmg2_1[%offset_m0_cmg2_1] : !pto.ptr -> !pto.vreg<64xf32> + %rhs_m0_cmg2_1 = pto.vlds %ub_rhs_m0_cmg2_1[%offset_m0_cmg2_1] : !pto.ptr -> !pto.vreg<64xf32> + %prod_m0_cmg2_1 = pto.vmul %lhs_m0_cmg2_1, %rhs_m0_cmg2_1, %mask_m0_cmg2_1 : !pto.vreg<64xf32>, !pto.vreg<64xf32>, !pto.mask -> !pto.vreg<64xf32> + pto.vsts %prod_m0_cmg2_1, %ub_out_m0_cmg2_1[%offset_m0_cmg2_1], %mask_m0_cmg2_1 : !pto.vreg<64xf32>, !pto.ptr, !pto.mask + scf.yield %next_remaining_m0_cmg2_1 : i32 + } + } + + pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] + pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] + pto.mte_ub_gm %ub_out_m0_cmg2_1, %arg2, %c128_i64_m0_cmg2_1 + nburst(%c32_i64_m0_cmg2_1, %c128_i64_m0_cmg2_1, %c128_i64_m0_cmg2_1) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64 + pto.barrier #pto.pipe + + // merged from vmul_tail_kernel_2d via LaunchVmul_tail_kernel_2d + + %c0_m1_cmg2_1 = arith.constant 0 : index + %c64_m1_cmg2_1 = arith.constant 64 : index + %c1024_m1_cmg2_1 = arith.constant 1024 : index + 
%c0_i64_m1_cmg2_1 = arith.constant 0 : i64 + %c1_i64_m1_cmg2_1 = arith.constant 1 : i64 + %c32_i64_m1_cmg2_1 = arith.constant 32 : i64 + %c128_i64_m1_cmg2_1 = arith.constant 128 : i64 + %c4096_i64_m1_cmg2_1 = arith.constant 4096 : i64 + %c8192_i64_m1_cmg2_1 = arith.constant 8192 : i64 + %c1000_i32_m1_cmg2_1 = arith.constant 1000 : i32 + + %ub_lhs_m1_cmg2_1 = pto.castptr %c0_i64_m1_cmg2_1 : i64 -> !pto.ptr + %ub_rhs_m1_cmg2_1 = pto.castptr %c4096_i64_m1_cmg2_1 : i64 -> !pto.ptr + %ub_out_m1_cmg2_1 = pto.castptr %c8192_i64_m1_cmg2_1 : i64 -> !pto.ptr + + %false_m1_cmg2_1 = arith.constant false + pto.mte_gm_ub %arg3, %ub_lhs_m1_cmg2_1, %c0_i64_m1_cmg2_1, %c128_i64_m1_cmg2_1 + nburst(%c32_i64_m1_cmg2_1, %c128_i64_m1_cmg2_1, %c128_i64_m1_cmg2_1) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 + pto.mte_gm_ub %arg4, %ub_rhs_m1_cmg2_1, %c0_i64_m1_cmg2_1, %c128_i64_m1_cmg2_1 + nburst(%c32_i64_m1_cmg2_1, %c128_i64_m1_cmg2_1, %c128_i64_m1_cmg2_1) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 + + pto.set_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] + pto.wait_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] + + pto.vecscope { + %__m1_cmg2_1:1 = scf.for %offset_m1_cmg2_1 = %c0_m1_cmg2_1 to %c1024_m1_cmg2_1 step %c64_m1_cmg2_1 iter_args(%remaining_m1_cmg2_1 = %c1000_i32_m1_cmg2_1) -> (i32) { + %mask_m1_cmg2_1, %next_remaining_m1_cmg2_1 = pto.plt_b32 %remaining_m1_cmg2_1 : i32 -> !pto.mask, i32 + %lhs_m1_cmg2_1 = pto.vlds %ub_lhs_m1_cmg2_1[%offset_m1_cmg2_1] : !pto.ptr -> !pto.vreg<64xf32> + %rhs_m1_cmg2_1 = pto.vlds %ub_rhs_m1_cmg2_1[%offset_m1_cmg2_1] : !pto.ptr -> !pto.vreg<64xf32> + %prod_m1_cmg2_1 = pto.vmul %lhs_m1_cmg2_1, %rhs_m1_cmg2_1, %mask_m1_cmg2_1 : !pto.vreg<64xf32>, !pto.vreg<64xf32>, !pto.mask -> !pto.vreg<64xf32> + pto.vsts %prod_m1_cmg2_1, %ub_out_m1_cmg2_1[%offset_m1_cmg2_1], %mask_m1_cmg2_1 : !pto.vreg<64xf32>, !pto.ptr, !pto.mask + scf.yield %next_remaining_m1_cmg2_1 : i32 + } + } + + pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] + pto.wait_flag["PIPE_V", 
"PIPE_MTE3", "EVENT_ID0"] + pto.mte_ub_gm %ub_out_m1_cmg2_1, %arg5, %c128_i64_m1_cmg2_1 + nburst(%c32_i64_m1_cmg2_1, %c128_i64_m1_cmg2_1, %c128_i64_m1_cmg2_1) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64 + pto.barrier #pto.pipe + } + + // inactive cross-case merge from micro-op/binary-vector/vsub + scf.if %__case_merge_guard { + + // merged from sub_kernel_2d via LaunchSub_kernel_2d + + %c0_m0_cmg2_2 = arith.constant 0 : index + %c1_m0_cmg2_2 = arith.constant 1 : index + %c64_m0_cmg2_2 = arith.constant 64 : index + %c1024_m0_cmg2_2 = arith.constant 1024 : index + %c0_i64_m0_cmg2_2 = arith.constant 0 : i64 + %c1_i64_m0_cmg2_2 = arith.constant 1 : i64 + %c32_i64_m0_cmg2_2 = arith.constant 32 : i64 + %c128_i64_m0_cmg2_2 = arith.constant 128 : i64 + %c4096_i64_m0_cmg2_2 = arith.constant 4096 : i64 + %c8192_i64_m0_cmg2_2 = arith.constant 8192 : i64 + %c1024_i32_m0_cmg2_2 = arith.constant 1024 : i32 + + %ub_lhs_m0_cmg2_2 = pto.castptr %c0_i64_m0_cmg2_2 : i64 -> !pto.ptr + %ub_rhs_m0_cmg2_2 = pto.castptr %c4096_i64_m0_cmg2_2 : i64 -> !pto.ptr + %ub_out_m0_cmg2_2 = pto.castptr %c8192_i64_m0_cmg2_2 : i64 -> !pto.ptr + + %false_m0_cmg2_2 = arith.constant false + pto.mte_gm_ub %arg0, %ub_lhs_m0_cmg2_2, %c0_i64_m0_cmg2_2, %c128_i64_m0_cmg2_2 + nburst(%c32_i64_m0_cmg2_2, %c128_i64_m0_cmg2_2, %c128_i64_m0_cmg2_2) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 + pto.mte_gm_ub %arg1, %ub_rhs_m0_cmg2_2, %c0_i64_m0_cmg2_2, %c128_i64_m0_cmg2_2 + nburst(%c32_i64_m0_cmg2_2, %c128_i64_m0_cmg2_2, %c128_i64_m0_cmg2_2) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 + + pto.set_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] + pto.wait_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] + + pto.vecscope { + %__m0_cmg2_2:1 = scf.for %offset_m0_cmg2_2 = %c0_m0_cmg2_2 to %c1024_m0_cmg2_2 step %c64_m0_cmg2_2 iter_args(%remaining_m0_cmg2_2 = %c1024_i32_m0_cmg2_2) -> (i32) { + %mask_m0_cmg2_2, %next_remaining_m0_cmg2_2 = pto.plt_b32 %remaining_m0_cmg2_2 : i32 -> !pto.mask, i32 + %lhs_m0_cmg2_2 = pto.vlds 
%ub_lhs_m0_cmg2_2[%offset_m0_cmg2_2] : !pto.ptr -> !pto.vreg<64xf32> + %rhs_m0_cmg2_2 = pto.vlds %ub_rhs_m0_cmg2_2[%offset_m0_cmg2_2] : !pto.ptr -> !pto.vreg<64xf32> + %diff_m0_cmg2_2 = pto.vsub %lhs_m0_cmg2_2, %rhs_m0_cmg2_2, %mask_m0_cmg2_2 : !pto.vreg<64xf32>, !pto.vreg<64xf32>, !pto.mask -> !pto.vreg<64xf32> + pto.vsts %diff_m0_cmg2_2, %ub_out_m0_cmg2_2[%offset_m0_cmg2_2], %mask_m0_cmg2_2 : !pto.vreg<64xf32>, !pto.ptr, !pto.mask + scf.yield %next_remaining_m0_cmg2_2 : i32 + } + } + + pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] + pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] + pto.mte_ub_gm %ub_out_m0_cmg2_2, %arg2, %c128_i64_m0_cmg2_2 + nburst(%c32_i64_m0_cmg2_2, %c128_i64_m0_cmg2_2, %c128_i64_m0_cmg2_2) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64 + pto.barrier #pto.pipe + + // merged from vsub_tail_kernel_2d via LaunchVsub_tail_kernel_2d + + %c0_m1_cmg2_2 = arith.constant 0 : index + %c64_m1_cmg2_2 = arith.constant 64 : index + %c1024_m1_cmg2_2 = arith.constant 1024 : index + %c0_i64_m1_cmg2_2 = arith.constant 0 : i64 + %c1_i64_m1_cmg2_2 = arith.constant 1 : i64 + %c32_i64_m1_cmg2_2 = arith.constant 32 : i64 + %c128_i64_m1_cmg2_2 = arith.constant 128 : i64 + %c4096_i64_m1_cmg2_2 = arith.constant 4096 : i64 + %c8192_i64_m1_cmg2_2 = arith.constant 8192 : i64 + %c1000_i32_m1_cmg2_2 = arith.constant 1000 : i32 + + %ub_lhs_m1_cmg2_2 = pto.castptr %c0_i64_m1_cmg2_2 : i64 -> !pto.ptr + %ub_rhs_m1_cmg2_2 = pto.castptr %c4096_i64_m1_cmg2_2 : i64 -> !pto.ptr + %ub_out_m1_cmg2_2 = pto.castptr %c8192_i64_m1_cmg2_2 : i64 -> !pto.ptr + + %false_m1_cmg2_2 = arith.constant false + pto.mte_gm_ub %arg3, %ub_lhs_m1_cmg2_2, %c0_i64_m1_cmg2_2, %c128_i64_m1_cmg2_2 + nburst(%c32_i64_m1_cmg2_2, %c128_i64_m1_cmg2_2, %c128_i64_m1_cmg2_2) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 + pto.mte_gm_ub %arg4, %ub_rhs_m1_cmg2_2, %c0_i64_m1_cmg2_2, %c128_i64_m1_cmg2_2 + nburst(%c32_i64_m1_cmg2_2, %c128_i64_m1_cmg2_2, %c128_i64_m1_cmg2_2) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64, 
i64 + + pto.set_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] + pto.wait_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] + + pto.vecscope { + %__m1_cmg2_2:1 = scf.for %offset_m1_cmg2_2 = %c0_m1_cmg2_2 to %c1024_m1_cmg2_2 step %c64_m1_cmg2_2 iter_args(%remaining_m1_cmg2_2 = %c1000_i32_m1_cmg2_2) -> (i32) { + %mask_m1_cmg2_2, %next_remaining_m1_cmg2_2 = pto.plt_b32 %remaining_m1_cmg2_2 : i32 -> !pto.mask, i32 + %lhs_m1_cmg2_2 = pto.vlds %ub_lhs_m1_cmg2_2[%offset_m1_cmg2_2] : !pto.ptr -> !pto.vreg<64xf32> + %rhs_m1_cmg2_2 = pto.vlds %ub_rhs_m1_cmg2_2[%offset_m1_cmg2_2] : !pto.ptr -> !pto.vreg<64xf32> + %diff_m1_cmg2_2 = pto.vsub %lhs_m1_cmg2_2, %rhs_m1_cmg2_2, %mask_m1_cmg2_2 : !pto.vreg<64xf32>, !pto.vreg<64xf32>, !pto.mask -> !pto.vreg<64xf32> + pto.vsts %diff_m1_cmg2_2, %ub_out_m1_cmg2_2[%offset_m1_cmg2_2], %mask_m1_cmg2_2 : !pto.vreg<64xf32>, !pto.ptr, !pto.mask + scf.yield %next_remaining_m1_cmg2_2 : i32 + } + } + + pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] + pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] + pto.mte_ub_gm %ub_out_m1_cmg2_2, %arg5, %c128_i64_m1_cmg2_2 + nburst(%c32_i64_m1_cmg2_2, %c128_i64_m1_cmg2_2, %c128_i64_m1_cmg2_2) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64 + pto.barrier #pto.pipe + } + return } } diff --git a/test/vpto/cases/micro-op/binary-vector/vmax/launch.cpp b/test/vpto/cases/micro-op/binary-vector/vmax/launch.cpp index 44e917951..c208efc91 100644 --- a/test/vpto/cases/micro-op/binary-vector/vmax/launch.cpp +++ b/test/vpto/cases/micro-op/binary-vector/vmax/launch.cpp @@ -5,11 +5,10 @@ // THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, // INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. // See LICENSE in the root of the software repository for the full text of the License. 
- +// Merged launch wrappers #ifndef __VEC_SCOPE__ #define __VEC_SCOPE__ #endif - #if defined(__CCE_AICORE__) && defined(__NPU_ARCH__) && (__NPU_ARCH__ == 2201) typedef struct { unsigned char v; } hifloat8_t; typedef struct { unsigned char v; } float8_e4m3_t; @@ -22,14 +21,8 @@ typedef struct { unsigned char v; } float4_e2m1x2_t; #if defined(__CCE_AICORE__) && defined(PTOAS_ENABLE_CCE_PRINT) #include #endif - #if !defined(__CCE_AICORE__) && !defined(TMRGSORT_HPP) -struct MrgSortExecutedNumList { - uint16_t mrgSortList0; - uint16_t mrgSortList1; - uint16_t mrgSortList2; - uint16_t mrgSortList3; -}; +struct MrgSortExecutedNumList { uint16_t mrgSortList0,mrgSortList1,mrgSortList2,mrgSortList3; }; #endif #ifndef __CPU_SIM #include "acl/acl.h" @@ -39,8 +32,20 @@ extern "C" __global__ [aicore] void max_kernel_2d(__gm__ float *v1, __gm__ float *v2, __gm__ float *v3); -void LaunchMax_kernel_2d(float *v1, float *v2, float *v3, void *stream) { - max_kernel_2d<<<1, nullptr, stream>>>((__gm__ float *)v1, - (__gm__ float *)v2, - (__gm__ float *)v3); +extern "C" __global__ [aicore] void vmax_deep_merged_kernel( + __gm__ float * arg0, + __gm__ float * arg1, + __gm__ float * arg2, + __gm__ float * arg3, + __gm__ float * arg4, + __gm__ float * arg5); + +void LaunchVmaxDeepMerged(float * p0, float * p1, float * p2, float * p3, float * p4, float * p5, void *stream) { + vmax_deep_merged_kernel<<<1, nullptr, stream>>>( + (__gm__ float *)p0, + (__gm__ float *)p1, + (__gm__ float *)p2, + (__gm__ float *)p3, + (__gm__ float *)p4, + (__gm__ float *)p5); } diff --git a/test/vpto/cases/micro-op/binary-vector/vmax/main.cpp b/test/vpto/cases/micro-op/binary-vector/vmax/main.cpp index 9713fd509..744b380c4 100644 --- a/test/vpto/cases/micro-op/binary-vector/vmax/main.cpp +++ b/test/vpto/cases/micro-op/binary-vector/vmax/main.cpp @@ -5,90 +5,89 @@ // THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, // INCLUDING BUT NOT LIMITED TO 
NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. // See LICENSE in the root of the software repository for the full text of the License. - -/** -Copyright (c) 2025 Huawei Technologies Co., Ltd. -*/ +// Merged vmax test case. #include "test_common.h" #include "acl/acl.h" #include #include - +#include using namespace PtoTestCommon; -#define ACL_CHECK(expr) \ - do { \ - const aclError _ret = (expr); \ - if (_ret != ACL_SUCCESS) { \ - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", #expr, \ - (int)_ret, __FILE__, __LINE__); \ - rc = 1; \ - goto cleanup; \ - } \ - } while (0) +#define ACL_CHECK(expr) do { const aclError _r=(expr); if(_r!=ACL_SUCCESS){std::fprintf(stderr,"[ERROR] %s:%d acle=%d\n",#expr,__LINE__,(int)_r);rc=1;goto cleanup;} }while(0) +#define FCK(expr,path) do{if(!(expr)){std::fprintf(stderr,"[ERROR] file:%s\n",path);rc=1;goto cleanup;}}while(0) -void LaunchMax_kernel_2d(float *v1, float *v2, float *v3, void *stream); +void LaunchVmaxDeepMerged(float * p0, float * p1, float * p2, float * p3, float * p4, float * p5, void *stream); int main() { - size_t elemCount_v1 = 1024; - size_t fileSize_v1 = elemCount_v1 * sizeof(float); - size_t elemCount_v2 = 1024; - size_t fileSize_v2 = elemCount_v2 * sizeof(float); - size_t elemCount_v3 = 1024; - size_t fileSize_v3 = elemCount_v3 * sizeof(float); - float *v1Host = nullptr; - float *v1Device = nullptr; - float *v2Host = nullptr; - float *v2Device = nullptr; - float *v3Host = nullptr; - float *v3Device = nullptr; - int rc = 0; - bool aclInited = false; - bool deviceSet = false; - int deviceId = 0; - aclrtStream stream = nullptr; + constexpr size_t SZ_f32 = 4096; + constexpr size_t SZ_tail = 4096; - ACL_CHECK(aclInit(nullptr)); - aclInited = true; - if (const char *envDevice = std::getenv("ACL_DEVICE_ID")) - deviceId = std::atoi(envDevice); - ACL_CHECK(aclrtSetDevice(deviceId)); - deviceSet = true; + float *h_f32_v1=nullptr, *d_f32_v1=nullptr; + float *h_f32_v2=nullptr, 
*d_f32_v2=nullptr; + float *h_f32_v3=nullptr, *d_f32_v3=nullptr; + float *h_tail_v1=nullptr, *d_tail_v1=nullptr; + float *h_tail_v2=nullptr, *d_tail_v2=nullptr; + float *h_tail_v3=nullptr, *d_tail_v3=nullptr; + int rc=0; bool aclInited=false,deviceSet=false; int deviceId=0; aclrtStream stream=nullptr; size_t fsz=0; + ACL_CHECK(aclInit(nullptr)); aclInited=true; + if(const char*e=std::getenv("ACL_DEVICE_ID")) deviceId=std::atoi(e); + ACL_CHECK(aclrtSetDevice(deviceId)); deviceSet=true; ACL_CHECK(aclrtCreateStream(&stream)); - ACL_CHECK(aclrtMallocHost((void **)(&v1Host), fileSize_v1)); - ACL_CHECK(aclrtMallocHost((void **)(&v2Host), fileSize_v2)); - ACL_CHECK(aclrtMallocHost((void **)(&v3Host), fileSize_v3)); - ACL_CHECK(aclrtMalloc((void **)&v1Device, fileSize_v1, ACL_MEM_MALLOC_HUGE_FIRST)); - ACL_CHECK(aclrtMalloc((void **)&v2Device, fileSize_v2, ACL_MEM_MALLOC_HUGE_FIRST)); - ACL_CHECK(aclrtMalloc((void **)&v3Device, fileSize_v3, ACL_MEM_MALLOC_HUGE_FIRST)); - ReadFile("./v1.bin", fileSize_v1, v1Host, fileSize_v1); - ReadFile("./v2.bin", fileSize_v2, v2Host, fileSize_v2); - ReadFile("./v3.bin", fileSize_v3, v3Host, fileSize_v3); - ACL_CHECK(aclrtMemcpy(v1Device, fileSize_v1, v1Host, fileSize_v1, - ACL_MEMCPY_HOST_TO_DEVICE)); - ACL_CHECK(aclrtMemcpy(v2Device, fileSize_v2, v2Host, fileSize_v2, - ACL_MEMCPY_HOST_TO_DEVICE)); - ACL_CHECK(aclrtMemcpy(v3Device, fileSize_v3, v3Host, fileSize_v3, - ACL_MEMCPY_HOST_TO_DEVICE)); - LaunchMax_kernel_2d(v1Device, v2Device, v3Device, stream); + ACL_CHECK(aclrtMallocHost((void**)&h_f32_v1,SZ_f32)); + ACL_CHECK(aclrtMallocHost((void**)&h_f32_v2,SZ_f32)); + ACL_CHECK(aclrtMallocHost((void**)&h_f32_v3,SZ_f32)); + ACL_CHECK(aclrtMallocHost((void**)&h_tail_v1,SZ_tail)); + ACL_CHECK(aclrtMallocHost((void**)&h_tail_v2,SZ_tail)); + ACL_CHECK(aclrtMallocHost((void**)&h_tail_v3,SZ_tail)); + ACL_CHECK(aclrtMalloc((void**)&d_f32_v1,SZ_f32,ACL_MEM_MALLOC_HUGE_FIRST)); + 
ACL_CHECK(aclrtMalloc((void**)&d_f32_v2,SZ_f32,ACL_MEM_MALLOC_HUGE_FIRST)); + ACL_CHECK(aclrtMalloc((void**)&d_f32_v3,SZ_f32,ACL_MEM_MALLOC_HUGE_FIRST)); + ACL_CHECK(aclrtMalloc((void**)&d_tail_v1,SZ_tail,ACL_MEM_MALLOC_HUGE_FIRST)); + ACL_CHECK(aclrtMalloc((void**)&d_tail_v2,SZ_tail,ACL_MEM_MALLOC_HUGE_FIRST)); + ACL_CHECK(aclrtMalloc((void**)&d_tail_v3,SZ_tail,ACL_MEM_MALLOC_HUGE_FIRST)); + fsz=SZ_f32; FCK(ReadFile("v1.bin",fsz,h_f32_v1,SZ_f32)&&fsz==SZ_f32,"v1.bin"); + fsz=SZ_f32; FCK(ReadFile("v2.bin",fsz,h_f32_v2,SZ_f32)&&fsz==SZ_f32,"v2.bin"); + fsz=SZ_f32; FCK(ReadFile("v3.bin",fsz,h_f32_v3,SZ_f32)&&fsz==SZ_f32,"v3.bin"); + fsz=SZ_tail; FCK(ReadFile("v1_tail.bin",fsz,h_tail_v1,SZ_tail)&&fsz==SZ_tail,"v1_tail.bin"); + fsz=SZ_tail; FCK(ReadFile("v2_tail.bin",fsz,h_tail_v2,SZ_tail)&&fsz==SZ_tail,"v2_tail.bin"); + fsz=SZ_tail; FCK(ReadFile("v3_tail.bin",fsz,h_tail_v3,SZ_tail)&&fsz==SZ_tail,"v3_tail.bin"); + ACL_CHECK(aclrtMemcpy(d_f32_v1,SZ_f32,h_f32_v1,SZ_f32,ACL_MEMCPY_HOST_TO_DEVICE)); + ACL_CHECK(aclrtMemcpy(d_f32_v2,SZ_f32,h_f32_v2,SZ_f32,ACL_MEMCPY_HOST_TO_DEVICE)); + ACL_CHECK(aclrtMemcpy(d_f32_v3,SZ_f32,h_f32_v3,SZ_f32,ACL_MEMCPY_HOST_TO_DEVICE)); + ACL_CHECK(aclrtMemcpy(d_tail_v1,SZ_tail,h_tail_v1,SZ_tail,ACL_MEMCPY_HOST_TO_DEVICE)); + ACL_CHECK(aclrtMemcpy(d_tail_v2,SZ_tail,h_tail_v2,SZ_tail,ACL_MEMCPY_HOST_TO_DEVICE)); + ACL_CHECK(aclrtMemcpy(d_tail_v3,SZ_tail,h_tail_v3,SZ_tail,ACL_MEMCPY_HOST_TO_DEVICE)); + LaunchVmaxDeepMerged( + d_f32_v1, + d_f32_v2, + d_f32_v3, + d_tail_v1, + d_tail_v2, + d_tail_v3, + stream + ); ACL_CHECK(aclrtSynchronizeStream(stream)); - ACL_CHECK(aclrtMemcpy(v3Host, fileSize_v3, v3Device, fileSize_v3, - ACL_MEMCPY_DEVICE_TO_HOST)); - WriteFile("./v3.bin", v3Host, fileSize_v3); + ACL_CHECK(aclrtMemcpy(h_f32_v3,SZ_f32,d_f32_v3,SZ_f32,ACL_MEMCPY_DEVICE_TO_HOST)); + ACL_CHECK(aclrtMemcpy(h_tail_v3,SZ_tail,d_tail_v3,SZ_tail,ACL_MEMCPY_DEVICE_TO_HOST)); + FCK(WriteFile("v3.bin",h_f32_v3,SZ_f32),"v3.bin"); + 
FCK(WriteFile("v3_tail.bin",h_tail_v3,SZ_tail),"v3_tail.bin"); cleanup: - aclrtFree(v1Device); - aclrtFree(v2Device); - aclrtFree(v3Device); - aclrtFreeHost(v1Host); - aclrtFreeHost(v2Host); - aclrtFreeHost(v3Host); - if (stream != nullptr) - aclrtDestroyStream(stream); - if (deviceSet) - aclrtResetDevice(deviceId); - if (aclInited) - aclFinalize(); + aclrtFree(d_f32_v1); + aclrtFree(d_f32_v2); + aclrtFree(d_f32_v3); + aclrtFree(d_tail_v1); + aclrtFree(d_tail_v2); + aclrtFree(d_tail_v3); + aclrtFreeHost(h_f32_v1); + aclrtFreeHost(h_f32_v2); + aclrtFreeHost(h_f32_v3); + aclrtFreeHost(h_tail_v1); + aclrtFreeHost(h_tail_v2); + aclrtFreeHost(h_tail_v3); + if(stream) aclrtDestroyStream(stream); + if(deviceSet) aclrtResetDevice(deviceId); + if(aclInited) aclFinalize(); return rc; } diff --git a/test/vpto/cases/micro-op/binary-vector/vmin-bf16/compare.py b/test/vpto/cases/micro-op/binary-vector/vmin-bf16/compare.py deleted file mode 100755 index 8e84eda9e..000000000 --- a/test/vpto/cases/micro-op/binary-vector/vmin-bf16/compare.py +++ /dev/null @@ -1,42 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) 2026 Huawei Technologies Co., Ltd. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. 
- -# case: micro-op/binary-vector/vmin-bf16 -# family: binary-vector -# target_ops: pto.vmin -# scenarios: core-bf16, full-mask -import os -import sys -import numpy as np - - -def compare_bin_prefix(golden_path, output_path, dtype, eps, count): - if not os.path.exists(golden_path) or not os.path.exists(output_path): - return False - golden = np.fromfile(golden_path, dtype=dtype, count=count) - output = np.fromfile(output_path, dtype=dtype, count=count) - return golden.shape == output.shape and np.allclose( - golden, output, atol=eps, rtol=eps, equal_nan=True - ) - - -def main(): - strict = os.getenv("COMPARE_STRICT", "1") != "0" - ok = compare_bin_prefix("golden_v3.bin", "v3.bin", np.uint16, 0, 1024) - if not ok: - if strict: - print("[ERROR] compare failed") - sys.exit(2) - print("[WARN] compare failed (non-gating)") - return - print("[INFO] compare passed") - - -if __name__ == "__main__": - main() diff --git a/test/vpto/cases/micro-op/binary-vector/vmin-bf16/golden.py b/test/vpto/cases/micro-op/binary-vector/vmin-bf16/golden.py deleted file mode 100755 index a399eeb9d..000000000 --- a/test/vpto/cases/micro-op/binary-vector/vmin-bf16/golden.py +++ /dev/null @@ -1,61 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) 2026 Huawei Technologies Co., Ltd. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. 
- -# case: micro-op/binary-vector/vmin-bf16 -# family: binary-vector -# target_ops: pto.vmin -# scenarios: core-bf16, full-mask -# coding=utf-8 - -import argparse -from pathlib import Path - -import numpy as np - - -ELEMS = 1024 -SEED = 19 - - -def f32_to_bf16_bits(values: np.ndarray) -> np.ndarray: - wide = values.astype(np.float32, copy=False).view(np.uint32) - rounding = np.uint32(0x7FFF) + ((wide >> 16) & np.uint32(1)) - return ((wide + rounding) >> 16).astype(np.uint16) - - -def bf16_bits_to_f32(bits: np.ndarray) -> np.ndarray: - return (bits.astype(np.uint32) << 16).view(np.float32) - - -def generate(output_dir: Path, seed: int) -> None: - rng = np.random.default_rng(seed) - v1_f32 = rng.uniform(-4.0, 4.0, size=ELEMS).astype(np.float32) - v2_f32 = rng.uniform(-4.0, 4.0, size=ELEMS).astype(np.float32) - v1 = f32_to_bf16_bits(v1_f32) - v2 = f32_to_bf16_bits(v2_f32) - v3 = np.zeros(ELEMS, dtype=np.uint16) - golden_v3 = f32_to_bf16_bits(np.minimum(bf16_bits_to_f32(v1), bf16_bits_to_f32(v2))) - - output_dir.mkdir(parents=True, exist_ok=True) - v1.tofile(output_dir / "v1.bin") - v2.tofile(output_dir / "v2.bin") - v3.tofile(output_dir / "v3.bin") - golden_v3.tofile(output_dir / "golden_v3.bin") - - -def main() -> None: - parser = argparse.ArgumentParser() - parser.add_argument("--output-dir", type=Path, default=Path(".")) - parser.add_argument("--seed", type=int, default=SEED) - args = parser.parse_args() - generate(args.output_dir, args.seed) - - -if __name__ == "__main__": - main() diff --git a/test/vpto/cases/micro-op/binary-vector/vmin-bf16/kernel.pto b/test/vpto/cases/micro-op/binary-vector/vmin-bf16/kernel.pto deleted file mode 100644 index 046c88d23..000000000 --- a/test/vpto/cases/micro-op/binary-vector/vmin-bf16/kernel.pto +++ /dev/null @@ -1,53 +0,0 @@ -// ----------------------------------------------------------------------------- -// case: micro-op/binary-vector/vmin-bf16 -// family: binary-vector -// target_ops: pto.vmin -// scenarios: core-bf16, 
full-mask -// ----------------------------------------------------------------------------- -module attributes {pto.target_arch = "a5", pto.kernel_kind = #pto.kernel_kind} { - func.func @vmin_bf16_kernel(%arg0: !pto.ptr, %arg1: !pto.ptr, - %arg2: !pto.ptr) attributes {pto.kernel} { - %c0 = arith.constant 0 : index - %c1024 = arith.constant 1024 : index - %c128 = arith.constant 128 : index - %c0_i64 = arith.constant 0 : i64 - %c1_i64 = arith.constant 1 : i64 - %c32_i64 = arith.constant 32 : i64 - %c64_i64 = arith.constant 64 : i64 - %c2048_i64 = arith.constant 2048 : i64 - %c4096_i64 = arith.constant 4096 : i64 - - %ub_lhs = pto.castptr %c0_i64 : i64 -> !pto.ptr - %ub_rhs = pto.castptr %c2048_i64 : i64 -> !pto.ptr - %ub_out = pto.castptr %c4096_i64 : i64 -> !pto.ptr - - %false = arith.constant false - pto.mte_gm_ub %arg0, %ub_lhs, %c0_i64, %c64_i64 - nburst(%c32_i64, %c64_i64, %c64_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 - pto.mte_gm_ub %arg1, %ub_rhs, %c0_i64, %c64_i64 - nburst(%c32_i64, %c64_i64, %c64_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 - - pto.set_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] - pto.wait_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] - - pto.vecscope { - %mask = pto.pset_b16 "PAT_ALL" : !pto.mask - scf.for %offset = %c0 to %c1024 step %c128 { - %lhs = pto.vlds %ub_lhs[%offset] : !pto.ptr -> !pto.vreg<128xbf16> - %rhs = pto.vlds %ub_rhs[%offset] : !pto.ptr -> !pto.vreg<128xbf16> - %out = pto.vmin %lhs, %rhs, %mask : !pto.vreg<128xbf16>, !pto.vreg<128xbf16>, !pto.mask -> !pto.vreg<128xbf16> - pto.vsts %out, %ub_out[%offset], %mask : !pto.vreg<128xbf16>, !pto.ptr, !pto.mask - } - } - - pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.mte_ub_gm %ub_out, %arg2, %c64_i64 - nburst(%c32_i64, %c64_i64, %c64_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64 - pto.barrier #pto.pipe - return - } -} diff --git a/test/vpto/cases/micro-op/binary-vector/vmin-bf16/launch.cpp 
b/test/vpto/cases/micro-op/binary-vector/vmin-bf16/launch.cpp deleted file mode 100644 index 1f374fb36..000000000 --- a/test/vpto/cases/micro-op/binary-vector/vmin-bf16/launch.cpp +++ /dev/null @@ -1,52 +0,0 @@ -// Copyright (c) 2026 Huawei Technologies Co., Ltd. -// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). -// Please refer to the License for details. You may not use this file except in compliance with the License. -// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -// See LICENSE in the root of the software repository for the full text of the License. - -// ----------------------------------------------------------------------------- -// case: micro-op/binary-vector/vmin-bf16 -// family: binary-vector -// target_ops: pto.vmin -// scenarios: core-bf16, full-mask -// ----------------------------------------------------------------------------- -#ifndef __VEC_SCOPE__ -#define __VEC_SCOPE__ -#endif - -#if defined(__CCE_AICORE__) && defined(__NPU_ARCH__) && (__NPU_ARCH__ == 2201) -typedef struct { unsigned char v; } hifloat8_t; -typedef struct { unsigned char v; } float8_e4m3_t; -typedef struct { unsigned char v; } float8_e5m2_t; -typedef struct { unsigned char v; } float8_e8m0_t; -typedef struct { unsigned char v; } float4_e1m2x2_t; -typedef struct { unsigned char v; } float4_e2m1x2_t; -#endif -#include -#if defined(__CCE_AICORE__) && defined(PTOAS_ENABLE_CCE_PRINT) -#include -#endif - -#if !defined(__CCE_AICORE__) && !defined(TMRGSORT_HPP) -struct MrgSortExecutedNumList { - uint16_t mrgSortList0; - uint16_t mrgSortList1; - uint16_t mrgSortList2; - uint16_t mrgSortList3; -}; -#endif -#ifndef __CPU_SIM -#include "acl/acl.h" -#endif - -extern "C" __global__ [aicore] void 
vmin_bf16_kernel(__gm__ bfloat16_t *v1, - __gm__ bfloat16_t *v2, - __gm__ bfloat16_t *v3); - -void LaunchVmin_bf16_kernel(uint16_t *v1, uint16_t *v2, uint16_t *v3, void *stream) { - vmin_bf16_kernel<<<1, nullptr, stream>>>((__gm__ bfloat16_t *)v1, - (__gm__ bfloat16_t *)v2, - (__gm__ bfloat16_t *)v3); -} diff --git a/test/vpto/cases/micro-op/binary-vector/vmin-bf16/main.cpp b/test/vpto/cases/micro-op/binary-vector/vmin-bf16/main.cpp deleted file mode 100644 index 01fb803f3..000000000 --- a/test/vpto/cases/micro-op/binary-vector/vmin-bf16/main.cpp +++ /dev/null @@ -1,102 +0,0 @@ -// Copyright (c) 2026 Huawei Technologies Co., Ltd. -// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). -// Please refer to the License for details. You may not use this file except in compliance with the License. -// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -// See LICENSE in the root of the software repository for the full text of the License. - -// ----------------------------------------------------------------------------- -// case: micro-op/binary-vector/vmin-bf16 -// family: binary-vector -// target_ops: pto.vmin -// scenarios: core-bf16, full-mask -// ----------------------------------------------------------------------------- -/** -Copyright (c) 2025 Huawei Technologies Co., Ltd. 
-*/ - -#include "test_common.h" -#include "acl/acl.h" -#include -#include -#include - -using namespace PtoTestCommon; - -#define ACL_CHECK(expr) \ - do { \ - const aclError _ret = (expr); \ - if (_ret != ACL_SUCCESS) { \ - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", #expr, \ - (int)_ret, __FILE__, __LINE__); \ - rc = 1; \ - goto cleanup; \ - } \ - } while (0) - -void LaunchVmin_bf16_kernel(uint16_t *v1, uint16_t *v2, uint16_t *v3, - void *stream); - -int main() { - size_t elemCount_v1 = 1024; - size_t fileSize_v1 = elemCount_v1 * sizeof(uint16_t); - size_t elemCount_v2 = 1024; - size_t fileSize_v2 = elemCount_v2 * sizeof(uint16_t); - size_t elemCount_v3 = 1024; - size_t fileSize_v3 = elemCount_v3 * sizeof(uint16_t); - uint16_t *v1Host = nullptr; - uint16_t *v1Device = nullptr; - uint16_t *v2Host = nullptr; - uint16_t *v2Device = nullptr; - uint16_t *v3Host = nullptr; - uint16_t *v3Device = nullptr; - int rc = 0; - bool aclInited = false; - bool deviceSet = false; - int deviceId = 0; - aclrtStream stream = nullptr; - - ACL_CHECK(aclInit(nullptr)); - aclInited = true; - if (const char *envDevice = std::getenv("ACL_DEVICE_ID")) - deviceId = std::atoi(envDevice); - ACL_CHECK(aclrtSetDevice(deviceId)); - deviceSet = true; - ACL_CHECK(aclrtCreateStream(&stream)); - ACL_CHECK(aclrtMallocHost((void **)(&v1Host), fileSize_v1)); - ACL_CHECK(aclrtMallocHost((void **)(&v2Host), fileSize_v2)); - ACL_CHECK(aclrtMallocHost((void **)(&v3Host), fileSize_v3)); - ACL_CHECK(aclrtMalloc((void **)&v1Device, fileSize_v1, ACL_MEM_MALLOC_HUGE_FIRST)); - ACL_CHECK(aclrtMalloc((void **)&v2Device, fileSize_v2, ACL_MEM_MALLOC_HUGE_FIRST)); - ACL_CHECK(aclrtMalloc((void **)&v3Device, fileSize_v3, ACL_MEM_MALLOC_HUGE_FIRST)); - ReadFile("./v1.bin", fileSize_v1, v1Host, fileSize_v1); - ReadFile("./v2.bin", fileSize_v2, v2Host, fileSize_v2); - ReadFile("./v3.bin", fileSize_v3, v3Host, fileSize_v3); - ACL_CHECK(aclrtMemcpy(v1Device, fileSize_v1, v1Host, fileSize_v1, - 
ACL_MEMCPY_HOST_TO_DEVICE)); - ACL_CHECK(aclrtMemcpy(v2Device, fileSize_v2, v2Host, fileSize_v2, - ACL_MEMCPY_HOST_TO_DEVICE)); - ACL_CHECK(aclrtMemcpy(v3Device, fileSize_v3, v3Host, fileSize_v3, - ACL_MEMCPY_HOST_TO_DEVICE)); - LaunchVmin_bf16_kernel(v1Device, v2Device, v3Device, stream); - ACL_CHECK(aclrtSynchronizeStream(stream)); - ACL_CHECK(aclrtMemcpy(v3Host, fileSize_v3, v3Device, fileSize_v3, - ACL_MEMCPY_DEVICE_TO_HOST)); - WriteFile("./v3.bin", v3Host, fileSize_v3); - -cleanup: - aclrtFree(v1Device); - aclrtFree(v2Device); - aclrtFree(v3Device); - aclrtFreeHost(v1Host); - aclrtFreeHost(v2Host); - aclrtFreeHost(v3Host); - if (stream != nullptr) - aclrtDestroyStream(stream); - if (deviceSet) - aclrtResetDevice(deviceId); - if (aclInited) - aclFinalize(); - return rc; -} diff --git a/test/vpto/cases/micro-op/binary-vector/vmin-f16/compare.py b/test/vpto/cases/micro-op/binary-vector/vmin-f16/compare.py deleted file mode 100755 index d4fe300db..000000000 --- a/test/vpto/cases/micro-op/binary-vector/vmin-f16/compare.py +++ /dev/null @@ -1,42 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) 2026 Huawei Technologies Co., Ltd. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. 
- -# case: micro-op/binary-vector/vmin-f16 -# family: binary-vector -# target_ops: pto.vmin -# scenarios: core-f16, full-mask -import os -import sys -import numpy as np - - -def compare_bin_prefix(golden_path, output_path, dtype, eps, count): - if not os.path.exists(golden_path) or not os.path.exists(output_path): - return False - golden = np.fromfile(golden_path, dtype=dtype, count=count) - output = np.fromfile(output_path, dtype=dtype, count=count) - return golden.shape == output.shape and np.allclose( - golden, output, atol=eps, rtol=eps, equal_nan=True - ) - - -def main(): - strict = os.getenv("COMPARE_STRICT", "1") != "0" - ok = compare_bin_prefix("golden_v3.bin", "v3.bin", np.float16, 5e-3, 1024) - if not ok: - if strict: - print("[ERROR] compare failed") - sys.exit(2) - print("[WARN] compare failed (non-gating)") - return - print("[INFO] compare passed") - - -if __name__ == "__main__": - main() diff --git a/test/vpto/cases/micro-op/binary-vector/vmin-f16/golden.py b/test/vpto/cases/micro-op/binary-vector/vmin-f16/golden.py deleted file mode 100755 index 4ee39a4b3..000000000 --- a/test/vpto/cases/micro-op/binary-vector/vmin-f16/golden.py +++ /dev/null @@ -1,48 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) 2026 Huawei Technologies Co., Ltd. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. 
- -# case: micro-op/binary-vector/vmin-f16 -# family: binary-vector -# target_ops: pto.vmin -# scenarios: core-f16, full-mask -# coding=utf-8 - -import argparse -from pathlib import Path - -import numpy as np - - -ROWS = 32 -COLS = 32 -SEED = 19 -def generate(output_dir: Path, seed: int) -> None: - rng = np.random.default_rng(seed) - v1 = rng.uniform(-8.0, 8.0, size=(ROWS, COLS)).astype(np.float16) - v2 = rng.uniform(-8.0, 8.0, size=(ROWS, COLS)).astype(np.float16) - v3 = np.zeros((ROWS, COLS), dtype=np.float16) - golden_v3 = np.minimum(v1.astype(np.float32), v2.astype(np.float32)).astype(np.float16) - - output_dir.mkdir(parents=True, exist_ok=True) - v1.reshape(-1).tofile(output_dir / "v1.bin") - v2.reshape(-1).tofile(output_dir / "v2.bin") - v3.reshape(-1).tofile(output_dir / "v3.bin") - golden_v3.reshape(-1).tofile(output_dir / "golden_v3.bin") - - -def main() -> None: - parser = argparse.ArgumentParser() - parser.add_argument("--output-dir", type=Path, default=Path(".")) - parser.add_argument("--seed", type=int, default=SEED) - args = parser.parse_args() - generate(args.output_dir, args.seed) - - -if __name__ == "__main__": - main() diff --git a/test/vpto/cases/micro-op/binary-vector/vmin-f16/kernel.pto b/test/vpto/cases/micro-op/binary-vector/vmin-f16/kernel.pto deleted file mode 100644 index 05368522d..000000000 --- a/test/vpto/cases/micro-op/binary-vector/vmin-f16/kernel.pto +++ /dev/null @@ -1,53 +0,0 @@ -// ----------------------------------------------------------------------------- -// case: micro-op/binary-vector/vmin-f16 -// family: binary-vector -// target_ops: pto.vmin -// scenarios: core-f16, full-mask -// ----------------------------------------------------------------------------- -module attributes {pto.target_arch = "a5", pto.kernel_kind = #pto.kernel_kind} { - func.func @vmin_f16_kernel(%arg0: !pto.ptr, %arg1: !pto.ptr, - %arg2: !pto.ptr) attributes {pto.kernel} { - %c0 = arith.constant 0 : index - %c1024 = arith.constant 1024 : index - %c128 = 
arith.constant 128 : index - %c0_i64 = arith.constant 0 : i64 - %c1_i64 = arith.constant 1 : i64 - %c32_i64 = arith.constant 32 : i64 - %c64_i64 = arith.constant 64 : i64 - %c2048_i64 = arith.constant 2048 : i64 - %c4096_i64 = arith.constant 4096 : i64 - - %ub_lhs = pto.castptr %c0_i64 : i64 -> !pto.ptr - %ub_rhs = pto.castptr %c2048_i64 : i64 -> !pto.ptr - %ub_out = pto.castptr %c4096_i64 : i64 -> !pto.ptr - - %false = arith.constant false - pto.mte_gm_ub %arg0, %ub_lhs, %c0_i64, %c64_i64 - nburst(%c32_i64, %c64_i64, %c64_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 - pto.mte_gm_ub %arg1, %ub_rhs, %c0_i64, %c64_i64 - nburst(%c32_i64, %c64_i64, %c64_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 - - pto.set_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] - pto.wait_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] - - pto.vecscope { - %mask = pto.pset_b16 "PAT_ALL" : !pto.mask - scf.for %offset = %c0 to %c1024 step %c128 { - %lhs = pto.vlds %ub_lhs[%offset] : !pto.ptr -> !pto.vreg<128xf16> - %rhs = pto.vlds %ub_rhs[%offset] : !pto.ptr -> !pto.vreg<128xf16> - %out = pto.vmin %lhs, %rhs, %mask : !pto.vreg<128xf16>, !pto.vreg<128xf16>, !pto.mask -> !pto.vreg<128xf16> - pto.vsts %out, %ub_out[%offset], %mask : !pto.vreg<128xf16>, !pto.ptr, !pto.mask - } - } - - pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.mte_ub_gm %ub_out, %arg2, %c64_i64 - nburst(%c32_i64, %c64_i64, %c64_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64 - pto.barrier #pto.pipe - return - } -} diff --git a/test/vpto/cases/micro-op/binary-vector/vmin-f16/launch.cpp b/test/vpto/cases/micro-op/binary-vector/vmin-f16/launch.cpp deleted file mode 100644 index 5151be743..000000000 --- a/test/vpto/cases/micro-op/binary-vector/vmin-f16/launch.cpp +++ /dev/null @@ -1,52 +0,0 @@ -// Copyright (c) 2026 Huawei Technologies Co., Ltd. 
-// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). -// Please refer to the License for details. You may not use this file except in compliance with the License. -// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -// See LICENSE in the root of the software repository for the full text of the License. - -// ----------------------------------------------------------------------------- -// case: micro-op/binary-vector/vmin-f16 -// family: binary-vector -// target_ops: pto.vmin -// scenarios: core-f16, full-mask -// ----------------------------------------------------------------------------- -#ifndef __VEC_SCOPE__ -#define __VEC_SCOPE__ -#endif - -#if defined(__CCE_AICORE__) && defined(__NPU_ARCH__) && (__NPU_ARCH__ == 2201) -typedef struct { unsigned char v; } hifloat8_t; -typedef struct { unsigned char v; } float8_e4m3_t; -typedef struct { unsigned char v; } float8_e5m2_t; -typedef struct { unsigned char v; } float8_e8m0_t; -typedef struct { unsigned char v; } float4_e1m2x2_t; -typedef struct { unsigned char v; } float4_e2m1x2_t; -#endif -#include -#if defined(__CCE_AICORE__) && defined(PTOAS_ENABLE_CCE_PRINT) -#include -#endif - -#if !defined(__CCE_AICORE__) && !defined(TMRGSORT_HPP) -struct MrgSortExecutedNumList { - uint16_t mrgSortList0; - uint16_t mrgSortList1; - uint16_t mrgSortList2; - uint16_t mrgSortList3; -}; -#endif -#ifndef __CPU_SIM -#include "acl/acl.h" -#endif - -extern "C" __global__ [aicore] void vmin_f16_kernel(__gm__ half *v1, - __gm__ half *v2, - __gm__ half *v3); - -void LaunchVmin_f16_kernel(uint16_t *v1, uint16_t *v2, uint16_t *v3, void *stream) { - vmin_f16_kernel<<<1, nullptr, stream>>>((__gm__ half *)v1, - (__gm__ half *)v2, - (__gm__ half *)v3); -} diff 
--git a/test/vpto/cases/micro-op/binary-vector/vmin-f16/main.cpp b/test/vpto/cases/micro-op/binary-vector/vmin-f16/main.cpp deleted file mode 100644 index 374d8a322..000000000 --- a/test/vpto/cases/micro-op/binary-vector/vmin-f16/main.cpp +++ /dev/null @@ -1,102 +0,0 @@ -// Copyright (c) 2026 Huawei Technologies Co., Ltd. -// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). -// Please refer to the License for details. You may not use this file except in compliance with the License. -// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -// See LICENSE in the root of the software repository for the full text of the License. - -// ----------------------------------------------------------------------------- -// case: micro-op/binary-vector/vmin-f16 -// family: binary-vector -// target_ops: pto.vmin -// scenarios: core-f16, full-mask -// ----------------------------------------------------------------------------- -/** -Copyright (c) 2025 Huawei Technologies Co., Ltd. 
-*/ - -#include "test_common.h" -#include "acl/acl.h" -#include -#include -#include - -using namespace PtoTestCommon; - -#define ACL_CHECK(expr) \ - do { \ - const aclError _ret = (expr); \ - if (_ret != ACL_SUCCESS) { \ - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", #expr, \ - (int)_ret, __FILE__, __LINE__); \ - rc = 1; \ - goto cleanup; \ - } \ - } while (0) - -void LaunchVmin_f16_kernel(uint16_t *v1, uint16_t *v2, uint16_t *v3, - void *stream); - -int main() { - size_t elemCount_v1 = 1024; - size_t fileSize_v1 = elemCount_v1 * sizeof(uint16_t); - size_t elemCount_v2 = 1024; - size_t fileSize_v2 = elemCount_v2 * sizeof(uint16_t); - size_t elemCount_v3 = 1024; - size_t fileSize_v3 = elemCount_v3 * sizeof(uint16_t); - uint16_t *v1Host = nullptr; - uint16_t *v1Device = nullptr; - uint16_t *v2Host = nullptr; - uint16_t *v2Device = nullptr; - uint16_t *v3Host = nullptr; - uint16_t *v3Device = nullptr; - int rc = 0; - bool aclInited = false; - bool deviceSet = false; - int deviceId = 0; - aclrtStream stream = nullptr; - - ACL_CHECK(aclInit(nullptr)); - aclInited = true; - if (const char *envDevice = std::getenv("ACL_DEVICE_ID")) - deviceId = std::atoi(envDevice); - ACL_CHECK(aclrtSetDevice(deviceId)); - deviceSet = true; - ACL_CHECK(aclrtCreateStream(&stream)); - ACL_CHECK(aclrtMallocHost((void **)(&v1Host), fileSize_v1)); - ACL_CHECK(aclrtMallocHost((void **)(&v2Host), fileSize_v2)); - ACL_CHECK(aclrtMallocHost((void **)(&v3Host), fileSize_v3)); - ACL_CHECK(aclrtMalloc((void **)&v1Device, fileSize_v1, ACL_MEM_MALLOC_HUGE_FIRST)); - ACL_CHECK(aclrtMalloc((void **)&v2Device, fileSize_v2, ACL_MEM_MALLOC_HUGE_FIRST)); - ACL_CHECK(aclrtMalloc((void **)&v3Device, fileSize_v3, ACL_MEM_MALLOC_HUGE_FIRST)); - ReadFile("./v1.bin", fileSize_v1, v1Host, fileSize_v1); - ReadFile("./v2.bin", fileSize_v2, v2Host, fileSize_v2); - ReadFile("./v3.bin", fileSize_v3, v3Host, fileSize_v3); - ACL_CHECK(aclrtMemcpy(v1Device, fileSize_v1, v1Host, fileSize_v1, - 
ACL_MEMCPY_HOST_TO_DEVICE)); - ACL_CHECK(aclrtMemcpy(v2Device, fileSize_v2, v2Host, fileSize_v2, - ACL_MEMCPY_HOST_TO_DEVICE)); - ACL_CHECK(aclrtMemcpy(v3Device, fileSize_v3, v3Host, fileSize_v3, - ACL_MEMCPY_HOST_TO_DEVICE)); - LaunchVmin_f16_kernel(v1Device, v2Device, v3Device, stream); - ACL_CHECK(aclrtSynchronizeStream(stream)); - ACL_CHECK(aclrtMemcpy(v3Host, fileSize_v3, v3Device, fileSize_v3, - ACL_MEMCPY_DEVICE_TO_HOST)); - WriteFile("./v3.bin", v3Host, fileSize_v3); - -cleanup: - aclrtFree(v1Device); - aclrtFree(v2Device); - aclrtFree(v3Device); - aclrtFreeHost(v1Host); - aclrtFreeHost(v2Host); - aclrtFreeHost(v3Host); - if (stream != nullptr) - aclrtDestroyStream(stream); - if (deviceSet) - aclrtResetDevice(deviceId); - if (aclInited) - aclFinalize(); - return rc; -} diff --git a/test/vpto/cases/micro-op/binary-vector/vmin-f32-exceptional/compare.py b/test/vpto/cases/micro-op/binary-vector/vmin-f32-exceptional/compare.py deleted file mode 100644 index a5f14dabc..000000000 --- a/test/vpto/cases/micro-op/binary-vector/vmin-f32-exceptional/compare.py +++ /dev/null @@ -1,37 +0,0 @@ -#!/usr/bin/python3 -# Copyright (c) 2026 Huawei Technologies Co., Ltd. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. 
- - -import os -import sys -import numpy as np - - -def compare_bin(golden_path, output_path, dtype, eps): - if not os.path.exists(golden_path) or not os.path.exists(output_path): - return False - golden = np.fromfile(golden_path, dtype=dtype) - output = np.fromfile(output_path, dtype=dtype) - return golden.shape == output.shape and np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True) - - -def main(): - strict = os.getenv("COMPARE_STRICT", "1") != "0" - ok = compare_bin("golden_v3.bin", "v3.bin", np.float32, 1e-4) - if not ok: - if strict: - print("[ERROR] compare failed") - sys.exit(2) - print("[WARN] compare failed (non-gating)") - return - print("[INFO] compare passed") - - -if __name__ == "__main__": - main() diff --git a/test/vpto/cases/micro-op/binary-vector/vmin-f32-exceptional/golden.py b/test/vpto/cases/micro-op/binary-vector/vmin-f32-exceptional/golden.py deleted file mode 100644 index 4d8d2f34a..000000000 --- a/test/vpto/cases/micro-op/binary-vector/vmin-f32-exceptional/golden.py +++ /dev/null @@ -1,54 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) 2026 Huawei Technologies Co., Ltd. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. 
- -# coding=utf-8 - -import argparse -from pathlib import Path - -import numpy as np - - -ROWS = 32 -COLS = 32 -SEED = 19 - - -def generate(output_dir: Path, seed: int) -> None: - del seed - lhs = np.array( - [-np.inf, -7.5, -0.0, 0.0, 1.0, np.inf, np.nan, 3.5], - dtype=np.float32, - ) - rhs = np.array( - [np.inf, -2.5, 0.0, -0.0, -1.0, 1.0, 1.0, np.nan], - dtype=np.float32, - ) - v1 = np.resize(lhs, ROWS * COLS).reshape(ROWS, COLS).astype(np.float32) - v2 = np.resize(rhs, ROWS * COLS).reshape(ROWS, COLS).astype(np.float32) - v3 = np.zeros((ROWS, COLS), dtype=np.float32) - golden_v3 = np.minimum(v1, v2).astype(np.float32, copy=False) - - output_dir.mkdir(parents=True, exist_ok=True) - v1.reshape(-1).tofile(output_dir / "v1.bin") - v2.reshape(-1).tofile(output_dir / "v2.bin") - v3.reshape(-1).tofile(output_dir / "v3.bin") - golden_v3.reshape(-1).tofile(output_dir / "golden_v3.bin") - - -def main() -> None: - parser = argparse.ArgumentParser() - parser.add_argument("--output-dir", type=Path, default=Path(".")) - parser.add_argument("--seed", type=int, default=SEED) - args = parser.parse_args() - generate(args.output_dir, args.seed) - - -if __name__ == "__main__": - main() diff --git a/test/vpto/cases/micro-op/binary-vector/vmin-f32-exceptional/kernel.pto b/test/vpto/cases/micro-op/binary-vector/vmin-f32-exceptional/kernel.pto deleted file mode 100644 index cfbd2a17e..000000000 --- a/test/vpto/cases/micro-op/binary-vector/vmin-f32-exceptional/kernel.pto +++ /dev/null @@ -1,50 +0,0 @@ -module attributes {pto.target_arch = "a5", pto.kernel_kind = #pto.kernel_kind} { - func.func @min_kernel_2d(%arg0: !pto.ptr, %arg1: !pto.ptr, - %arg2: !pto.ptr) attributes {pto.kernel} { - %c0 = arith.constant 0 : index - %c1 = arith.constant 1 : index - %c64 = arith.constant 64 : index - %c1024 = arith.constant 1024 : index - %c0_i64 = arith.constant 0 : i64 - %c1_i64 = arith.constant 1 : i64 - %c32_i64 = arith.constant 32 : i64 - %c128_i64 = arith.constant 128 : i64 - %c4096_i64 = 
arith.constant 4096 : i64 - %c8192_i64 = arith.constant 8192 : i64 - %c1024_i32 = arith.constant 1024 : i32 - - %ub_lhs = pto.castptr %c0_i64 : i64 -> !pto.ptr - %ub_rhs = pto.castptr %c4096_i64 : i64 -> !pto.ptr - %ub_out = pto.castptr %c8192_i64 : i64 -> !pto.ptr - - %false = arith.constant false - pto.mte_gm_ub %arg0, %ub_lhs, %c0_i64, %c128_i64 - nburst(%c32_i64, %c128_i64, %c128_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 - pto.mte_gm_ub %arg1, %ub_rhs, %c0_i64, %c128_i64 - nburst(%c32_i64, %c128_i64, %c128_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 - - pto.set_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] - pto.wait_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] - - pto.vecscope { - %_:1 = scf.for %offset = %c0 to %c1024 step %c64 iter_args(%remaining = %c1024_i32) -> (i32) { - %mask, %next_remaining = pto.plt_b32 %remaining : i32 -> !pto.mask, i32 - %lhs = pto.vlds %ub_lhs[%offset] : !pto.ptr -> !pto.vreg<64xf32> - %rhs = pto.vlds %ub_rhs[%offset] : !pto.ptr -> !pto.vreg<64xf32> - %out = pto.vmin %lhs, %rhs, %mask : !pto.vreg<64xf32>, !pto.vreg<64xf32>, !pto.mask -> !pto.vreg<64xf32> - pto.vsts %out, %ub_out[%offset], %mask : !pto.vreg<64xf32>, !pto.ptr, !pto.mask - scf.yield %next_remaining : i32 - } - } - - pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.mte_ub_gm %ub_out, %arg2, %c128_i64 - nburst(%c32_i64, %c128_i64, %c128_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64 - pto.barrier #pto.pipe - return - } -} diff --git a/test/vpto/cases/micro-op/binary-vector/vmin-f32-exceptional/launch.cpp b/test/vpto/cases/micro-op/binary-vector/vmin-f32-exceptional/launch.cpp deleted file mode 100644 index f2c64c6a6..000000000 --- a/test/vpto/cases/micro-op/binary-vector/vmin-f32-exceptional/launch.cpp +++ /dev/null @@ -1,46 +0,0 @@ -// Copyright (c) 2026 Huawei Technologies Co., Ltd. 
-// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). -// Please refer to the License for details. You may not use this file except in compliance with the License. -// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -// See LICENSE in the root of the software repository for the full text of the License. - -#ifndef __VEC_SCOPE__ -#define __VEC_SCOPE__ -#endif - -#if defined(__CCE_AICORE__) && defined(__NPU_ARCH__) && (__NPU_ARCH__ == 2201) -typedef struct { unsigned char v; } hifloat8_t; -typedef struct { unsigned char v; } float8_e4m3_t; -typedef struct { unsigned char v; } float8_e5m2_t; -typedef struct { unsigned char v; } float8_e8m0_t; -typedef struct { unsigned char v; } float4_e1m2x2_t; -typedef struct { unsigned char v; } float4_e2m1x2_t; -#endif -#include -#if defined(__CCE_AICORE__) && defined(PTOAS_ENABLE_CCE_PRINT) -#include -#endif - -#if !defined(__CCE_AICORE__) && !defined(TMRGSORT_HPP) -struct MrgSortExecutedNumList { - uint16_t mrgSortList0; - uint16_t mrgSortList1; - uint16_t mrgSortList2; - uint16_t mrgSortList3; -}; -#endif -#ifndef __CPU_SIM -#include "acl/acl.h" -#endif - -extern "C" __global__ [aicore] void min_kernel_2d(__gm__ float *v1, - __gm__ float *v2, - __gm__ float *v3); - -void LaunchMin_kernel_2d(float *v1, float *v2, float *v3, void *stream) { - min_kernel_2d<<<1, nullptr, stream>>>((__gm__ float *)v1, - (__gm__ float *)v2, - (__gm__ float *)v3); -} diff --git a/test/vpto/cases/micro-op/binary-vector/vmin-f32-exceptional/main.cpp b/test/vpto/cases/micro-op/binary-vector/vmin-f32-exceptional/main.cpp deleted file mode 100644 index b952b76a0..000000000 --- a/test/vpto/cases/micro-op/binary-vector/vmin-f32-exceptional/main.cpp +++ /dev/null @@ -1,94 
+0,0 @@ -// Copyright (c) 2026 Huawei Technologies Co., Ltd. -// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). -// Please refer to the License for details. You may not use this file except in compliance with the License. -// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -// See LICENSE in the root of the software repository for the full text of the License. - -/** -Copyright (c) 2025 Huawei Technologies Co., Ltd. -*/ - -#include "test_common.h" -#include "acl/acl.h" -#include -#include - -using namespace PtoTestCommon; - -#define ACL_CHECK(expr) \ - do { \ - const aclError _ret = (expr); \ - if (_ret != ACL_SUCCESS) { \ - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", #expr, \ - (int)_ret, __FILE__, __LINE__); \ - rc = 1; \ - goto cleanup; \ - } \ - } while (0) - -void LaunchMin_kernel_2d(float *v1, float *v2, float *v3, void *stream); - -int main() { - size_t elemCount_v1 = 1024; - size_t fileSize_v1 = elemCount_v1 * sizeof(float); - size_t elemCount_v2 = 1024; - size_t fileSize_v2 = elemCount_v2 * sizeof(float); - size_t elemCount_v3 = 1024; - size_t fileSize_v3 = elemCount_v3 * sizeof(float); - float *v1Host = nullptr; - float *v1Device = nullptr; - float *v2Host = nullptr; - float *v2Device = nullptr; - float *v3Host = nullptr; - float *v3Device = nullptr; - int rc = 0; - bool aclInited = false; - bool deviceSet = false; - int deviceId = 0; - aclrtStream stream = nullptr; - - ACL_CHECK(aclInit(nullptr)); - aclInited = true; - if (const char *envDevice = std::getenv("ACL_DEVICE_ID")) - deviceId = std::atoi(envDevice); - ACL_CHECK(aclrtSetDevice(deviceId)); - deviceSet = true; - ACL_CHECK(aclrtCreateStream(&stream)); - ACL_CHECK(aclrtMallocHost((void **)(&v1Host), 
fileSize_v1)); - ACL_CHECK(aclrtMallocHost((void **)(&v2Host), fileSize_v2)); - ACL_CHECK(aclrtMallocHost((void **)(&v3Host), fileSize_v3)); - ACL_CHECK(aclrtMalloc((void **)&v1Device, fileSize_v1, ACL_MEM_MALLOC_HUGE_FIRST)); - ACL_CHECK(aclrtMalloc((void **)&v2Device, fileSize_v2, ACL_MEM_MALLOC_HUGE_FIRST)); - ACL_CHECK(aclrtMalloc((void **)&v3Device, fileSize_v3, ACL_MEM_MALLOC_HUGE_FIRST)); - ReadFile("./v1.bin", fileSize_v1, v1Host, fileSize_v1); - ReadFile("./v2.bin", fileSize_v2, v2Host, fileSize_v2); - ReadFile("./v3.bin", fileSize_v3, v3Host, fileSize_v3); - ACL_CHECK(aclrtMemcpy(v1Device, fileSize_v1, v1Host, fileSize_v1, - ACL_MEMCPY_HOST_TO_DEVICE)); - ACL_CHECK(aclrtMemcpy(v2Device, fileSize_v2, v2Host, fileSize_v2, - ACL_MEMCPY_HOST_TO_DEVICE)); - ACL_CHECK(aclrtMemcpy(v3Device, fileSize_v3, v3Host, fileSize_v3, - ACL_MEMCPY_HOST_TO_DEVICE)); - LaunchMin_kernel_2d(v1Device, v2Device, v3Device, stream); - ACL_CHECK(aclrtSynchronizeStream(stream)); - ACL_CHECK(aclrtMemcpy(v3Host, fileSize_v3, v3Device, fileSize_v3, - ACL_MEMCPY_DEVICE_TO_HOST)); - WriteFile("./v3.bin", v3Host, fileSize_v3); - -cleanup: - aclrtFree(v1Device); - aclrtFree(v2Device); - aclrtFree(v3Device); - aclrtFreeHost(v1Host); - aclrtFreeHost(v2Host); - aclrtFreeHost(v3Host); - if (stream != nullptr) - aclrtDestroyStream(stream); - if (deviceSet) - aclrtResetDevice(deviceId); - if (aclInited) - aclFinalize(); - return rc; -} diff --git a/test/vpto/cases/micro-op/binary-vector/vmin-i16-signed/compare.py b/test/vpto/cases/micro-op/binary-vector/vmin-i16-signed/compare.py deleted file mode 100755 index 2afc3f8ec..000000000 --- a/test/vpto/cases/micro-op/binary-vector/vmin-i16-signed/compare.py +++ /dev/null @@ -1,42 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) 2026 Huawei Technologies Co., Ltd. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). 
-# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. - -# case: micro-op/binary-vector/vmin-i16-signed -# family: binary-vector -# target_ops: pto.vmin -# scenarios: core-i16-signed, full-mask -import os -import sys -import numpy as np - - -def compare_bin_prefix(golden_path, output_path, dtype, eps, count): - if not os.path.exists(golden_path) or not os.path.exists(output_path): - return False - golden = np.fromfile(golden_path, dtype=dtype, count=count) - output = np.fromfile(output_path, dtype=dtype, count=count) - return golden.shape == output.shape and np.allclose( - golden, output, atol=eps, rtol=eps, equal_nan=True - ) - - -def main(): - strict = os.getenv("COMPARE_STRICT", "1") != "0" - ok = compare_bin_prefix("golden_v3.bin", "v3.bin", np.int16, 0, 1024) - if not ok: - if strict: - print("[ERROR] compare failed") - sys.exit(2) - print("[WARN] compare failed (non-gating)") - return - print("[INFO] compare passed") - - -if __name__ == "__main__": - main() diff --git a/test/vpto/cases/micro-op/binary-vector/vmin-i16-signed/golden.py b/test/vpto/cases/micro-op/binary-vector/vmin-i16-signed/golden.py deleted file mode 100755 index 48ce71042..000000000 --- a/test/vpto/cases/micro-op/binary-vector/vmin-i16-signed/golden.py +++ /dev/null @@ -1,49 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) 2026 Huawei Technologies Co., Ltd. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. 
-# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. - -# case: micro-op/binary-vector/vmin-i16-signed -# family: binary-vector -# target_ops: pto.vmin -# scenarios: core-i16-signed, full-mask -# coding=utf-8 - -import argparse -from pathlib import Path - -import numpy as np - - -ELEMS = 1024 -SEED = 19 - - -def generate(output_dir: Path, seed: int) -> None: - rng = np.random.default_rng(seed) - v1 = rng.integers(-1000, 1001, size=ELEMS, dtype=np.int16) - v2 = rng.integers(-1000, 1001, size=ELEMS, dtype=np.int16) - v3 = np.zeros(ELEMS, dtype=np.int16) - golden_v3 = np.minimum(v1, v2) - - output_dir.mkdir(parents=True, exist_ok=True) - v1.tofile(output_dir / "v1.bin") - v2.tofile(output_dir / "v2.bin") - v3.tofile(output_dir / "v3.bin") - golden_v3.tofile(output_dir / "golden_v3.bin") - - -def main() -> None: - parser = argparse.ArgumentParser() - parser.add_argument("--output-dir", type=Path, default=Path(".")) - parser.add_argument("--seed", type=int, default=SEED) - args = parser.parse_args() - generate(args.output_dir, args.seed) - - -if __name__ == "__main__": - main() diff --git a/test/vpto/cases/micro-op/binary-vector/vmin-i16-signed/kernel.pto b/test/vpto/cases/micro-op/binary-vector/vmin-i16-signed/kernel.pto deleted file mode 100644 index 3a37e8baa..000000000 --- a/test/vpto/cases/micro-op/binary-vector/vmin-i16-signed/kernel.pto +++ /dev/null @@ -1,53 +0,0 @@ -// ----------------------------------------------------------------------------- -// case: micro-op/binary-vector/vmin-i16-signed -// family: binary-vector -// target_ops: pto.vmin -// scenarios: core-i16-signed, full-mask -// ----------------------------------------------------------------------------- -module attributes {pto.target_arch = "a5", 
pto.kernel_kind = #pto.kernel_kind} { - func.func @vmin_i16_signed_kernel(%arg0: !pto.ptr, %arg1: !pto.ptr, - %arg2: !pto.ptr) attributes {pto.kernel} { - %c0 = arith.constant 0 : index - %c1024 = arith.constant 1024 : index - %c128 = arith.constant 128 : index - %c0_i64 = arith.constant 0 : i64 - %c1_i64 = arith.constant 1 : i64 - %c32_i64 = arith.constant 32 : i64 - %c64_i64 = arith.constant 64 : i64 - %c2048_i64 = arith.constant 2048 : i64 - %c4096_i64 = arith.constant 4096 : i64 - - %ub_lhs = pto.castptr %c0_i64 : i64 -> !pto.ptr - %ub_rhs = pto.castptr %c2048_i64 : i64 -> !pto.ptr - %ub_out = pto.castptr %c4096_i64 : i64 -> !pto.ptr - - %false = arith.constant false - pto.mte_gm_ub %arg0, %ub_lhs, %c0_i64, %c64_i64 - nburst(%c32_i64, %c64_i64, %c64_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 - pto.mte_gm_ub %arg1, %ub_rhs, %c0_i64, %c64_i64 - nburst(%c32_i64, %c64_i64, %c64_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 - - pto.set_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] - pto.wait_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] - - pto.vecscope { - %mask = pto.pset_b16 "PAT_ALL" : !pto.mask - scf.for %offset = %c0 to %c1024 step %c128 { - %lhs = pto.vlds %ub_lhs[%offset] : !pto.ptr -> !pto.vreg<128xi16> - %rhs = pto.vlds %ub_rhs[%offset] : !pto.ptr -> !pto.vreg<128xi16> - %out = pto.vmin %lhs, %rhs, %mask : !pto.vreg<128xi16>, !pto.vreg<128xi16>, !pto.mask -> !pto.vreg<128xi16> - pto.vsts %out, %ub_out[%offset], %mask : !pto.vreg<128xi16>, !pto.ptr, !pto.mask - } - } - - pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.mte_ub_gm %ub_out, %arg2, %c64_i64 - nburst(%c32_i64, %c64_i64, %c64_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64 - pto.barrier #pto.pipe - return - } -} diff --git a/test/vpto/cases/micro-op/binary-vector/vmin-i16-signed/launch.cpp b/test/vpto/cases/micro-op/binary-vector/vmin-i16-signed/launch.cpp deleted file mode 100644 index 923e415d4..000000000 --- 
a/test/vpto/cases/micro-op/binary-vector/vmin-i16-signed/launch.cpp +++ /dev/null @@ -1,53 +0,0 @@ -// Copyright (c) 2026 Huawei Technologies Co., Ltd. -// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). -// Please refer to the License for details. You may not use this file except in compliance with the License. -// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -// See LICENSE in the root of the software repository for the full text of the License. - -// ----------------------------------------------------------------------------- -// case: micro-op/binary-vector/vmin-i16-signed -// family: binary-vector -// target_ops: pto.vmin -// scenarios: core-i16-signed, full-mask -// ----------------------------------------------------------------------------- -#ifndef __VEC_SCOPE__ -#define __VEC_SCOPE__ -#endif - -#if defined(__CCE_AICORE__) && defined(__NPU_ARCH__) && (__NPU_ARCH__ == 2201) -typedef struct { unsigned char v; } hifloat8_t; -typedef struct { unsigned char v; } float8_e4m3_t; -typedef struct { unsigned char v; } float8_e5m2_t; -typedef struct { unsigned char v; } float8_e8m0_t; -typedef struct { unsigned char v; } float4_e1m2x2_t; -typedef struct { unsigned char v; } float4_e2m1x2_t; -#endif -#include -#if defined(__CCE_AICORE__) && defined(PTOAS_ENABLE_CCE_PRINT) -#include -#endif - -#if !defined(__CCE_AICORE__) && !defined(TMRGSORT_HPP) -struct MrgSortExecutedNumList { - uint16_t mrgSortList0; - uint16_t mrgSortList1; - uint16_t mrgSortList2; - uint16_t mrgSortList3; -}; -#endif -#ifndef __CPU_SIM -#include "acl/acl.h" -#endif - -extern "C" __global__ [aicore] void vmin_i16_signed_kernel(__gm__ int16_t *v1, - __gm__ int16_t *v2, - __gm__ int16_t *v3); - -void 
LaunchVmin_i16_signed_kernel(int16_t *v1, int16_t *v2, int16_t *v3, - void *stream) { - vmin_i16_signed_kernel<<<1, nullptr, stream>>>((__gm__ int16_t *)v1, - (__gm__ int16_t *)v2, - (__gm__ int16_t *)v3); -} diff --git a/test/vpto/cases/micro-op/binary-vector/vmin-i16-signed/main.cpp b/test/vpto/cases/micro-op/binary-vector/vmin-i16-signed/main.cpp deleted file mode 100644 index 029455a99..000000000 --- a/test/vpto/cases/micro-op/binary-vector/vmin-i16-signed/main.cpp +++ /dev/null @@ -1,101 +0,0 @@ -// Copyright (c) 2026 Huawei Technologies Co., Ltd. -// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). -// Please refer to the License for details. You may not use this file except in compliance with the License. -// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -// See LICENSE in the root of the software repository for the full text of the License. - -// ----------------------------------------------------------------------------- -// case: micro-op/binary-vector/vmin-i16-signed -// family: binary-vector -// target_ops: pto.vmin -// scenarios: core-i16-signed, full-mask -// ----------------------------------------------------------------------------- -/** -Copyright (c) 2025 Huawei Technologies Co., Ltd. 
-*/ - -#include "test_common.h" -#include "acl/acl.h" -#include -#include - -using namespace PtoTestCommon; - -#define ACL_CHECK(expr) \ - do { \ - const aclError _ret = (expr); \ - if (_ret != ACL_SUCCESS) { \ - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", #expr, \ - (int)_ret, __FILE__, __LINE__); \ - rc = 1; \ - goto cleanup; \ - } \ - } while (0) - -void LaunchVmin_i16_signed_kernel(int16_t *v1, int16_t *v2, int16_t *v3, - void *stream); - -int main() { - size_t elemCount_v1 = 1024; - size_t fileSize_v1 = elemCount_v1 * sizeof(int16_t); - size_t elemCount_v2 = 1024; - size_t fileSize_v2 = elemCount_v2 * sizeof(int16_t); - size_t elemCount_v3 = 1024; - size_t fileSize_v3 = elemCount_v3 * sizeof(int16_t); - int16_t *v1Host = nullptr; - int16_t *v1Device = nullptr; - int16_t *v2Host = nullptr; - int16_t *v2Device = nullptr; - int16_t *v3Host = nullptr; - int16_t *v3Device = nullptr; - int rc = 0; - bool aclInited = false; - bool deviceSet = false; - int deviceId = 0; - aclrtStream stream = nullptr; - - ACL_CHECK(aclInit(nullptr)); - aclInited = true; - if (const char *envDevice = std::getenv("ACL_DEVICE_ID")) - deviceId = std::atoi(envDevice); - ACL_CHECK(aclrtSetDevice(deviceId)); - deviceSet = true; - ACL_CHECK(aclrtCreateStream(&stream)); - ACL_CHECK(aclrtMallocHost((void **)(&v1Host), fileSize_v1)); - ACL_CHECK(aclrtMallocHost((void **)(&v2Host), fileSize_v2)); - ACL_CHECK(aclrtMallocHost((void **)(&v3Host), fileSize_v3)); - ACL_CHECK(aclrtMalloc((void **)&v1Device, fileSize_v1, ACL_MEM_MALLOC_HUGE_FIRST)); - ACL_CHECK(aclrtMalloc((void **)&v2Device, fileSize_v2, ACL_MEM_MALLOC_HUGE_FIRST)); - ACL_CHECK(aclrtMalloc((void **)&v3Device, fileSize_v3, ACL_MEM_MALLOC_HUGE_FIRST)); - ReadFile("./v1.bin", fileSize_v1, v1Host, fileSize_v1); - ReadFile("./v2.bin", fileSize_v2, v2Host, fileSize_v2); - ReadFile("./v3.bin", fileSize_v3, v3Host, fileSize_v3); - ACL_CHECK(aclrtMemcpy(v1Device, fileSize_v1, v1Host, fileSize_v1, - ACL_MEMCPY_HOST_TO_DEVICE)); - 
ACL_CHECK(aclrtMemcpy(v2Device, fileSize_v2, v2Host, fileSize_v2, - ACL_MEMCPY_HOST_TO_DEVICE)); - ACL_CHECK(aclrtMemcpy(v3Device, fileSize_v3, v3Host, fileSize_v3, - ACL_MEMCPY_HOST_TO_DEVICE)); - LaunchVmin_i16_signed_kernel(v1Device, v2Device, v3Device, stream); - ACL_CHECK(aclrtSynchronizeStream(stream)); - ACL_CHECK(aclrtMemcpy(v3Host, fileSize_v3, v3Device, fileSize_v3, - ACL_MEMCPY_DEVICE_TO_HOST)); - WriteFile("./v3.bin", v3Host, fileSize_v3); - -cleanup: - aclrtFree(v1Device); - aclrtFree(v2Device); - aclrtFree(v3Device); - aclrtFreeHost(v1Host); - aclrtFreeHost(v2Host); - aclrtFreeHost(v3Host); - if (stream != nullptr) - aclrtDestroyStream(stream); - if (deviceSet) - aclrtResetDevice(deviceId); - if (aclInited) - aclFinalize(); - return rc; -} diff --git a/test/vpto/cases/micro-op/binary-vector/vmin-i16-unsigned/compare.py b/test/vpto/cases/micro-op/binary-vector/vmin-i16-unsigned/compare.py deleted file mode 100755 index f87d0f17d..000000000 --- a/test/vpto/cases/micro-op/binary-vector/vmin-i16-unsigned/compare.py +++ /dev/null @@ -1,42 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) 2026 Huawei Technologies Co., Ltd. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. 
- -# case: micro-op/binary-vector/vmin-i16-unsigned -# family: binary-vector -# target_ops: pto.vmin -# scenarios: core-i16-unsigned, full-mask -import os -import sys -import numpy as np - - -def compare_bin_prefix(golden_path, output_path, dtype, eps, count): - if not os.path.exists(golden_path) or not os.path.exists(output_path): - return False - golden = np.fromfile(golden_path, dtype=dtype, count=count) - output = np.fromfile(output_path, dtype=dtype, count=count) - return golden.shape == output.shape and np.allclose( - golden, output, atol=eps, rtol=eps, equal_nan=True - ) - - -def main(): - strict = os.getenv("COMPARE_STRICT", "1") != "0" - ok = compare_bin_prefix("golden_v3.bin", "v3.bin", np.uint16, 0, 1024) - if not ok: - if strict: - print("[ERROR] compare failed") - sys.exit(2) - print("[WARN] compare failed (non-gating)") - return - print("[INFO] compare passed") - - -if __name__ == "__main__": - main() diff --git a/test/vpto/cases/micro-op/binary-vector/vmin-i16-unsigned/golden.py b/test/vpto/cases/micro-op/binary-vector/vmin-i16-unsigned/golden.py deleted file mode 100755 index 7ac5b68a6..000000000 --- a/test/vpto/cases/micro-op/binary-vector/vmin-i16-unsigned/golden.py +++ /dev/null @@ -1,49 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) 2026 Huawei Technologies Co., Ltd. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. 
- -# case: micro-op/binary-vector/vmin-i16-unsigned -# family: binary-vector -# target_ops: pto.vmin -# scenarios: core-i16-unsigned, full-mask -# coding=utf-8 - -import argparse -from pathlib import Path - -import numpy as np - - -ELEMS = 1024 -SEED = 19 - - -def generate(output_dir: Path, seed: int) -> None: - rng = np.random.default_rng(seed) - v1 = rng.integers(0, 2001, size=ELEMS, dtype=np.uint16) - v2 = rng.integers(0, 2001, size=ELEMS, dtype=np.uint16) - v3 = np.zeros(ELEMS, dtype=np.uint16) - golden_v3 = np.minimum(v1, v2) - - output_dir.mkdir(parents=True, exist_ok=True) - v1.tofile(output_dir / "v1.bin") - v2.tofile(output_dir / "v2.bin") - v3.tofile(output_dir / "v3.bin") - golden_v3.tofile(output_dir / "golden_v3.bin") - - -def main() -> None: - parser = argparse.ArgumentParser() - parser.add_argument("--output-dir", type=Path, default=Path(".")) - parser.add_argument("--seed", type=int, default=SEED) - args = parser.parse_args() - generate(args.output_dir, args.seed) - - -if __name__ == "__main__": - main() diff --git a/test/vpto/cases/micro-op/binary-vector/vmin-i16-unsigned/kernel.pto b/test/vpto/cases/micro-op/binary-vector/vmin-i16-unsigned/kernel.pto deleted file mode 100644 index 0dce887df..000000000 --- a/test/vpto/cases/micro-op/binary-vector/vmin-i16-unsigned/kernel.pto +++ /dev/null @@ -1,53 +0,0 @@ -// ----------------------------------------------------------------------------- -// case: micro-op/binary-vector/vmin-i16-unsigned -// family: binary-vector -// target_ops: pto.vmin -// scenarios: core-i16-unsigned, full-mask -// ----------------------------------------------------------------------------- -module attributes {pto.target_arch = "a5", pto.kernel_kind = #pto.kernel_kind} { - func.func @vmin_i16_unsigned_kernel(%arg0: !pto.ptr, %arg1: !pto.ptr, - %arg2: !pto.ptr) attributes {pto.kernel} { - %c0 = arith.constant 0 : index - %c1024 = arith.constant 1024 : index - %c128 = arith.constant 128 : index - %c0_i64 = arith.constant 0 : i64 - 
%c1_i64 = arith.constant 1 : i64 - %c32_i64 = arith.constant 32 : i64 - %c64_i64 = arith.constant 64 : i64 - %c2048_i64 = arith.constant 2048 : i64 - %c4096_i64 = arith.constant 4096 : i64 - - %ub_lhs = pto.castptr %c0_i64 : i64 -> !pto.ptr - %ub_rhs = pto.castptr %c2048_i64 : i64 -> !pto.ptr - %ub_out = pto.castptr %c4096_i64 : i64 -> !pto.ptr - - %false = arith.constant false - pto.mte_gm_ub %arg0, %ub_lhs, %c0_i64, %c64_i64 - nburst(%c32_i64, %c64_i64, %c64_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 - pto.mte_gm_ub %arg1, %ub_rhs, %c0_i64, %c64_i64 - nburst(%c32_i64, %c64_i64, %c64_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 - - pto.set_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] - pto.wait_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] - - pto.vecscope { - %mask = pto.pset_b16 "PAT_ALL" : !pto.mask - scf.for %offset = %c0 to %c1024 step %c128 { - %lhs = pto.vlds %ub_lhs[%offset] : !pto.ptr -> !pto.vreg<128xui16> - %rhs = pto.vlds %ub_rhs[%offset] : !pto.ptr -> !pto.vreg<128xui16> - %out = pto.vmin %lhs, %rhs, %mask : !pto.vreg<128xui16>, !pto.vreg<128xui16>, !pto.mask -> !pto.vreg<128xui16> - pto.vsts %out, %ub_out[%offset], %mask : !pto.vreg<128xui16>, !pto.ptr, !pto.mask - } - } - - pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.mte_ub_gm %ub_out, %arg2, %c64_i64 - nburst(%c32_i64, %c64_i64, %c64_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64 - pto.barrier #pto.pipe - return - } -} diff --git a/test/vpto/cases/micro-op/binary-vector/vmin-i16-unsigned/launch.cpp b/test/vpto/cases/micro-op/binary-vector/vmin-i16-unsigned/launch.cpp deleted file mode 100644 index 6cc3d692c..000000000 --- a/test/vpto/cases/micro-op/binary-vector/vmin-i16-unsigned/launch.cpp +++ /dev/null @@ -1,53 +0,0 @@ -// Copyright (c) 2026 Huawei Technologies Co., Ltd. 
-// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). -// Please refer to the License for details. You may not use this file except in compliance with the License. -// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -// See LICENSE in the root of the software repository for the full text of the License. - -// ----------------------------------------------------------------------------- -// case: micro-op/binary-vector/vmin-i16-unsigned -// family: binary-vector -// target_ops: pto.vmin -// scenarios: core-i16-unsigned, full-mask -// ----------------------------------------------------------------------------- -#ifndef __VEC_SCOPE__ -#define __VEC_SCOPE__ -#endif - -#if defined(__CCE_AICORE__) && defined(__NPU_ARCH__) && (__NPU_ARCH__ == 2201) -typedef struct { unsigned char v; } hifloat8_t; -typedef struct { unsigned char v; } float8_e4m3_t; -typedef struct { unsigned char v; } float8_e5m2_t; -typedef struct { unsigned char v; } float8_e8m0_t; -typedef struct { unsigned char v; } float4_e1m2x2_t; -typedef struct { unsigned char v; } float4_e2m1x2_t; -#endif -#include -#if defined(__CCE_AICORE__) && defined(PTOAS_ENABLE_CCE_PRINT) -#include -#endif - -#if !defined(__CCE_AICORE__) && !defined(TMRGSORT_HPP) -struct MrgSortExecutedNumList { - uint16_t mrgSortList0; - uint16_t mrgSortList1; - uint16_t mrgSortList2; - uint16_t mrgSortList3; -}; -#endif -#ifndef __CPU_SIM -#include "acl/acl.h" -#endif - -extern "C" __global__ [aicore] void vmin_i16_unsigned_kernel(__gm__ uint16_t *v1, - __gm__ uint16_t *v2, - __gm__ uint16_t *v3); - -void LaunchVmin_i16_unsigned_kernel(uint16_t *v1, uint16_t *v2, uint16_t *v3, - void *stream) { - vmin_i16_unsigned_kernel<<<1, nullptr, stream>>>((__gm__ 
uint16_t *)v1, - (__gm__ uint16_t *)v2, - (__gm__ uint16_t *)v3); -} diff --git a/test/vpto/cases/micro-op/binary-vector/vmin-i16-unsigned/main.cpp b/test/vpto/cases/micro-op/binary-vector/vmin-i16-unsigned/main.cpp deleted file mode 100644 index 885dea67a..000000000 --- a/test/vpto/cases/micro-op/binary-vector/vmin-i16-unsigned/main.cpp +++ /dev/null @@ -1,101 +0,0 @@ -// Copyright (c) 2026 Huawei Technologies Co., Ltd. -// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). -// Please refer to the License for details. You may not use this file except in compliance with the License. -// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -// See LICENSE in the root of the software repository for the full text of the License. - -// ----------------------------------------------------------------------------- -// case: micro-op/binary-vector/vmin-i16-unsigned -// family: binary-vector -// target_ops: pto.vmin -// scenarios: core-i16-unsigned, full-mask -// ----------------------------------------------------------------------------- -/** -Copyright (c) 2025 Huawei Technologies Co., Ltd. 
-*/ - -#include "test_common.h" -#include "acl/acl.h" -#include -#include - -using namespace PtoTestCommon; - -#define ACL_CHECK(expr) \ - do { \ - const aclError _ret = (expr); \ - if (_ret != ACL_SUCCESS) { \ - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", #expr, \ - (int)_ret, __FILE__, __LINE__); \ - rc = 1; \ - goto cleanup; \ - } \ - } while (0) - -void LaunchVmin_i16_unsigned_kernel(uint16_t *v1, uint16_t *v2, uint16_t *v3, - void *stream); - -int main() { - size_t elemCount_v1 = 1024; - size_t fileSize_v1 = elemCount_v1 * sizeof(uint16_t); - size_t elemCount_v2 = 1024; - size_t fileSize_v2 = elemCount_v2 * sizeof(uint16_t); - size_t elemCount_v3 = 1024; - size_t fileSize_v3 = elemCount_v3 * sizeof(uint16_t); - uint16_t *v1Host = nullptr; - uint16_t *v1Device = nullptr; - uint16_t *v2Host = nullptr; - uint16_t *v2Device = nullptr; - uint16_t *v3Host = nullptr; - uint16_t *v3Device = nullptr; - int rc = 0; - bool aclInited = false; - bool deviceSet = false; - int deviceId = 0; - aclrtStream stream = nullptr; - - ACL_CHECK(aclInit(nullptr)); - aclInited = true; - if (const char *envDevice = std::getenv("ACL_DEVICE_ID")) - deviceId = std::atoi(envDevice); - ACL_CHECK(aclrtSetDevice(deviceId)); - deviceSet = true; - ACL_CHECK(aclrtCreateStream(&stream)); - ACL_CHECK(aclrtMallocHost((void **)(&v1Host), fileSize_v1)); - ACL_CHECK(aclrtMallocHost((void **)(&v2Host), fileSize_v2)); - ACL_CHECK(aclrtMallocHost((void **)(&v3Host), fileSize_v3)); - ACL_CHECK(aclrtMalloc((void **)&v1Device, fileSize_v1, ACL_MEM_MALLOC_HUGE_FIRST)); - ACL_CHECK(aclrtMalloc((void **)&v2Device, fileSize_v2, ACL_MEM_MALLOC_HUGE_FIRST)); - ACL_CHECK(aclrtMalloc((void **)&v3Device, fileSize_v3, ACL_MEM_MALLOC_HUGE_FIRST)); - ReadFile("./v1.bin", fileSize_v1, v1Host, fileSize_v1); - ReadFile("./v2.bin", fileSize_v2, v2Host, fileSize_v2); - ReadFile("./v3.bin", fileSize_v3, v3Host, fileSize_v3); - ACL_CHECK(aclrtMemcpy(v1Device, fileSize_v1, v1Host, fileSize_v1, - 
ACL_MEMCPY_HOST_TO_DEVICE)); - ACL_CHECK(aclrtMemcpy(v2Device, fileSize_v2, v2Host, fileSize_v2, - ACL_MEMCPY_HOST_TO_DEVICE)); - ACL_CHECK(aclrtMemcpy(v3Device, fileSize_v3, v3Host, fileSize_v3, - ACL_MEMCPY_HOST_TO_DEVICE)); - LaunchVmin_i16_unsigned_kernel(v1Device, v2Device, v3Device, stream); - ACL_CHECK(aclrtSynchronizeStream(stream)); - ACL_CHECK(aclrtMemcpy(v3Host, fileSize_v3, v3Device, fileSize_v3, - ACL_MEMCPY_DEVICE_TO_HOST)); - WriteFile("./v3.bin", v3Host, fileSize_v3); - -cleanup: - aclrtFree(v1Device); - aclrtFree(v2Device); - aclrtFree(v3Device); - aclrtFreeHost(v1Host); - aclrtFreeHost(v2Host); - aclrtFreeHost(v3Host); - if (stream != nullptr) - aclrtDestroyStream(stream); - if (deviceSet) - aclrtResetDevice(deviceId); - if (aclInited) - aclFinalize(); - return rc; -} diff --git a/test/vpto/cases/micro-op/binary-vector/vmin-tail/compare.py b/test/vpto/cases/micro-op/binary-vector/vmin-tail/compare.py deleted file mode 100644 index c95419953..000000000 --- a/test/vpto/cases/micro-op/binary-vector/vmin-tail/compare.py +++ /dev/null @@ -1,39 +0,0 @@ -#!/usr/bin/python3 -# Copyright (c) 2026 Huawei Technologies Co., Ltd. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. 
- - -import os -import sys -import numpy as np - - -def compare_bin_prefix(golden_path, output_path, dtype, eps, count): - if not os.path.exists(golden_path) or not os.path.exists(output_path): - return False - golden = np.fromfile(golden_path, dtype=dtype, count=count) - output = np.fromfile(output_path, dtype=dtype, count=count) - return golden.shape == output.shape and np.allclose( - golden, output, atol=eps, rtol=eps, equal_nan=True - ) - - -def main(): - strict = os.getenv("COMPARE_STRICT", "1") != "0" - ok = compare_bin_prefix("golden_v3.bin", "v3.bin", np.float32, 1e-4, 1000) - if not ok: - if strict: - print("[ERROR] compare failed") - sys.exit(2) - print("[WARN] compare failed (non-gating)") - return - print("[INFO] compare passed") - - -if __name__ == "__main__": - main() diff --git a/test/vpto/cases/micro-op/binary-vector/vmin-tail/golden.py b/test/vpto/cases/micro-op/binary-vector/vmin-tail/golden.py deleted file mode 100644 index 29bbdcd28..000000000 --- a/test/vpto/cases/micro-op/binary-vector/vmin-tail/golden.py +++ /dev/null @@ -1,51 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) 2026 Huawei Technologies Co., Ltd. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. 
- -# coding=utf-8 - -import argparse -from pathlib import Path - -import numpy as np - - -ROWS = 32 -COLS = 32 -SEED = 19 -LOGICAL_ELEMS = 1000 -OUT_SENTINEL = np.float32(-123.25) - - -def generate(output_dir: Path, seed: int) -> None: - rng = np.random.default_rng(seed) - v1 = rng.uniform(-8.0, 8.0, size=(ROWS, COLS)).astype(np.float32) - v2 = rng.uniform(-8.0, 8.0, size=(ROWS, COLS)).astype(np.float32) - v3 = np.full((ROWS, COLS), OUT_SENTINEL, dtype=np.float32) - golden_v3 = np.full((ROWS, COLS), OUT_SENTINEL, dtype=np.float32) - golden_v3.reshape(-1)[:LOGICAL_ELEMS] = np.minimum( - v1.reshape(-1)[:LOGICAL_ELEMS], v2.reshape(-1)[:LOGICAL_ELEMS] - ).astype(np.float32, copy=False) - - output_dir.mkdir(parents=True, exist_ok=True) - v1.reshape(-1).tofile(output_dir / "v1.bin") - v2.reshape(-1).tofile(output_dir / "v2.bin") - v3.reshape(-1).tofile(output_dir / "v3.bin") - golden_v3.reshape(-1).tofile(output_dir / "golden_v3.bin") - - -def main() -> None: - parser = argparse.ArgumentParser() - parser.add_argument("--output-dir", type=Path, default=Path(".")) - parser.add_argument("--seed", type=int, default=SEED) - args = parser.parse_args() - generate(args.output_dir, args.seed) - - -if __name__ == "__main__": - main() diff --git a/test/vpto/cases/micro-op/binary-vector/vmin-tail/kernel.pto b/test/vpto/cases/micro-op/binary-vector/vmin-tail/kernel.pto deleted file mode 100644 index e73d73821..000000000 --- a/test/vpto/cases/micro-op/binary-vector/vmin-tail/kernel.pto +++ /dev/null @@ -1,49 +0,0 @@ -module attributes {pto.target_arch = "a5", pto.kernel_kind = #pto.kernel_kind} { - func.func @vmin_tail_kernel_2d(%arg0: !pto.ptr, %arg1: !pto.ptr, - %arg2: !pto.ptr) attributes {pto.kernel} { - %c0 = arith.constant 0 : index - %c64 = arith.constant 64 : index - %c1024 = arith.constant 1024 : index - %c0_i64 = arith.constant 0 : i64 - %c1_i64 = arith.constant 1 : i64 - %c32_i64 = arith.constant 32 : i64 - %c128_i64 = arith.constant 128 : i64 - %c4096_i64 = arith.constant 
4096 : i64 - %c8192_i64 = arith.constant 8192 : i64 - %c1000_i32 = arith.constant 1000 : i32 - - %ub_lhs = pto.castptr %c0_i64 : i64 -> !pto.ptr - %ub_rhs = pto.castptr %c4096_i64 : i64 -> !pto.ptr - %ub_out = pto.castptr %c8192_i64 : i64 -> !pto.ptr - - %false = arith.constant false - pto.mte_gm_ub %arg0, %ub_lhs, %c0_i64, %c128_i64 - nburst(%c32_i64, %c128_i64, %c128_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 - pto.mte_gm_ub %arg1, %ub_rhs, %c0_i64, %c128_i64 - nburst(%c32_i64, %c128_i64, %c128_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 - - pto.set_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] - pto.wait_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] - - pto.vecscope { - %_:1 = scf.for %offset = %c0 to %c1024 step %c64 iter_args(%remaining = %c1000_i32) -> (i32) { - %mask, %next_remaining = pto.plt_b32 %remaining : i32 -> !pto.mask, i32 - %lhs = pto.vlds %ub_lhs[%offset] : !pto.ptr -> !pto.vreg<64xf32> - %rhs = pto.vlds %ub_rhs[%offset] : !pto.ptr -> !pto.vreg<64xf32> - %out = pto.vmin %lhs, %rhs, %mask : !pto.vreg<64xf32>, !pto.vreg<64xf32>, !pto.mask -> !pto.vreg<64xf32> - pto.vsts %out, %ub_out[%offset], %mask : !pto.vreg<64xf32>, !pto.ptr, !pto.mask - scf.yield %next_remaining : i32 - } - } - - pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.mte_ub_gm %ub_out, %arg2, %c128_i64 - nburst(%c32_i64, %c128_i64, %c128_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64 - pto.barrier #pto.pipe - return - } -} diff --git a/test/vpto/cases/micro-op/binary-vector/vmin-tail/launch.cpp b/test/vpto/cases/micro-op/binary-vector/vmin-tail/launch.cpp deleted file mode 100644 index f2a890c47..000000000 --- a/test/vpto/cases/micro-op/binary-vector/vmin-tail/launch.cpp +++ /dev/null @@ -1,46 +0,0 @@ -// Copyright (c) 2026 Huawei Technologies Co., Ltd. 
-// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). -// Please refer to the License for details. You may not use this file except in compliance with the License. -// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -// See LICENSE in the root of the software repository for the full text of the License. - -#ifndef __VEC_SCOPE__ -#define __VEC_SCOPE__ -#endif - -#if defined(__CCE_AICORE__) && defined(__NPU_ARCH__) && (__NPU_ARCH__ == 2201) -typedef struct { unsigned char v; } hifloat8_t; -typedef struct { unsigned char v; } float8_e4m3_t; -typedef struct { unsigned char v; } float8_e5m2_t; -typedef struct { unsigned char v; } float8_e8m0_t; -typedef struct { unsigned char v; } float4_e1m2x2_t; -typedef struct { unsigned char v; } float4_e2m1x2_t; -#endif -#include -#if defined(__CCE_AICORE__) && defined(PTOAS_ENABLE_CCE_PRINT) -#include -#endif - -#if !defined(__CCE_AICORE__) && !defined(TMRGSORT_HPP) -struct MrgSortExecutedNumList { - uint16_t mrgSortList0; - uint16_t mrgSortList1; - uint16_t mrgSortList2; - uint16_t mrgSortList3; -}; -#endif -#ifndef __CPU_SIM -#include "acl/acl.h" -#endif - -extern "C" __global__ [aicore] void vmin_tail_kernel_2d(__gm__ float *v1, - __gm__ float *v2, - __gm__ float *v3); - -void LaunchVmin_tail_kernel_2d(float *v1, float *v2, float *v3, void *stream) { - vmin_tail_kernel_2d<<<1, nullptr, stream>>>((__gm__ float *)v1, - (__gm__ float *)v2, - (__gm__ float *)v3); -} diff --git a/test/vpto/cases/micro-op/binary-vector/vmin-tail/main.cpp b/test/vpto/cases/micro-op/binary-vector/vmin-tail/main.cpp deleted file mode 100644 index 5a418b3da..000000000 --- a/test/vpto/cases/micro-op/binary-vector/vmin-tail/main.cpp +++ /dev/null @@ -1,94 +0,0 @@ -// 
Copyright (c) 2026 Huawei Technologies Co., Ltd. -// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). -// Please refer to the License for details. You may not use this file except in compliance with the License. -// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -// See LICENSE in the root of the software repository for the full text of the License. - -/** -Copyright (c) 2025 Huawei Technologies Co., Ltd. -*/ - -#include "test_common.h" -#include "acl/acl.h" -#include -#include - -using namespace PtoTestCommon; - -#define ACL_CHECK(expr) \ - do { \ - const aclError _ret = (expr); \ - if (_ret != ACL_SUCCESS) { \ - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", #expr, \ - (int)_ret, __FILE__, __LINE__); \ - rc = 1; \ - goto cleanup; \ - } \ - } while (0) - -void LaunchVmin_tail_kernel_2d(float *v1, float *v2, float *v3, void *stream); - -int main() { - size_t elemCount_v1 = 1024; - size_t fileSize_v1 = elemCount_v1 * sizeof(float); - size_t elemCount_v2 = 1024; - size_t fileSize_v2 = elemCount_v2 * sizeof(float); - size_t elemCount_v3 = 1024; - size_t fileSize_v3 = elemCount_v3 * sizeof(float); - float *v1Host = nullptr; - float *v1Device = nullptr; - float *v2Host = nullptr; - float *v2Device = nullptr; - float *v3Host = nullptr; - float *v3Device = nullptr; - int rc = 0; - bool aclInited = false; - bool deviceSet = false; - int deviceId = 0; - aclrtStream stream = nullptr; - - ACL_CHECK(aclInit(nullptr)); - aclInited = true; - if (const char *envDevice = std::getenv("ACL_DEVICE_ID")) - deviceId = std::atoi(envDevice); - ACL_CHECK(aclrtSetDevice(deviceId)); - deviceSet = true; - ACL_CHECK(aclrtCreateStream(&stream)); - ACL_CHECK(aclrtMallocHost((void **)(&v1Host), 
fileSize_v1)); - ACL_CHECK(aclrtMallocHost((void **)(&v2Host), fileSize_v2)); - ACL_CHECK(aclrtMallocHost((void **)(&v3Host), fileSize_v3)); - ACL_CHECK(aclrtMalloc((void **)&v1Device, fileSize_v1, ACL_MEM_MALLOC_HUGE_FIRST)); - ACL_CHECK(aclrtMalloc((void **)&v2Device, fileSize_v2, ACL_MEM_MALLOC_HUGE_FIRST)); - ACL_CHECK(aclrtMalloc((void **)&v3Device, fileSize_v3, ACL_MEM_MALLOC_HUGE_FIRST)); - ReadFile("./v1.bin", fileSize_v1, v1Host, fileSize_v1); - ReadFile("./v2.bin", fileSize_v2, v2Host, fileSize_v2); - ReadFile("./v3.bin", fileSize_v3, v3Host, fileSize_v3); - ACL_CHECK(aclrtMemcpy(v1Device, fileSize_v1, v1Host, fileSize_v1, - ACL_MEMCPY_HOST_TO_DEVICE)); - ACL_CHECK(aclrtMemcpy(v2Device, fileSize_v2, v2Host, fileSize_v2, - ACL_MEMCPY_HOST_TO_DEVICE)); - ACL_CHECK(aclrtMemcpy(v3Device, fileSize_v3, v3Host, fileSize_v3, - ACL_MEMCPY_HOST_TO_DEVICE)); - LaunchVmin_tail_kernel_2d(v1Device, v2Device, v3Device, stream); - ACL_CHECK(aclrtSynchronizeStream(stream)); - ACL_CHECK(aclrtMemcpy(v3Host, fileSize_v3, v3Device, fileSize_v3, - ACL_MEMCPY_DEVICE_TO_HOST)); - WriteFile("./v3.bin", v3Host, fileSize_v3); - -cleanup: - aclrtFree(v1Device); - aclrtFree(v2Device); - aclrtFree(v3Device); - aclrtFreeHost(v1Host); - aclrtFreeHost(v2Host); - aclrtFreeHost(v3Host); - if (stream != nullptr) - aclrtDestroyStream(stream); - if (deviceSet) - aclrtResetDevice(deviceId); - if (aclInited) - aclFinalize(); - return rc; -} diff --git a/test/vpto/cases/micro-op/binary-vector/vmin/compare.py b/test/vpto/cases/micro-op/binary-vector/vmin/compare.py index a5f14dabc..491cdcd9a 100644 --- a/test/vpto/cases/micro-op/binary-vector/vmin/compare.py +++ b/test/vpto/cases/micro-op/binary-vector/vmin/compare.py @@ -7,31 +7,70 @@ # INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. # See LICENSE in the root of the software repository for the full text of the License. - -import os -import sys +# Merged vmin compare. 
+import os, sys import numpy as np - -def compare_bin(golden_path, output_path, dtype, eps): - if not os.path.exists(golden_path) or not os.path.exists(output_path): +def _cmp(golden, output, dtype, eps, count=-1): + if not os.path.exists(golden) or not os.path.exists(output): return False - golden = np.fromfile(golden_path, dtype=dtype) - output = np.fromfile(output_path, dtype=dtype) - return golden.shape == output.shape and np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True) + kw = {} if count < 0 else {"count": count} + g = np.fromfile(golden, dtype=dtype, **kw) + o = np.fromfile(output, dtype=dtype, **kw) + return g.shape == o.shape and np.allclose(g, o, atol=eps, rtol=eps, equal_nan=True) +def _cmpeq(golden, output, dtype): + if not os.path.exists(golden) or not os.path.exists(output): + return False + g = np.fromfile(golden, dtype=dtype) + o = np.fromfile(output, dtype=dtype) + return g.shape == o.shape and np.array_equal(g, o) def main(): strict = os.getenv("COMPARE_STRICT", "1") != "0" - ok = compare_bin("golden_v3.bin", "v3.bin", np.float32, 1e-4) - if not ok: + failed = [] + if not (_cmp("golden_v3.bin","v3.bin",np.float32,1e-4,-1)): + failed.append('f32') + print('[ERROR] compare failed: f32') + else: + print('[INFO] f32: passed') + if not (_cmp("golden_v3_f16.bin","v3_f16.bin",np.float16,5e-3,1024)): + failed.append('f16') + print('[ERROR] compare failed: f16') + else: + print('[INFO] f16: passed') + if not (_cmp("golden_v3_bf16.bin","v3_bf16.bin",np.uint16,0,1024)): + failed.append('bf16') + print('[ERROR] compare failed: bf16') + else: + print('[INFO] bf16: passed') + if not (_cmp("golden_v3_f32_exceptional.bin","v3_f32_exceptional.bin",np.float32,1e-4,-1)): + failed.append('f32_exceptional') + print('[ERROR] compare failed: f32_exceptional') + else: + print('[INFO] f32_exceptional: passed') + if not (_cmp("golden_v3_i16_signed.bin","v3_i16_signed.bin",np.int16,0,1024)): + failed.append('i16_signed') + print('[ERROR] compare failed: 
i16_signed') + else: + print('[INFO] i16_signed: passed') + if not (_cmpeq("golden_v3_i16_unsigned.bin","v3_i16_unsigned.bin",np.uint16)): + failed.append('i16_unsigned') + print('[ERROR] compare failed: i16_unsigned') + else: + print('[INFO] i16_unsigned: passed') + if not (_cmp("golden_v3_tail.bin","v3_tail.bin",np.float32,1e-4,1000)): + failed.append('tail') + print('[ERROR] compare failed: tail') + else: + print('[INFO] tail: passed') + if failed: if strict: - print("[ERROR] compare failed") + print(f"[ERROR] {len(failed)} variant(s) failed: {','.join(failed)}") sys.exit(2) - print("[WARN] compare failed (non-gating)") + print(f"[WARN] {len(failed)} variant(s) failed (non-gating): {','.join(failed)}") return - print("[INFO] compare passed") - + print("[INFO] compare passed (all 7 variants)") if __name__ == "__main__": main() diff --git a/test/vpto/cases/micro-op/binary-vector/vmin/golden.py b/test/vpto/cases/micro-op/binary-vector/vmin/golden.py index 6d18ab792..b1228cb1f 100644 --- a/test/vpto/cases/micro-op/binary-vector/vmin/golden.py +++ b/test/vpto/cases/micro-op/binary-vector/vmin/golden.py @@ -7,40 +7,121 @@ # INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. # See LICENSE in the root of the software repository for the full text of the License. -# coding=utf-8 - +# Merged vmin golden data generator. 
import argparse from pathlib import Path - import numpy as np - ROWS = 32 COLS = 32 SEED = 19 +LOGICAL_ELEMS = 1000 +OUT_SENTINEL = np.float32(-123.25) +def f32_to_bf16_bits(values): + wide = values.astype(np.float32, copy=False).view(np.uint32) + rounding = np.uint32(0x7FFF) + ((wide >> 16) & np.uint32(1)) + return ((wide + rounding) >> 16).astype(np.uint16) -def generate(output_dir: Path, seed: int) -> None: - rng = np.random.default_rng(seed) - v1 = rng.random((ROWS, COLS), dtype=np.float32) - v2 = rng.random((ROWS, COLS), dtype=np.float32) - golden_v3 = np.minimum(v1, v2) +def bf16_bits_to_f32(bits): + return (bits.astype(np.uint32) << 16).view(np.float32) + +def gen_f32(out, rng): + v1 = rng.uniform(-8.0, 8.0, size=(ROWS, COLS)).astype(np.float32) + v2 = rng.uniform(-8.0, 8.0, size=(ROWS, COLS)).astype(np.float32) + g = (np.minimum(v1, v2)).astype(np.float32, copy=False) v3 = np.zeros((ROWS, COLS), dtype=np.float32) + v1.reshape(-1).tofile(out / "v1.bin") + v2.reshape(-1).tofile(out / "v2.bin") + v3.reshape(-1).tofile(out / "v3.bin") + g.reshape(-1).tofile(out / "golden_v3.bin") + +def gen_f16(out, rng): + v1 = rng.uniform(-8.0, 8.0, size=(ROWS, COLS)).astype(np.float16) + v2 = rng.uniform(-8.0, 8.0, size=(ROWS, COLS)).astype(np.float16) + g = (np.minimum(v1.astype(np.float32), v2.astype(np.float32))).astype(np.float16) + v3 = np.zeros((ROWS, COLS), dtype=np.float16) + v1.reshape(-1).tofile(out / "v1_f16.bin") + v2.reshape(-1).tofile(out / "v2_f16.bin") + v3.reshape(-1).tofile(out / "v3_f16.bin") + g.reshape(-1).tofile(out / "golden_v3_f16.bin") - output_dir.mkdir(parents=True, exist_ok=True) - v1.reshape(-1).tofile(output_dir / "v1.bin") - v2.reshape(-1).tofile(output_dir / "v2.bin") - v3.reshape(-1).tofile(output_dir / "v3.bin") - golden_v3.astype(np.float32, copy=False).reshape(-1).tofile(output_dir / "golden_v3.bin") +def gen_bf16(out, rng): + elems = ROWS * COLS + v1_f32 = rng.uniform(-4.0, 4.0, size=elems).astype(np.float32) + v2_f32 = rng.uniform(-4.0, 
4.0, size=elems).astype(np.float32) + v1 = f32_to_bf16_bits(v1_f32) + v2 = f32_to_bf16_bits(v2_f32) + g = (f32_to_bf16_bits(np.minimum(bf16_bits_to_f32(v1), bf16_bits_to_f32(v2)))) + v3 = np.zeros(elems, dtype=np.uint16) + v1.tofile(out / "v1_bf16.bin") + v2.tofile(out / "v2_bf16.bin") + v3.tofile(out / "v3_bf16.bin") + g.tofile(out / "golden_v3_bf16.bin") +def gen_f32_exceptional(out, rng): + specials_a = np.array([-np.inf, -7.5, -0.0, 0.0, 1.0, np.inf, np.nan, 3.5], dtype=np.float32) + specials_b = np.array([np.inf, 2.5, 0.0, -0.0, -1.0, -np.inf, 1.0, np.nan], dtype=np.float32) + v1 = np.resize(specials_a, ROWS * COLS).reshape(ROWS, COLS).astype(np.float32) + v2 = np.resize(specials_b, ROWS * COLS).reshape(ROWS, COLS).astype(np.float32) + g = (np.minimum(v1, v2)).astype(np.float32, copy=False) + v3 = np.zeros((ROWS, COLS), dtype=np.float32) + v1.reshape(-1).tofile(out / "v1_f32_exceptional.bin") + v2.reshape(-1).tofile(out / "v2_f32_exceptional.bin") + v3.reshape(-1).tofile(out / "v3_f32_exceptional.bin") + g.reshape(-1).tofile(out / "golden_v3_f32_exceptional.bin") + +def gen_i16_signed(out, rng): + v1 = rng.integers(-1000, 1001, size=(ROWS, COLS), dtype=np.int16) + v2 = rng.integers(-1000, 1001, size=(ROWS, COLS), dtype=np.int16) + g = (np.minimum(v1.astype(np.int32), v2.astype(np.int32))).astype(np.int16) + v3 = np.zeros((ROWS, COLS), dtype=np.int16) + v1.reshape(-1).tofile(out / "v1_i16_signed.bin") + v2.reshape(-1).tofile(out / "v2_i16_signed.bin") + v3.reshape(-1).tofile(out / "v3_i16_signed.bin") + g.reshape(-1).tofile(out / "golden_v3_i16_signed.bin") + +def gen_i16_unsigned(out, rng): + v1 = rng.integers(0, 2001, size=(ROWS, COLS), dtype=np.uint16) + v2 = rng.integers(0, 2001, size=(ROWS, COLS), dtype=np.uint16) + g = (np.minimum(v1.astype(np.uint32), v2.astype(np.uint32))).astype(np.uint16) + v3 = np.zeros((ROWS, COLS), dtype=np.uint16) + v1.reshape(-1).tofile(out / "v1_i16_unsigned.bin") + v2.reshape(-1).tofile(out / "v2_i16_unsigned.bin") + 
v3.reshape(-1).tofile(out / "v3_i16_unsigned.bin") + g.reshape(-1).tofile(out / "golden_v3_i16_unsigned.bin") + +def gen_tail(out, rng): + v1 = rng.random((ROWS, COLS), dtype=np.float32) + v2 = rng.random((ROWS, COLS), dtype=np.float32) + v3 = np.full((ROWS, COLS), OUT_SENTINEL, dtype=np.float32) + g = np.full((ROWS, COLS), OUT_SENTINEL, dtype=np.float32) + g.reshape(-1)[:LOGICAL_ELEMS] = (np.minimum(v1.reshape(-1)[:LOGICAL_ELEMS], v2.reshape(-1)[:LOGICAL_ELEMS])).astype(np.float32, copy=False) + v1.reshape(-1).tofile(out / "v1_tail.bin") + v2.reshape(-1).tofile(out / "v2_tail.bin") + v3.reshape(-1).tofile(out / "v3_tail.bin") + g.reshape(-1).tofile(out / "golden_v3_tail.bin") -def main() -> None: - parser = argparse.ArgumentParser() - parser.add_argument("--output-dir", type=Path, default=Path(".")) - parser.add_argument("--seed", type=int, default=SEED) - args = parser.parse_args() - generate(args.output_dir, args.seed) +GENERATORS = [ + gen_f32, + gen_f16, + gen_bf16, + gen_f32_exceptional, + gen_i16_signed, + gen_i16_unsigned, + gen_tail, +] +def main(): + p = argparse.ArgumentParser() + p.add_argument("--output-dir", type=Path, default=Path(".")) + p.add_argument("--seed", type=int, default=SEED) + a = p.parse_args() + rng = np.random.default_rng(a.seed) + out = a.output_dir + out.mkdir(parents=True, exist_ok=True) + for gen in GENERATORS: + gen(out, rng) if __name__ == "__main__": main() diff --git a/test/vpto/cases/micro-op/binary-vector/vmin/kernel.pto b/test/vpto/cases/micro-op/binary-vector/vmin/kernel.pto index 4d1789609..a216cfa92 100644 --- a/test/vpto/cases/micro-op/binary-vector/vmin/kernel.pto +++ b/test/vpto/cases/micro-op/binary-vector/vmin/kernel.pto @@ -1,50 +1,321 @@ module attributes {pto.target_arch = "a5", pto.kernel_kind = #pto.kernel_kind} { - func.func @min_kernel_2d(%arg0: !pto.ptr, %arg1: !pto.ptr, - %arg2: !pto.ptr) attributes {pto.kernel} { - %c0 = arith.constant 0 : index - %c1 = arith.constant 1 : index - %c64 = arith.constant 64 : 
index - %c1024 = arith.constant 1024 : index - %c0_i64 = arith.constant 0 : i64 - %c1_i64 = arith.constant 1 : i64 - %c32_i64 = arith.constant 32 : i64 - %c128_i64 = arith.constant 128 : i64 - %c4096_i64 = arith.constant 4096 : i64 - %c8192_i64 = arith.constant 8192 : i64 - %c1024_i32 = arith.constant 1024 : i32 - - %ub_lhs = pto.castptr %c0_i64 : i64 -> !pto.ptr - %ub_rhs = pto.castptr %c4096_i64 : i64 -> !pto.ptr - %ub_out = pto.castptr %c8192_i64 : i64 -> !pto.ptr - - %false = arith.constant false - pto.mte_gm_ub %arg0, %ub_lhs, %c0_i64, %c128_i64 - nburst(%c32_i64, %c128_i64, %c128_i64) + func.func @vmin_deep_merged_kernel(%arg0: !pto.ptr, %arg1: !pto.ptr, %arg2: !pto.ptr, %arg3: !pto.ptr, %arg4: !pto.ptr, %arg5: !pto.ptr, %arg6: !pto.ptr, %arg7: !pto.ptr, %arg8: !pto.ptr, %arg9: !pto.ptr, %arg10: !pto.ptr, %arg11: !pto.ptr, %arg12: !pto.ptr, %arg13: !pto.ptr, %arg14: !pto.ptr, %arg15: !pto.ptr, %arg16: !pto.ptr, %arg17: !pto.ptr, %arg18: !pto.ptr, %arg19: !pto.ptr, %arg20: !pto.ptr) attributes {pto.kernel} { + // merged from min_kernel_2d via LaunchMin_kernel_2d + + %c0_m0 = arith.constant 0 : index + %c1_m0 = arith.constant 1 : index + %c64_m0 = arith.constant 64 : index + %c1024_m0 = arith.constant 1024 : index + %c0_i64_m0 = arith.constant 0 : i64 + %c1_i64_m0 = arith.constant 1 : i64 + %c32_i64_m0 = arith.constant 32 : i64 + %c128_i64_m0 = arith.constant 128 : i64 + %c4096_i64_m0 = arith.constant 4096 : i64 + %c8192_i64_m0 = arith.constant 8192 : i64 + %c1024_i32_m0 = arith.constant 1024 : i32 + + %ub_lhs_m0 = pto.castptr %c0_i64_m0 : i64 -> !pto.ptr + %ub_rhs_m0 = pto.castptr %c4096_i64_m0 : i64 -> !pto.ptr + %ub_out_m0 = pto.castptr %c8192_i64_m0 : i64 -> !pto.ptr + + %false_m0 = arith.constant false + pto.mte_gm_ub %arg0, %ub_lhs_m0, %c0_i64_m0, %c128_i64_m0 + nburst(%c32_i64_m0, %c128_i64_m0, %c128_i64_m0) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 + pto.mte_gm_ub %arg1, %ub_rhs_m0, %c0_i64_m0, %c128_i64_m0 + nburst(%c32_i64_m0, %c128_i64_m0, 
%c128_i64_m0) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 + + pto.set_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] + pto.wait_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] + + pto.vecscope { + %__m0:1 = scf.for %offset_m0 = %c0_m0 to %c1024_m0 step %c64_m0 iter_args(%remaining_m0 = %c1024_i32_m0) -> (i32) { + %mask_m0, %next_remaining_m0 = pto.plt_b32 %remaining_m0 : i32 -> !pto.mask, i32 + %lhs_m0 = pto.vlds %ub_lhs_m0[%offset_m0] : !pto.ptr -> !pto.vreg<64xf32> + %rhs_m0 = pto.vlds %ub_rhs_m0[%offset_m0] : !pto.ptr -> !pto.vreg<64xf32> + %minv_m0 = pto.vmin %lhs_m0, %rhs_m0, %mask_m0 : !pto.vreg<64xf32>, !pto.vreg<64xf32>, !pto.mask -> !pto.vreg<64xf32> + pto.vsts %minv_m0, %ub_out_m0[%offset_m0], %mask_m0 : !pto.vreg<64xf32>, !pto.ptr, !pto.mask + scf.yield %next_remaining_m0 : i32 + } + } + + pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] + pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] + pto.mte_ub_gm %ub_out_m0, %arg2, %c128_i64_m0 + nburst(%c32_i64_m0, %c128_i64_m0, %c128_i64_m0) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64 + pto.barrier #pto.pipe + + // merged from min_kernel_2d via LaunchMin_kernel_2d + + %c0_m1 = arith.constant 0 : index + %c1_m1 = arith.constant 1 : index + %c64_m1 = arith.constant 64 : index + %c1024_m1 = arith.constant 1024 : index + %c0_i64_m1 = arith.constant 0 : i64 + %c1_i64_m1 = arith.constant 1 : i64 + %c32_i64_m1 = arith.constant 32 : i64 + %c128_i64_m1 = arith.constant 128 : i64 + %c4096_i64_m1 = arith.constant 4096 : i64 + %c8192_i64_m1 = arith.constant 8192 : i64 + %c1024_i32_m1 = arith.constant 1024 : i32 + + %ub_lhs_m1 = pto.castptr %c0_i64_m1 : i64 -> !pto.ptr + %ub_rhs_m1 = pto.castptr %c4096_i64_m1 : i64 -> !pto.ptr + %ub_out_m1 = pto.castptr %c8192_i64_m1 : i64 -> !pto.ptr + + %false_m1 = arith.constant false + pto.mte_gm_ub %arg3, %ub_lhs_m1, %c0_i64_m1, %c128_i64_m1 + nburst(%c32_i64_m1, %c128_i64_m1, %c128_i64_m1) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 + pto.mte_gm_ub %arg4, %ub_rhs_m1, %c0_i64_m1, 
%c128_i64_m1 + nburst(%c32_i64_m1, %c128_i64_m1, %c128_i64_m1) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 + + pto.set_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] + pto.wait_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] + + pto.vecscope { + %__m1:1 = scf.for %offset_m1 = %c0_m1 to %c1024_m1 step %c64_m1 iter_args(%remaining_m1 = %c1024_i32_m1) -> (i32) { + %mask_m1, %next_remaining_m1 = pto.plt_b32 %remaining_m1 : i32 -> !pto.mask, i32 + %lhs_m1 = pto.vlds %ub_lhs_m1[%offset_m1] : !pto.ptr -> !pto.vreg<64xf32> + %rhs_m1 = pto.vlds %ub_rhs_m1[%offset_m1] : !pto.ptr -> !pto.vreg<64xf32> + %minv_m1 = pto.vmin %lhs_m1, %rhs_m1, %mask_m1 : !pto.vreg<64xf32>, !pto.vreg<64xf32>, !pto.mask -> !pto.vreg<64xf32> + pto.vsts %minv_m1, %ub_out_m1[%offset_m1], %mask_m1 : !pto.vreg<64xf32>, !pto.ptr, !pto.mask + scf.yield %next_remaining_m1 : i32 + } + } + + pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] + pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] + pto.mte_ub_gm %ub_out_m1, %arg5, %c128_i64_m1 + nburst(%c32_i64_m1, %c128_i64_m1, %c128_i64_m1) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64 + pto.barrier #pto.pipe + + // merged from vmin_f16_kernel via LaunchVmin_f16_kernel + + %c0_m2 = arith.constant 0 : index + %c1024_m2 = arith.constant 1024 : index + %c128_m2 = arith.constant 128 : index + %c0_i64_m2 = arith.constant 0 : i64 + %c1_i64_m2 = arith.constant 1 : i64 + %c32_i64_m2 = arith.constant 32 : i64 + %c64_i64_m2 = arith.constant 64 : i64 + %c2048_i64_m2 = arith.constant 2048 : i64 + %c4096_i64_m2 = arith.constant 4096 : i64 + + %ub_lhs_m2 = pto.castptr %c0_i64_m2 : i64 -> !pto.ptr + %ub_rhs_m2 = pto.castptr %c2048_i64_m2 : i64 -> !pto.ptr + %ub_out_m2 = pto.castptr %c4096_i64_m2 : i64 -> !pto.ptr + + %false_m2 = arith.constant false + pto.mte_gm_ub %arg6, %ub_lhs_m2, %c0_i64_m2, %c64_i64_m2 + nburst(%c32_i64_m2, %c64_i64_m2, %c64_i64_m2) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 + pto.mte_gm_ub %arg7, %ub_rhs_m2, %c0_i64_m2, %c64_i64_m2 + nburst(%c32_i64_m2, 
%c64_i64_m2, %c64_i64_m2) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 + + pto.set_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] + pto.wait_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] + + pto.vecscope { + %mask_m2 = pto.pset_b16 "PAT_ALL" : !pto.mask + scf.for %offset_m2 = %c0_m2 to %c1024_m2 step %c128_m2 { + %lhs_m2 = pto.vlds %ub_lhs_m2[%offset_m2] : !pto.ptr -> !pto.vreg<128xf16> + %rhs_m2 = pto.vlds %ub_rhs_m2[%offset_m2] : !pto.ptr -> !pto.vreg<128xf16> + %out_m2 = pto.vmin %lhs_m2, %rhs_m2, %mask_m2 : !pto.vreg<128xf16>, !pto.vreg<128xf16>, !pto.mask -> !pto.vreg<128xf16> + pto.vsts %out_m2, %ub_out_m2[%offset_m2], %mask_m2 : !pto.vreg<128xf16>, !pto.ptr, !pto.mask + } + } + + pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] + pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] + pto.mte_ub_gm %ub_out_m2, %arg8, %c64_i64_m2 + nburst(%c32_i64_m2, %c64_i64_m2, %c64_i64_m2) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64 + pto.barrier #pto.pipe + + // merged from vmin_bf16_kernel via LaunchVmin_bf16_kernel + + %c0_m3 = arith.constant 0 : index + %c1024_m3 = arith.constant 1024 : index + %c128_m3 = arith.constant 128 : index + %c0_i64_m3 = arith.constant 0 : i64 + %c1_i64_m3 = arith.constant 1 : i64 + %c32_i64_m3 = arith.constant 32 : i64 + %c64_i64_m3 = arith.constant 64 : i64 + %c2048_i64_m3 = arith.constant 2048 : i64 + %c4096_i64_m3 = arith.constant 4096 : i64 + + %ub_lhs_m3 = pto.castptr %c0_i64_m3 : i64 -> !pto.ptr + %ub_rhs_m3 = pto.castptr %c2048_i64_m3 : i64 -> !pto.ptr + %ub_out_m3 = pto.castptr %c4096_i64_m3 : i64 -> !pto.ptr + + %false_m3 = arith.constant false + pto.mte_gm_ub %arg9, %ub_lhs_m3, %c0_i64_m3, %c64_i64_m3 + nburst(%c32_i64_m3, %c64_i64_m3, %c64_i64_m3) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 + pto.mte_gm_ub %arg10, %ub_rhs_m3, %c0_i64_m3, %c64_i64_m3 + nburst(%c32_i64_m3, %c64_i64_m3, %c64_i64_m3) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 + + pto.set_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] + pto.wait_flag["PIPE_MTE2", "PIPE_V", 
"EVENT_ID0"] + + pto.vecscope { + %mask_m3 = pto.pset_b16 "PAT_ALL" : !pto.mask + scf.for %offset_m3 = %c0_m3 to %c1024_m3 step %c128_m3 { + %lhs_m3 = pto.vlds %ub_lhs_m3[%offset_m3] : !pto.ptr -> !pto.vreg<128xbf16> + %rhs_m3 = pto.vlds %ub_rhs_m3[%offset_m3] : !pto.ptr -> !pto.vreg<128xbf16> + %out_m3 = pto.vmin %lhs_m3, %rhs_m3, %mask_m3 : !pto.vreg<128xbf16>, !pto.vreg<128xbf16>, !pto.mask -> !pto.vreg<128xbf16> + pto.vsts %out_m3, %ub_out_m3[%offset_m3], %mask_m3 : !pto.vreg<128xbf16>, !pto.ptr, !pto.mask + } + } + + pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] + pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] + pto.mte_ub_gm %ub_out_m3, %arg11, %c64_i64_m3 + nburst(%c32_i64_m3, %c64_i64_m3, %c64_i64_m3) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64 + pto.barrier #pto.pipe + + // merged from vmin_i16_signed_kernel via LaunchVmin_i16_signed_kernel + + %c0_m4 = arith.constant 0 : index + %c1024_m4 = arith.constant 1024 : index + %c128_m4 = arith.constant 128 : index + %c0_i64_m4 = arith.constant 0 : i64 + %c1_i64_m4 = arith.constant 1 : i64 + %c32_i64_m4 = arith.constant 32 : i64 + %c64_i64_m4 = arith.constant 64 : i64 + %c2048_i64_m4 = arith.constant 2048 : i64 + %c4096_i64_m4 = arith.constant 4096 : i64 + + %ub_lhs_m4 = pto.castptr %c0_i64_m4 : i64 -> !pto.ptr + %ub_rhs_m4 = pto.castptr %c2048_i64_m4 : i64 -> !pto.ptr + %ub_out_m4 = pto.castptr %c4096_i64_m4 : i64 -> !pto.ptr + + %false_m4 = arith.constant false + pto.mte_gm_ub %arg12, %ub_lhs_m4, %c0_i64_m4, %c64_i64_m4 + nburst(%c32_i64_m4, %c64_i64_m4, %c64_i64_m4) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 + pto.mte_gm_ub %arg13, %ub_rhs_m4, %c0_i64_m4, %c64_i64_m4 + nburst(%c32_i64_m4, %c64_i64_m4, %c64_i64_m4) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 + + pto.set_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] + pto.wait_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] + + pto.vecscope { + %mask_m4 = pto.pset_b16 "PAT_ALL" : !pto.mask + scf.for %offset_m4 = %c0_m4 to %c1024_m4 step %c128_m4 { + %lhs_m4 = 
pto.vlds %ub_lhs_m4[%offset_m4] : !pto.ptr -> !pto.vreg<128xi16> + %rhs_m4 = pto.vlds %ub_rhs_m4[%offset_m4] : !pto.ptr -> !pto.vreg<128xi16> + %out_m4 = pto.vmin %lhs_m4, %rhs_m4, %mask_m4 : !pto.vreg<128xi16>, !pto.vreg<128xi16>, !pto.mask -> !pto.vreg<128xi16> + pto.vsts %out_m4, %ub_out_m4[%offset_m4], %mask_m4 : !pto.vreg<128xi16>, !pto.ptr, !pto.mask + } + } + + pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] + pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] + pto.mte_ub_gm %ub_out_m4, %arg14, %c64_i64_m4 + nburst(%c32_i64_m4, %c64_i64_m4, %c64_i64_m4) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64 + pto.barrier #pto.pipe + + // merged from vmin_i16_unsigned_kernel via LaunchVmin_i16_unsigned_kernel + + %c0_m5 = arith.constant 0 : index + %c1024_m5 = arith.constant 1024 : index + %c128_m5 = arith.constant 128 : index + %c0_i64_m5 = arith.constant 0 : i64 + %c1_i64_m5 = arith.constant 1 : i64 + %c32_i64_m5 = arith.constant 32 : i64 + %c64_i64_m5 = arith.constant 64 : i64 + %c2048_i64_m5 = arith.constant 2048 : i64 + %c4096_i64_m5 = arith.constant 4096 : i64 + + %ub_lhs_m5 = pto.castptr %c0_i64_m5 : i64 -> !pto.ptr + %ub_rhs_m5 = pto.castptr %c2048_i64_m5 : i64 -> !pto.ptr + %ub_out_m5 = pto.castptr %c4096_i64_m5 : i64 -> !pto.ptr + + %false_m5 = arith.constant false + pto.mte_gm_ub %arg15, %ub_lhs_m5, %c0_i64_m5, %c64_i64_m5 + nburst(%c32_i64_m5, %c64_i64_m5, %c64_i64_m5) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 + pto.mte_gm_ub %arg16, %ub_rhs_m5, %c0_i64_m5, %c64_i64_m5 + nburst(%c32_i64_m5, %c64_i64_m5, %c64_i64_m5) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 + + pto.set_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] + pto.wait_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] + + pto.vecscope { + %mask_m5 = pto.pset_b16 "PAT_ALL" : !pto.mask + scf.for %offset_m5 = %c0_m5 to %c1024_m5 step %c128_m5 { + %lhs_m5 = pto.vlds %ub_lhs_m5[%offset_m5] : !pto.ptr -> !pto.vreg<128xui16> + %rhs_m5 = pto.vlds %ub_rhs_m5[%offset_m5] : !pto.ptr -> !pto.vreg<128xui16> + %out_m5 
= pto.vmin %lhs_m5, %rhs_m5, %mask_m5 : !pto.vreg<128xui16>, !pto.vreg<128xui16>, !pto.mask -> !pto.vreg<128xui16> + pto.vsts %out_m5, %ub_out_m5[%offset_m5], %mask_m5 : !pto.vreg<128xui16>, !pto.ptr, !pto.mask + } + } + + pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] + pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] + pto.mte_ub_gm %ub_out_m5, %arg17, %c64_i64_m5 + nburst(%c32_i64_m5, %c64_i64_m5, %c64_i64_m5) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64 + pto.barrier #pto.pipe + + // merged from vmin_tail_kernel_2d via LaunchVmin_tail_kernel_2d + + %c0_m6 = arith.constant 0 : index + %c64_m6 = arith.constant 64 : index + %c1024_m6 = arith.constant 1024 : index + %c0_i64_m6 = arith.constant 0 : i64 + %c1_i64_m6 = arith.constant 1 : i64 + %c32_i64_m6 = arith.constant 32 : i64 + %c128_i64_m6 = arith.constant 128 : i64 + %c4096_i64_m6 = arith.constant 4096 : i64 + %c8192_i64_m6 = arith.constant 8192 : i64 + %c1000_i32_m6 = arith.constant 1000 : i32 + + %ub_lhs_m6 = pto.castptr %c0_i64_m6 : i64 -> !pto.ptr + %ub_rhs_m6 = pto.castptr %c4096_i64_m6 : i64 -> !pto.ptr + %ub_out_m6 = pto.castptr %c8192_i64_m6 : i64 -> !pto.ptr + + %false_m6 = arith.constant false + pto.mte_gm_ub %arg18, %ub_lhs_m6, %c0_i64_m6, %c128_i64_m6 + nburst(%c32_i64_m6, %c128_i64_m6, %c128_i64_m6) : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 - pto.mte_gm_ub %arg1, %ub_rhs, %c0_i64, %c128_i64 - nburst(%c32_i64, %c128_i64, %c128_i64) + pto.mte_gm_ub %arg19, %ub_rhs_m6, %c0_i64_m6, %c128_i64_m6 + nburst(%c32_i64_m6, %c128_i64_m6, %c128_i64_m6) : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 pto.set_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] pto.wait_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] pto.vecscope { - %_:1 = scf.for %offset = %c0 to %c1024 step %c64 iter_args(%remaining = %c1024_i32) -> (i32) { - %mask, %next_remaining = pto.plt_b32 %remaining : i32 -> !pto.mask, i32 - %lhs = pto.vlds %ub_lhs[%offset] : !pto.ptr -> !pto.vreg<64xf32> - %rhs = pto.vlds %ub_rhs[%offset] : !pto.ptr -> 
!pto.vreg<64xf32> - %minv = pto.vmin %lhs, %rhs, %mask : !pto.vreg<64xf32>, !pto.vreg<64xf32>, !pto.mask -> !pto.vreg<64xf32> - pto.vsts %minv, %ub_out[%offset], %mask : !pto.vreg<64xf32>, !pto.ptr, !pto.mask - scf.yield %next_remaining : i32 + %__m6:1 = scf.for %offset_m6 = %c0_m6 to %c1024_m6 step %c64_m6 iter_args(%remaining_m6 = %c1000_i32_m6) -> (i32) { + %mask_m6, %next_remaining_m6 = pto.plt_b32 %remaining_m6 : i32 -> !pto.mask, i32 + %lhs_m6 = pto.vlds %ub_lhs_m6[%offset_m6] : !pto.ptr -> !pto.vreg<64xf32> + %rhs_m6 = pto.vlds %ub_rhs_m6[%offset_m6] : !pto.ptr -> !pto.vreg<64xf32> + %out_m6 = pto.vmin %lhs_m6, %rhs_m6, %mask_m6 : !pto.vreg<64xf32>, !pto.vreg<64xf32>, !pto.mask -> !pto.vreg<64xf32> + pto.vsts %out_m6, %ub_out_m6[%offset_m6], %mask_m6 : !pto.vreg<64xf32>, !pto.ptr, !pto.mask + scf.yield %next_remaining_m6 : i32 } } pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.mte_ub_gm %ub_out, %arg2, %c128_i64 - nburst(%c32_i64, %c128_i64, %c128_i64) + pto.mte_ub_gm %ub_out_m6, %arg20, %c128_i64_m6 + nburst(%c32_i64_m6, %c128_i64_m6, %c128_i64_m6) : !pto.ptr, !pto.ptr, i64, i64, i64, i64 pto.barrier #pto.pipe + return } } diff --git a/test/vpto/cases/micro-op/binary-vector/vmin/launch.cpp b/test/vpto/cases/micro-op/binary-vector/vmin/launch.cpp index f2c64c6a6..2640aaf64 100644 --- a/test/vpto/cases/micro-op/binary-vector/vmin/launch.cpp +++ b/test/vpto/cases/micro-op/binary-vector/vmin/launch.cpp @@ -5,11 +5,10 @@ // THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, // INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. // See LICENSE in the root of the software repository for the full text of the License. - +// ... license ... 
#ifndef __VEC_SCOPE__ #define __VEC_SCOPE__ #endif - #if defined(__CCE_AICORE__) && defined(__NPU_ARCH__) && (__NPU_ARCH__ == 2201) typedef struct { unsigned char v; } hifloat8_t; typedef struct { unsigned char v; } float8_e4m3_t; @@ -22,14 +21,8 @@ typedef struct { unsigned char v; } float4_e2m1x2_t; #if defined(__CCE_AICORE__) && defined(PTOAS_ENABLE_CCE_PRINT) #include #endif - #if !defined(__CCE_AICORE__) && !defined(TMRGSORT_HPP) -struct MrgSortExecutedNumList { - uint16_t mrgSortList0; - uint16_t mrgSortList1; - uint16_t mrgSortList2; - uint16_t mrgSortList3; -}; +struct MrgSortExecutedNumList { uint16_t mrgSortList0,mrgSortList1,mrgSortList2,mrgSortList3; }; #endif #ifndef __CPU_SIM #include "acl/acl.h" @@ -39,8 +32,50 @@ extern "C" __global__ [aicore] void min_kernel_2d(__gm__ float *v1, __gm__ float *v2, __gm__ float *v3); -void LaunchMin_kernel_2d(float *v1, float *v2, float *v3, void *stream) { - min_kernel_2d<<<1, nullptr, stream>>>((__gm__ float *)v1, - (__gm__ float *)v2, - (__gm__ float *)v3); +extern "C" __global__ [aicore] void vmin_deep_merged_kernel( + __gm__ float * arg0, + __gm__ float * arg1, + __gm__ float * arg2, + __gm__ float * arg3, + __gm__ float * arg4, + __gm__ float * arg5, + __gm__ half * arg6, + __gm__ half * arg7, + __gm__ half * arg8, + __gm__ bfloat16_t * arg9, + __gm__ bfloat16_t * arg10, + __gm__ bfloat16_t * arg11, + __gm__ int16_t * arg12, + __gm__ int16_t * arg13, + __gm__ int16_t * arg14, + __gm__ uint16_t * arg15, + __gm__ uint16_t * arg16, + __gm__ uint16_t * arg17, + __gm__ float * arg18, + __gm__ float * arg19, + __gm__ float * arg20); + +void LaunchVminDeepMerged(float * p0, float * p1, float * p2, float * p3, float * p4, float * p5, uint16_t * p6, uint16_t * p7, uint16_t * p8, uint16_t * p9, uint16_t * p10, uint16_t * p11, int16_t * p12, int16_t * p13, int16_t * p14, uint16_t * p15, uint16_t * p16, uint16_t * p17, float * p18, float * p19, float * p20, void *stream) { + vmin_deep_merged_kernel<<<1, nullptr, stream>>>( 
+ (__gm__ float *)p0, + (__gm__ float *)p1, + (__gm__ float *)p2, + (__gm__ float *)p3, + (__gm__ float *)p4, + (__gm__ float *)p5, + (__gm__ half *)p6, + (__gm__ half *)p7, + (__gm__ half *)p8, + (__gm__ bfloat16_t *)p9, + (__gm__ bfloat16_t *)p10, + (__gm__ bfloat16_t *)p11, + (__gm__ int16_t *)p12, + (__gm__ int16_t *)p13, + (__gm__ int16_t *)p14, + (__gm__ uint16_t *)p15, + (__gm__ uint16_t *)p16, + (__gm__ uint16_t *)p17, + (__gm__ float *)p18, + (__gm__ float *)p19, + (__gm__ float *)p20); } diff --git a/test/vpto/cases/micro-op/binary-vector/vmin/main.cpp b/test/vpto/cases/micro-op/binary-vector/vmin/main.cpp index b952b76a0..dd1877c3d 100644 --- a/test/vpto/cases/micro-op/binary-vector/vmin/main.cpp +++ b/test/vpto/cases/micro-op/binary-vector/vmin/main.cpp @@ -5,90 +5,229 @@ // THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, // INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. // See LICENSE in the root of the software repository for the full text of the License. - -/** -Copyright (c) 2025 Huawei Technologies Co., Ltd. -*/ +// Merged vmin host runner. 
#include "test_common.h" #include "acl/acl.h" #include #include - +#include using namespace PtoTestCommon; -#define ACL_CHECK(expr) \ - do { \ - const aclError _ret = (expr); \ - if (_ret != ACL_SUCCESS) { \ - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", #expr, \ - (int)_ret, __FILE__, __LINE__); \ - rc = 1; \ - goto cleanup; \ - } \ - } while (0) +#define ACL_CHECK(expr) do { \ + const aclError _ret = (expr); \ + if (_ret != ACL_SUCCESS) { std::fprintf(stderr,"[ERROR] %s:%d acle=%d\n",#expr,__LINE__,(int)_ret); rc=1; goto cleanup; } \ +} while(0) -void LaunchMin_kernel_2d(float *v1, float *v2, float *v3, void *stream); +#define FCK(expr,path) do { if(!(expr)){std::fprintf(stderr,"[ERROR] file:%s\n",path);rc=1;goto cleanup;} } while(0) + +void LaunchVminDeepMerged(float * p0, float * p1, float * p2, float * p3, float * p4, float * p5, uint16_t * p6, uint16_t * p7, uint16_t * p8, uint16_t * p9, uint16_t * p10, uint16_t * p11, int16_t * p12, int16_t * p13, int16_t * p14, uint16_t * p15, uint16_t * p16, uint16_t * p17, float * p18, float * p19, float * p20, void *stream); int main() { - size_t elemCount_v1 = 1024; - size_t fileSize_v1 = elemCount_v1 * sizeof(float); - size_t elemCount_v2 = 1024; - size_t fileSize_v2 = elemCount_v2 * sizeof(float); - size_t elemCount_v3 = 1024; - size_t fileSize_v3 = elemCount_v3 * sizeof(float); - float *v1Host = nullptr; - float *v1Device = nullptr; - float *v2Host = nullptr; - float *v2Device = nullptr; - float *v3Host = nullptr; - float *v3Device = nullptr; - int rc = 0; - bool aclInited = false; - bool deviceSet = false; - int deviceId = 0; - aclrtStream stream = nullptr; + constexpr size_t SZ_f32 = 4096; + constexpr size_t SZ_f16 = 2048; + constexpr size_t SZ_bf16 = 2048; + constexpr size_t SZ_f32_exceptional = 4096; + constexpr size_t SZ_i16_signed = 2048; + constexpr size_t SZ_i16_unsigned = 2048; + constexpr size_t SZ_tail = 4096; + + float *h_f32_v1=nullptr, *d_f32_v1=nullptr; + float *h_f32_v2=nullptr, 
*d_f32_v2=nullptr; + float *h_f32_v3=nullptr, *d_f32_v3=nullptr; + uint16_t *h_f16_v1=nullptr, *d_f16_v1=nullptr; + uint16_t *h_f16_v2=nullptr, *d_f16_v2=nullptr; + uint16_t *h_f16_v3=nullptr, *d_f16_v3=nullptr; + uint16_t *h_bf16_v1=nullptr, *d_bf16_v1=nullptr; + uint16_t *h_bf16_v2=nullptr, *d_bf16_v2=nullptr; + uint16_t *h_bf16_v3=nullptr, *d_bf16_v3=nullptr; + float *h_f32_exceptional_v1=nullptr, *d_f32_exceptional_v1=nullptr; + float *h_f32_exceptional_v2=nullptr, *d_f32_exceptional_v2=nullptr; + float *h_f32_exceptional_v3=nullptr, *d_f32_exceptional_v3=nullptr; + int16_t *h_i16_signed_v1=nullptr, *d_i16_signed_v1=nullptr; + int16_t *h_i16_signed_v2=nullptr, *d_i16_signed_v2=nullptr; + int16_t *h_i16_signed_v3=nullptr, *d_i16_signed_v3=nullptr; + uint16_t *h_i16_unsigned_v1=nullptr, *d_i16_unsigned_v1=nullptr; + uint16_t *h_i16_unsigned_v2=nullptr, *d_i16_unsigned_v2=nullptr; + uint16_t *h_i16_unsigned_v3=nullptr, *d_i16_unsigned_v3=nullptr; + float *h_tail_v1=nullptr, *d_tail_v1=nullptr; + float *h_tail_v2=nullptr, *d_tail_v2=nullptr; + float *h_tail_v3=nullptr, *d_tail_v3=nullptr; - ACL_CHECK(aclInit(nullptr)); - aclInited = true; - if (const char *envDevice = std::getenv("ACL_DEVICE_ID")) - deviceId = std::atoi(envDevice); - ACL_CHECK(aclrtSetDevice(deviceId)); - deviceSet = true; + int rc=0; bool aclInited=false,deviceSet=false; int deviceId=0; aclrtStream stream=nullptr; size_t fsz=0; + ACL_CHECK(aclInit(nullptr)); aclInited=true; + if(const char*e=std::getenv("ACL_DEVICE_ID")) deviceId=std::atoi(e); + ACL_CHECK(aclrtSetDevice(deviceId)); deviceSet=true; ACL_CHECK(aclrtCreateStream(&stream)); - ACL_CHECK(aclrtMallocHost((void **)(&v1Host), fileSize_v1)); - ACL_CHECK(aclrtMallocHost((void **)(&v2Host), fileSize_v2)); - ACL_CHECK(aclrtMallocHost((void **)(&v3Host), fileSize_v3)); - ACL_CHECK(aclrtMalloc((void **)&v1Device, fileSize_v1, ACL_MEM_MALLOC_HUGE_FIRST)); - ACL_CHECK(aclrtMalloc((void **)&v2Device, fileSize_v2, ACL_MEM_MALLOC_HUGE_FIRST)); - 
ACL_CHECK(aclrtMalloc((void **)&v3Device, fileSize_v3, ACL_MEM_MALLOC_HUGE_FIRST)); - ReadFile("./v1.bin", fileSize_v1, v1Host, fileSize_v1); - ReadFile("./v2.bin", fileSize_v2, v2Host, fileSize_v2); - ReadFile("./v3.bin", fileSize_v3, v3Host, fileSize_v3); - ACL_CHECK(aclrtMemcpy(v1Device, fileSize_v1, v1Host, fileSize_v1, - ACL_MEMCPY_HOST_TO_DEVICE)); - ACL_CHECK(aclrtMemcpy(v2Device, fileSize_v2, v2Host, fileSize_v2, - ACL_MEMCPY_HOST_TO_DEVICE)); - ACL_CHECK(aclrtMemcpy(v3Device, fileSize_v3, v3Host, fileSize_v3, - ACL_MEMCPY_HOST_TO_DEVICE)); - LaunchMin_kernel_2d(v1Device, v2Device, v3Device, stream); + ACL_CHECK(aclrtMallocHost((void**)&h_f32_v1,SZ_f32)); + ACL_CHECK(aclrtMallocHost((void**)&h_f32_v2,SZ_f32)); + ACL_CHECK(aclrtMallocHost((void**)&h_f32_v3,SZ_f32)); + ACL_CHECK(aclrtMallocHost((void**)&h_f16_v1,SZ_f16)); + ACL_CHECK(aclrtMallocHost((void**)&h_f16_v2,SZ_f16)); + ACL_CHECK(aclrtMallocHost((void**)&h_f16_v3,SZ_f16)); + ACL_CHECK(aclrtMallocHost((void**)&h_bf16_v1,SZ_bf16)); + ACL_CHECK(aclrtMallocHost((void**)&h_bf16_v2,SZ_bf16)); + ACL_CHECK(aclrtMallocHost((void**)&h_bf16_v3,SZ_bf16)); + ACL_CHECK(aclrtMallocHost((void**)&h_f32_exceptional_v1,SZ_f32_exceptional)); + ACL_CHECK(aclrtMallocHost((void**)&h_f32_exceptional_v2,SZ_f32_exceptional)); + ACL_CHECK(aclrtMallocHost((void**)&h_f32_exceptional_v3,SZ_f32_exceptional)); + ACL_CHECK(aclrtMallocHost((void**)&h_i16_signed_v1,SZ_i16_signed)); + ACL_CHECK(aclrtMallocHost((void**)&h_i16_signed_v2,SZ_i16_signed)); + ACL_CHECK(aclrtMallocHost((void**)&h_i16_signed_v3,SZ_i16_signed)); + ACL_CHECK(aclrtMallocHost((void**)&h_i16_unsigned_v1,SZ_i16_unsigned)); + ACL_CHECK(aclrtMallocHost((void**)&h_i16_unsigned_v2,SZ_i16_unsigned)); + ACL_CHECK(aclrtMallocHost((void**)&h_i16_unsigned_v3,SZ_i16_unsigned)); + ACL_CHECK(aclrtMallocHost((void**)&h_tail_v1,SZ_tail)); + ACL_CHECK(aclrtMallocHost((void**)&h_tail_v2,SZ_tail)); + ACL_CHECK(aclrtMallocHost((void**)&h_tail_v3,SZ_tail)); + 
ACL_CHECK(aclrtMalloc((void**)&d_f32_v1,SZ_f32,ACL_MEM_MALLOC_HUGE_FIRST)); + ACL_CHECK(aclrtMalloc((void**)&d_f32_v2,SZ_f32,ACL_MEM_MALLOC_HUGE_FIRST)); + ACL_CHECK(aclrtMalloc((void**)&d_f32_v3,SZ_f32,ACL_MEM_MALLOC_HUGE_FIRST)); + ACL_CHECK(aclrtMalloc((void**)&d_f16_v1,SZ_f16,ACL_MEM_MALLOC_HUGE_FIRST)); + ACL_CHECK(aclrtMalloc((void**)&d_f16_v2,SZ_f16,ACL_MEM_MALLOC_HUGE_FIRST)); + ACL_CHECK(aclrtMalloc((void**)&d_f16_v3,SZ_f16,ACL_MEM_MALLOC_HUGE_FIRST)); + ACL_CHECK(aclrtMalloc((void**)&d_bf16_v1,SZ_bf16,ACL_MEM_MALLOC_HUGE_FIRST)); + ACL_CHECK(aclrtMalloc((void**)&d_bf16_v2,SZ_bf16,ACL_MEM_MALLOC_HUGE_FIRST)); + ACL_CHECK(aclrtMalloc((void**)&d_bf16_v3,SZ_bf16,ACL_MEM_MALLOC_HUGE_FIRST)); + ACL_CHECK(aclrtMalloc((void**)&d_f32_exceptional_v1,SZ_f32_exceptional,ACL_MEM_MALLOC_HUGE_FIRST)); + ACL_CHECK(aclrtMalloc((void**)&d_f32_exceptional_v2,SZ_f32_exceptional,ACL_MEM_MALLOC_HUGE_FIRST)); + ACL_CHECK(aclrtMalloc((void**)&d_f32_exceptional_v3,SZ_f32_exceptional,ACL_MEM_MALLOC_HUGE_FIRST)); + ACL_CHECK(aclrtMalloc((void**)&d_i16_signed_v1,SZ_i16_signed,ACL_MEM_MALLOC_HUGE_FIRST)); + ACL_CHECK(aclrtMalloc((void**)&d_i16_signed_v2,SZ_i16_signed,ACL_MEM_MALLOC_HUGE_FIRST)); + ACL_CHECK(aclrtMalloc((void**)&d_i16_signed_v3,SZ_i16_signed,ACL_MEM_MALLOC_HUGE_FIRST)); + ACL_CHECK(aclrtMalloc((void**)&d_i16_unsigned_v1,SZ_i16_unsigned,ACL_MEM_MALLOC_HUGE_FIRST)); + ACL_CHECK(aclrtMalloc((void**)&d_i16_unsigned_v2,SZ_i16_unsigned,ACL_MEM_MALLOC_HUGE_FIRST)); + ACL_CHECK(aclrtMalloc((void**)&d_i16_unsigned_v3,SZ_i16_unsigned,ACL_MEM_MALLOC_HUGE_FIRST)); + ACL_CHECK(aclrtMalloc((void**)&d_tail_v1,SZ_tail,ACL_MEM_MALLOC_HUGE_FIRST)); + ACL_CHECK(aclrtMalloc((void**)&d_tail_v2,SZ_tail,ACL_MEM_MALLOC_HUGE_FIRST)); + ACL_CHECK(aclrtMalloc((void**)&d_tail_v3,SZ_tail,ACL_MEM_MALLOC_HUGE_FIRST)); + fsz=SZ_f32; FCK(ReadFile("v1.bin",fsz,h_f32_v1,SZ_f32)&&fsz==SZ_f32,"v1.bin"); + fsz=SZ_f32; FCK(ReadFile("v2.bin",fsz,h_f32_v2,SZ_f32)&&fsz==SZ_f32,"v2.bin"); + fsz=SZ_f32; 
FCK(ReadFile("v3.bin",fsz,h_f32_v3,SZ_f32)&&fsz==SZ_f32,"v3.bin"); + fsz=SZ_f16; FCK(ReadFile("v1_f16.bin",fsz,h_f16_v1,SZ_f16)&&fsz==SZ_f16,"v1_f16.bin"); + fsz=SZ_f16; FCK(ReadFile("v2_f16.bin",fsz,h_f16_v2,SZ_f16)&&fsz==SZ_f16,"v2_f16.bin"); + fsz=SZ_f16; FCK(ReadFile("v3_f16.bin",fsz,h_f16_v3,SZ_f16)&&fsz==SZ_f16,"v3_f16.bin"); + fsz=SZ_bf16; FCK(ReadFile("v1_bf16.bin",fsz,h_bf16_v1,SZ_bf16)&&fsz==SZ_bf16,"v1_bf16.bin"); + fsz=SZ_bf16; FCK(ReadFile("v2_bf16.bin",fsz,h_bf16_v2,SZ_bf16)&&fsz==SZ_bf16,"v2_bf16.bin"); + fsz=SZ_bf16; FCK(ReadFile("v3_bf16.bin",fsz,h_bf16_v3,SZ_bf16)&&fsz==SZ_bf16,"v3_bf16.bin"); + fsz=SZ_f32_exceptional; FCK(ReadFile("v1_f32_exceptional.bin",fsz,h_f32_exceptional_v1,SZ_f32_exceptional)&&fsz==SZ_f32_exceptional,"v1_f32_exceptional.bin"); + fsz=SZ_f32_exceptional; FCK(ReadFile("v2_f32_exceptional.bin",fsz,h_f32_exceptional_v2,SZ_f32_exceptional)&&fsz==SZ_f32_exceptional,"v2_f32_exceptional.bin"); + fsz=SZ_f32_exceptional; FCK(ReadFile("v3_f32_exceptional.bin",fsz,h_f32_exceptional_v3,SZ_f32_exceptional)&&fsz==SZ_f32_exceptional,"v3_f32_exceptional.bin"); + fsz=SZ_i16_signed; FCK(ReadFile("v1_i16_signed.bin",fsz,h_i16_signed_v1,SZ_i16_signed)&&fsz==SZ_i16_signed,"v1_i16_signed.bin"); + fsz=SZ_i16_signed; FCK(ReadFile("v2_i16_signed.bin",fsz,h_i16_signed_v2,SZ_i16_signed)&&fsz==SZ_i16_signed,"v2_i16_signed.bin"); + fsz=SZ_i16_signed; FCK(ReadFile("v3_i16_signed.bin",fsz,h_i16_signed_v3,SZ_i16_signed)&&fsz==SZ_i16_signed,"v3_i16_signed.bin"); + fsz=SZ_i16_unsigned; FCK(ReadFile("v1_i16_unsigned.bin",fsz,h_i16_unsigned_v1,SZ_i16_unsigned)&&fsz==SZ_i16_unsigned,"v1_i16_unsigned.bin"); + fsz=SZ_i16_unsigned; FCK(ReadFile("v2_i16_unsigned.bin",fsz,h_i16_unsigned_v2,SZ_i16_unsigned)&&fsz==SZ_i16_unsigned,"v2_i16_unsigned.bin"); + fsz=SZ_i16_unsigned; FCK(ReadFile("v3_i16_unsigned.bin",fsz,h_i16_unsigned_v3,SZ_i16_unsigned)&&fsz==SZ_i16_unsigned,"v3_i16_unsigned.bin"); + fsz=SZ_tail; 
FCK(ReadFile("v1_tail.bin",fsz,h_tail_v1,SZ_tail)&&fsz==SZ_tail,"v1_tail.bin"); + fsz=SZ_tail; FCK(ReadFile("v2_tail.bin",fsz,h_tail_v2,SZ_tail)&&fsz==SZ_tail,"v2_tail.bin"); + fsz=SZ_tail; FCK(ReadFile("v3_tail.bin",fsz,h_tail_v3,SZ_tail)&&fsz==SZ_tail,"v3_tail.bin"); + ACL_CHECK(aclrtMemcpy(d_f32_v1,SZ_f32,h_f32_v1,SZ_f32,ACL_MEMCPY_HOST_TO_DEVICE)); + ACL_CHECK(aclrtMemcpy(d_f32_v2,SZ_f32,h_f32_v2,SZ_f32,ACL_MEMCPY_HOST_TO_DEVICE)); + ACL_CHECK(aclrtMemcpy(d_f32_v3,SZ_f32,h_f32_v3,SZ_f32,ACL_MEMCPY_HOST_TO_DEVICE)); + ACL_CHECK(aclrtMemcpy(d_f16_v1,SZ_f16,h_f16_v1,SZ_f16,ACL_MEMCPY_HOST_TO_DEVICE)); + ACL_CHECK(aclrtMemcpy(d_f16_v2,SZ_f16,h_f16_v2,SZ_f16,ACL_MEMCPY_HOST_TO_DEVICE)); + ACL_CHECK(aclrtMemcpy(d_f16_v3,SZ_f16,h_f16_v3,SZ_f16,ACL_MEMCPY_HOST_TO_DEVICE)); + ACL_CHECK(aclrtMemcpy(d_bf16_v1,SZ_bf16,h_bf16_v1,SZ_bf16,ACL_MEMCPY_HOST_TO_DEVICE)); + ACL_CHECK(aclrtMemcpy(d_bf16_v2,SZ_bf16,h_bf16_v2,SZ_bf16,ACL_MEMCPY_HOST_TO_DEVICE)); + ACL_CHECK(aclrtMemcpy(d_bf16_v3,SZ_bf16,h_bf16_v3,SZ_bf16,ACL_MEMCPY_HOST_TO_DEVICE)); + ACL_CHECK(aclrtMemcpy(d_f32_exceptional_v1,SZ_f32_exceptional,h_f32_exceptional_v1,SZ_f32_exceptional,ACL_MEMCPY_HOST_TO_DEVICE)); + ACL_CHECK(aclrtMemcpy(d_f32_exceptional_v2,SZ_f32_exceptional,h_f32_exceptional_v2,SZ_f32_exceptional,ACL_MEMCPY_HOST_TO_DEVICE)); + ACL_CHECK(aclrtMemcpy(d_f32_exceptional_v3,SZ_f32_exceptional,h_f32_exceptional_v3,SZ_f32_exceptional,ACL_MEMCPY_HOST_TO_DEVICE)); + ACL_CHECK(aclrtMemcpy(d_i16_signed_v1,SZ_i16_signed,h_i16_signed_v1,SZ_i16_signed,ACL_MEMCPY_HOST_TO_DEVICE)); + ACL_CHECK(aclrtMemcpy(d_i16_signed_v2,SZ_i16_signed,h_i16_signed_v2,SZ_i16_signed,ACL_MEMCPY_HOST_TO_DEVICE)); + ACL_CHECK(aclrtMemcpy(d_i16_signed_v3,SZ_i16_signed,h_i16_signed_v3,SZ_i16_signed,ACL_MEMCPY_HOST_TO_DEVICE)); + ACL_CHECK(aclrtMemcpy(d_i16_unsigned_v1,SZ_i16_unsigned,h_i16_unsigned_v1,SZ_i16_unsigned,ACL_MEMCPY_HOST_TO_DEVICE)); + 
ACL_CHECK(aclrtMemcpy(d_i16_unsigned_v2,SZ_i16_unsigned,h_i16_unsigned_v2,SZ_i16_unsigned,ACL_MEMCPY_HOST_TO_DEVICE)); + ACL_CHECK(aclrtMemcpy(d_i16_unsigned_v3,SZ_i16_unsigned,h_i16_unsigned_v3,SZ_i16_unsigned,ACL_MEMCPY_HOST_TO_DEVICE)); + ACL_CHECK(aclrtMemcpy(d_tail_v1,SZ_tail,h_tail_v1,SZ_tail,ACL_MEMCPY_HOST_TO_DEVICE)); + ACL_CHECK(aclrtMemcpy(d_tail_v2,SZ_tail,h_tail_v2,SZ_tail,ACL_MEMCPY_HOST_TO_DEVICE)); + ACL_CHECK(aclrtMemcpy(d_tail_v3,SZ_tail,h_tail_v3,SZ_tail,ACL_MEMCPY_HOST_TO_DEVICE)); + LaunchVminDeepMerged( + d_f32_v1, + d_f32_v2, + d_f32_v3, + d_f32_exceptional_v1, + d_f32_exceptional_v2, + d_f32_exceptional_v3, + d_f16_v1, + d_f16_v2, + d_f16_v3, + d_bf16_v1, + d_bf16_v2, + d_bf16_v3, + d_i16_signed_v1, + d_i16_signed_v2, + d_i16_signed_v3, + d_i16_unsigned_v1, + d_i16_unsigned_v2, + d_i16_unsigned_v3, + d_tail_v1, + d_tail_v2, + d_tail_v3, + stream + ); ACL_CHECK(aclrtSynchronizeStream(stream)); - ACL_CHECK(aclrtMemcpy(v3Host, fileSize_v3, v3Device, fileSize_v3, - ACL_MEMCPY_DEVICE_TO_HOST)); - WriteFile("./v3.bin", v3Host, fileSize_v3); + ACL_CHECK(aclrtMemcpy(h_f32_v3,SZ_f32,d_f32_v3,SZ_f32,ACL_MEMCPY_DEVICE_TO_HOST)); + ACL_CHECK(aclrtMemcpy(h_f16_v3,SZ_f16,d_f16_v3,SZ_f16,ACL_MEMCPY_DEVICE_TO_HOST)); + ACL_CHECK(aclrtMemcpy(h_bf16_v3,SZ_bf16,d_bf16_v3,SZ_bf16,ACL_MEMCPY_DEVICE_TO_HOST)); + ACL_CHECK(aclrtMemcpy(h_f32_exceptional_v3,SZ_f32_exceptional,d_f32_exceptional_v3,SZ_f32_exceptional,ACL_MEMCPY_DEVICE_TO_HOST)); + ACL_CHECK(aclrtMemcpy(h_i16_signed_v3,SZ_i16_signed,d_i16_signed_v3,SZ_i16_signed,ACL_MEMCPY_DEVICE_TO_HOST)); + ACL_CHECK(aclrtMemcpy(h_i16_unsigned_v3,SZ_i16_unsigned,d_i16_unsigned_v3,SZ_i16_unsigned,ACL_MEMCPY_DEVICE_TO_HOST)); + ACL_CHECK(aclrtMemcpy(h_tail_v3,SZ_tail,d_tail_v3,SZ_tail,ACL_MEMCPY_DEVICE_TO_HOST)); + FCK(WriteFile("v3.bin",h_f32_v3,SZ_f32),"v3.bin"); + FCK(WriteFile("v3_f16.bin",h_f16_v3,SZ_f16),"v3_f16.bin"); + FCK(WriteFile("v3_bf16.bin",h_bf16_v3,SZ_bf16),"v3_bf16.bin"); + 
FCK(WriteFile("v3_f32_exceptional.bin",h_f32_exceptional_v3,SZ_f32_exceptional),"v3_f32_exceptional.bin"); + FCK(WriteFile("v3_i16_signed.bin",h_i16_signed_v3,SZ_i16_signed),"v3_i16_signed.bin"); + FCK(WriteFile("v3_i16_unsigned.bin",h_i16_unsigned_v3,SZ_i16_unsigned),"v3_i16_unsigned.bin"); + FCK(WriteFile("v3_tail.bin",h_tail_v3,SZ_tail),"v3_tail.bin"); cleanup: - aclrtFree(v1Device); - aclrtFree(v2Device); - aclrtFree(v3Device); - aclrtFreeHost(v1Host); - aclrtFreeHost(v2Host); - aclrtFreeHost(v3Host); - if (stream != nullptr) - aclrtDestroyStream(stream); - if (deviceSet) - aclrtResetDevice(deviceId); - if (aclInited) - aclFinalize(); + aclrtFree(d_f32_v1); + aclrtFree(d_f32_v2); + aclrtFree(d_f32_v3); + aclrtFree(d_f16_v1); + aclrtFree(d_f16_v2); + aclrtFree(d_f16_v3); + aclrtFree(d_bf16_v1); + aclrtFree(d_bf16_v2); + aclrtFree(d_bf16_v3); + aclrtFree(d_f32_exceptional_v1); + aclrtFree(d_f32_exceptional_v2); + aclrtFree(d_f32_exceptional_v3); + aclrtFree(d_i16_signed_v1); + aclrtFree(d_i16_signed_v2); + aclrtFree(d_i16_signed_v3); + aclrtFree(d_i16_unsigned_v1); + aclrtFree(d_i16_unsigned_v2); + aclrtFree(d_i16_unsigned_v3); + aclrtFree(d_tail_v1); + aclrtFree(d_tail_v2); + aclrtFree(d_tail_v3); + aclrtFreeHost(h_f32_v1); + aclrtFreeHost(h_f32_v2); + aclrtFreeHost(h_f32_v3); + aclrtFreeHost(h_f16_v1); + aclrtFreeHost(h_f16_v2); + aclrtFreeHost(h_f16_v3); + aclrtFreeHost(h_bf16_v1); + aclrtFreeHost(h_bf16_v2); + aclrtFreeHost(h_bf16_v3); + aclrtFreeHost(h_f32_exceptional_v1); + aclrtFreeHost(h_f32_exceptional_v2); + aclrtFreeHost(h_f32_exceptional_v3); + aclrtFreeHost(h_i16_signed_v1); + aclrtFreeHost(h_i16_signed_v2); + aclrtFreeHost(h_i16_signed_v3); + aclrtFreeHost(h_i16_unsigned_v1); + aclrtFreeHost(h_i16_unsigned_v2); + aclrtFreeHost(h_i16_unsigned_v3); + aclrtFreeHost(h_tail_v1); + aclrtFreeHost(h_tail_v2); + aclrtFreeHost(h_tail_v3); + if(stream) aclrtDestroyStream(stream); + if(deviceSet) aclrtResetDevice(deviceId); + if(aclInited) aclFinalize(); return 
rc; } diff --git a/test/vpto/cases/micro-op/binary-vector/vmul-tail/compare.py b/test/vpto/cases/micro-op/binary-vector/vmul-tail/compare.py deleted file mode 100644 index c95419953..000000000 --- a/test/vpto/cases/micro-op/binary-vector/vmul-tail/compare.py +++ /dev/null @@ -1,39 +0,0 @@ -#!/usr/bin/python3 -# Copyright (c) 2026 Huawei Technologies Co., Ltd. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. - - -import os -import sys -import numpy as np - - -def compare_bin_prefix(golden_path, output_path, dtype, eps, count): - if not os.path.exists(golden_path) or not os.path.exists(output_path): - return False - golden = np.fromfile(golden_path, dtype=dtype, count=count) - output = np.fromfile(output_path, dtype=dtype, count=count) - return golden.shape == output.shape and np.allclose( - golden, output, atol=eps, rtol=eps, equal_nan=True - ) - - -def main(): - strict = os.getenv("COMPARE_STRICT", "1") != "0" - ok = compare_bin_prefix("golden_v3.bin", "v3.bin", np.float32, 1e-4, 1000) - if not ok: - if strict: - print("[ERROR] compare failed") - sys.exit(2) - print("[WARN] compare failed (non-gating)") - return - print("[INFO] compare passed") - - -if __name__ == "__main__": - main() diff --git a/test/vpto/cases/micro-op/binary-vector/vmul-tail/golden.py b/test/vpto/cases/micro-op/binary-vector/vmul-tail/golden.py deleted file mode 100644 index 553faae15..000000000 --- a/test/vpto/cases/micro-op/binary-vector/vmul-tail/golden.py +++ 
/dev/null @@ -1,51 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) 2026 Huawei Technologies Co., Ltd. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. - -# coding=utf-8 - -import argparse -from pathlib import Path - -import numpy as np - - -ROWS = 32 -COLS = 32 -SEED = 19 -LOGICAL_ELEMS = 1000 -OUT_SENTINEL = np.float32(-123.25) - - -def generate(output_dir: Path, seed: int) -> None: - rng = np.random.default_rng(seed) - v1 = rng.random((ROWS, COLS), dtype=np.float32) - v2 = rng.random((ROWS, COLS), dtype=np.float32) - v3 = np.full((ROWS, COLS), OUT_SENTINEL, dtype=np.float32) - golden_v3 = np.full((ROWS, COLS), OUT_SENTINEL, dtype=np.float32) - golden_v3.reshape(-1)[:LOGICAL_ELEMS] = ( - v1.reshape(-1)[:LOGICAL_ELEMS] * v2.reshape(-1)[:LOGICAL_ELEMS] - ).astype(np.float32, copy=False) - - output_dir.mkdir(parents=True, exist_ok=True) - v1.reshape(-1).tofile(output_dir / "v1.bin") - v2.reshape(-1).tofile(output_dir / "v2.bin") - v3.reshape(-1).tofile(output_dir / "v3.bin") - golden_v3.reshape(-1).tofile(output_dir / "golden_v3.bin") - - -def main() -> None: - parser = argparse.ArgumentParser() - parser.add_argument("--output-dir", type=Path, default=Path(".")) - parser.add_argument("--seed", type=int, default=SEED) - args = parser.parse_args() - generate(args.output_dir, args.seed) - - -if __name__ == "__main__": - main() diff --git a/test/vpto/cases/micro-op/binary-vector/vmul-tail/kernel.pto 
b/test/vpto/cases/micro-op/binary-vector/vmul-tail/kernel.pto deleted file mode 100644 index 68675fb11..000000000 --- a/test/vpto/cases/micro-op/binary-vector/vmul-tail/kernel.pto +++ /dev/null @@ -1,49 +0,0 @@ -module attributes {pto.target_arch = "a5", pto.kernel_kind = #pto.kernel_kind} { - func.func @vmul_tail_kernel_2d(%arg0: !pto.ptr, %arg1: !pto.ptr, - %arg2: !pto.ptr) attributes {pto.kernel} { - %c0 = arith.constant 0 : index - %c64 = arith.constant 64 : index - %c1024 = arith.constant 1024 : index - %c0_i64 = arith.constant 0 : i64 - %c1_i64 = arith.constant 1 : i64 - %c32_i64 = arith.constant 32 : i64 - %c128_i64 = arith.constant 128 : i64 - %c4096_i64 = arith.constant 4096 : i64 - %c8192_i64 = arith.constant 8192 : i64 - %c1000_i32 = arith.constant 1000 : i32 - - %ub_lhs = pto.castptr %c0_i64 : i64 -> !pto.ptr - %ub_rhs = pto.castptr %c4096_i64 : i64 -> !pto.ptr - %ub_out = pto.castptr %c8192_i64 : i64 -> !pto.ptr - - %false = arith.constant false - pto.mte_gm_ub %arg0, %ub_lhs, %c0_i64, %c128_i64 - nburst(%c32_i64, %c128_i64, %c128_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 - pto.mte_gm_ub %arg1, %ub_rhs, %c0_i64, %c128_i64 - nburst(%c32_i64, %c128_i64, %c128_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 - - pto.set_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] - pto.wait_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] - - pto.vecscope { - %_:1 = scf.for %offset = %c0 to %c1024 step %c64 iter_args(%remaining = %c1000_i32) -> (i32) { - %mask, %next_remaining = pto.plt_b32 %remaining : i32 -> !pto.mask, i32 - %lhs = pto.vlds %ub_lhs[%offset] : !pto.ptr -> !pto.vreg<64xf32> - %rhs = pto.vlds %ub_rhs[%offset] : !pto.ptr -> !pto.vreg<64xf32> - %prod = pto.vmul %lhs, %rhs, %mask : !pto.vreg<64xf32>, !pto.vreg<64xf32>, !pto.mask -> !pto.vreg<64xf32> - pto.vsts %prod, %ub_out[%offset], %mask : !pto.vreg<64xf32>, !pto.ptr, !pto.mask - scf.yield %next_remaining : i32 - } - } - - pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.wait_flag["PIPE_V", 
"PIPE_MTE3", "EVENT_ID0"] - pto.mte_ub_gm %ub_out, %arg2, %c128_i64 - nburst(%c32_i64, %c128_i64, %c128_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64 - pto.barrier #pto.pipe - return - } -} diff --git a/test/vpto/cases/micro-op/binary-vector/vmul-tail/launch.cpp b/test/vpto/cases/micro-op/binary-vector/vmul-tail/launch.cpp deleted file mode 100644 index 6e1ab54ae..000000000 --- a/test/vpto/cases/micro-op/binary-vector/vmul-tail/launch.cpp +++ /dev/null @@ -1,46 +0,0 @@ -// Copyright (c) 2026 Huawei Technologies Co., Ltd. -// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). -// Please refer to the License for details. You may not use this file except in compliance with the License. -// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -// See LICENSE in the root of the software repository for the full text of the License. 
- -#ifndef __VEC_SCOPE__ -#define __VEC_SCOPE__ -#endif - -#if defined(__CCE_AICORE__) && defined(__NPU_ARCH__) && (__NPU_ARCH__ == 2201) -typedef struct { unsigned char v; } hifloat8_t; -typedef struct { unsigned char v; } float8_e4m3_t; -typedef struct { unsigned char v; } float8_e5m2_t; -typedef struct { unsigned char v; } float8_e8m0_t; -typedef struct { unsigned char v; } float4_e1m2x2_t; -typedef struct { unsigned char v; } float4_e2m1x2_t; -#endif -#include -#if defined(__CCE_AICORE__) && defined(PTOAS_ENABLE_CCE_PRINT) -#include -#endif - -#if !defined(__CCE_AICORE__) && !defined(TMRGSORT_HPP) -struct MrgSortExecutedNumList { - uint16_t mrgSortList0; - uint16_t mrgSortList1; - uint16_t mrgSortList2; - uint16_t mrgSortList3; -}; -#endif -#ifndef __CPU_SIM -#include "acl/acl.h" -#endif - -extern "C" __global__ [aicore] void vmul_tail_kernel_2d(__gm__ float *v1, - __gm__ float *v2, - __gm__ float *v3); - -void LaunchVadd_tail_kernel_2d(float *v1, float *v2, float *v3, void *stream) { - vmul_tail_kernel_2d<<<1, nullptr, stream>>>((__gm__ float *)v1, - (__gm__ float *)v2, - (__gm__ float *)v3); -} diff --git a/test/vpto/cases/micro-op/binary-vector/vmul-tail/main.cpp b/test/vpto/cases/micro-op/binary-vector/vmul-tail/main.cpp deleted file mode 100644 index 40a9881d6..000000000 --- a/test/vpto/cases/micro-op/binary-vector/vmul-tail/main.cpp +++ /dev/null @@ -1,94 +0,0 @@ -// Copyright (c) 2026 Huawei Technologies Co., Ltd. -// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). -// Please refer to the License for details. You may not use this file except in compliance with the License. -// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. 
-// See LICENSE in the root of the software repository for the full text of the License. - -/** -Copyright (c) 2025 Huawei Technologies Co., Ltd. -*/ - -#include "test_common.h" -#include "acl/acl.h" -#include -#include - -using namespace PtoTestCommon; - -#define ACL_CHECK(expr) \ - do { \ - const aclError _ret = (expr); \ - if (_ret != ACL_SUCCESS) { \ - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", #expr, \ - (int)_ret, __FILE__, __LINE__); \ - rc = 1; \ - goto cleanup; \ - } \ - } while (0) - -void LaunchVadd_tail_kernel_2d(float *v1, float *v2, float *v3, void *stream); - -int main() { - size_t elemCount_v1 = 1024; - size_t fileSize_v1 = elemCount_v1 * sizeof(float); - size_t elemCount_v2 = 1024; - size_t fileSize_v2 = elemCount_v2 * sizeof(float); - size_t elemCount_v3 = 1024; - size_t fileSize_v3 = elemCount_v3 * sizeof(float); - float *v1Host = nullptr; - float *v1Device = nullptr; - float *v2Host = nullptr; - float *v2Device = nullptr; - float *v3Host = nullptr; - float *v3Device = nullptr; - int rc = 0; - bool aclInited = false; - bool deviceSet = false; - int deviceId = 0; - aclrtStream stream = nullptr; - - ACL_CHECK(aclInit(nullptr)); - aclInited = true; - if (const char *envDevice = std::getenv("ACL_DEVICE_ID")) - deviceId = std::atoi(envDevice); - ACL_CHECK(aclrtSetDevice(deviceId)); - deviceSet = true; - ACL_CHECK(aclrtCreateStream(&stream)); - ACL_CHECK(aclrtMallocHost((void **)(&v1Host), fileSize_v1)); - ACL_CHECK(aclrtMallocHost((void **)(&v2Host), fileSize_v2)); - ACL_CHECK(aclrtMallocHost((void **)(&v3Host), fileSize_v3)); - ACL_CHECK(aclrtMalloc((void **)&v1Device, fileSize_v1, ACL_MEM_MALLOC_HUGE_FIRST)); - ACL_CHECK(aclrtMalloc((void **)&v2Device, fileSize_v2, ACL_MEM_MALLOC_HUGE_FIRST)); - ACL_CHECK(aclrtMalloc((void **)&v3Device, fileSize_v3, ACL_MEM_MALLOC_HUGE_FIRST)); - ReadFile("./v1.bin", fileSize_v1, v1Host, fileSize_v1); - ReadFile("./v2.bin", fileSize_v2, v2Host, fileSize_v2); - ReadFile("./v3.bin", fileSize_v3, v3Host, 
fileSize_v3); - ACL_CHECK(aclrtMemcpy(v1Device, fileSize_v1, v1Host, fileSize_v1, - ACL_MEMCPY_HOST_TO_DEVICE)); - ACL_CHECK(aclrtMemcpy(v2Device, fileSize_v2, v2Host, fileSize_v2, - ACL_MEMCPY_HOST_TO_DEVICE)); - ACL_CHECK(aclrtMemcpy(v3Device, fileSize_v3, v3Host, fileSize_v3, - ACL_MEMCPY_HOST_TO_DEVICE)); - LaunchVadd_tail_kernel_2d(v1Device, v2Device, v3Device, stream); - ACL_CHECK(aclrtSynchronizeStream(stream)); - ACL_CHECK(aclrtMemcpy(v3Host, fileSize_v3, v3Device, fileSize_v3, - ACL_MEMCPY_DEVICE_TO_HOST)); - WriteFile("./v3.bin", v3Host, fileSize_v3); - -cleanup: - aclrtFree(v1Device); - aclrtFree(v2Device); - aclrtFree(v3Device); - aclrtFreeHost(v1Host); - aclrtFreeHost(v2Host); - aclrtFreeHost(v3Host); - if (stream != nullptr) - aclrtDestroyStream(stream); - if (deviceSet) - aclrtResetDevice(deviceId); - if (aclInited) - aclFinalize(); - return rc; -} diff --git a/test/vpto/cases/micro-op/binary-vector/vmul/compare.py b/test/vpto/cases/micro-op/binary-vector/vmul/compare.py deleted file mode 100644 index a5f14dabc..000000000 --- a/test/vpto/cases/micro-op/binary-vector/vmul/compare.py +++ /dev/null @@ -1,37 +0,0 @@ -#!/usr/bin/python3 -# Copyright (c) 2026 Huawei Technologies Co., Ltd. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. 
- - -import os -import sys -import numpy as np - - -def compare_bin(golden_path, output_path, dtype, eps): - if not os.path.exists(golden_path) or not os.path.exists(output_path): - return False - golden = np.fromfile(golden_path, dtype=dtype) - output = np.fromfile(output_path, dtype=dtype) - return golden.shape == output.shape and np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True) - - -def main(): - strict = os.getenv("COMPARE_STRICT", "1") != "0" - ok = compare_bin("golden_v3.bin", "v3.bin", np.float32, 1e-4) - if not ok: - if strict: - print("[ERROR] compare failed") - sys.exit(2) - print("[WARN] compare failed (non-gating)") - return - print("[INFO] compare passed") - - -if __name__ == "__main__": - main() diff --git a/test/vpto/cases/micro-op/binary-vector/vmul/golden.py b/test/vpto/cases/micro-op/binary-vector/vmul/golden.py deleted file mode 100644 index 23e4731e7..000000000 --- a/test/vpto/cases/micro-op/binary-vector/vmul/golden.py +++ /dev/null @@ -1,46 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) 2026 Huawei Technologies Co., Ltd. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. 
- -# coding=utf-8 - -import argparse -from pathlib import Path - -import numpy as np - - -ROWS = 32 -COLS = 32 -SEED = 19 - - -def generate(output_dir: Path, seed: int) -> None: - rng = np.random.default_rng(seed) - v1 = rng.uniform(-4.0, 4.0, size=(ROWS, COLS)).astype(np.float32) - v2 = rng.uniform(-4.0, 4.0, size=(ROWS, COLS)).astype(np.float32) - golden_v3 = (v1 * v2).astype(np.float32, copy=False) - v3 = np.zeros((ROWS, COLS), dtype=np.float32) - - output_dir.mkdir(parents=True, exist_ok=True) - v1.reshape(-1).tofile(output_dir / "v1.bin") - v2.reshape(-1).tofile(output_dir / "v2.bin") - v3.reshape(-1).tofile(output_dir / "v3.bin") - golden_v3.reshape(-1).tofile(output_dir / "golden_v3.bin") - - -def main() -> None: - parser = argparse.ArgumentParser() - parser.add_argument("--output-dir", type=Path, default=Path(".")) - parser.add_argument("--seed", type=int, default=SEED) - args = parser.parse_args() - generate(args.output_dir, args.seed) - - -if __name__ == "__main__": - main() diff --git a/test/vpto/cases/micro-op/binary-vector/vmul/kernel.pto b/test/vpto/cases/micro-op/binary-vector/vmul/kernel.pto deleted file mode 100644 index f42d8a759..000000000 --- a/test/vpto/cases/micro-op/binary-vector/vmul/kernel.pto +++ /dev/null @@ -1,50 +0,0 @@ -module attributes {pto.target_arch = "a5", pto.kernel_kind = #pto.kernel_kind} { - func.func @mul_kernel_2d(%arg0: !pto.ptr, %arg1: !pto.ptr, - %arg2: !pto.ptr) attributes {pto.kernel} { - %c0 = arith.constant 0 : index - %c1 = arith.constant 1 : index - %c64 = arith.constant 64 : index - %c1024 = arith.constant 1024 : index - %c0_i64 = arith.constant 0 : i64 - %c1_i64 = arith.constant 1 : i64 - %c32_i64 = arith.constant 32 : i64 - %c128_i64 = arith.constant 128 : i64 - %c4096_i64 = arith.constant 4096 : i64 - %c8192_i64 = arith.constant 8192 : i64 - %c1024_i32 = arith.constant 1024 : i32 - - %ub_lhs = pto.castptr %c0_i64 : i64 -> !pto.ptr - %ub_rhs = pto.castptr %c4096_i64 : i64 -> !pto.ptr - %ub_out = pto.castptr 
%c8192_i64 : i64 -> !pto.ptr - - %false = arith.constant false - pto.mte_gm_ub %arg0, %ub_lhs, %c0_i64, %c128_i64 - nburst(%c32_i64, %c128_i64, %c128_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 - pto.mte_gm_ub %arg1, %ub_rhs, %c0_i64, %c128_i64 - nburst(%c32_i64, %c128_i64, %c128_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 - - pto.set_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] - pto.wait_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] - - pto.vecscope { - %_:1 = scf.for %offset = %c0 to %c1024 step %c64 iter_args(%remaining = %c1024_i32) -> (i32) { - %mask, %next_remaining = pto.plt_b32 %remaining : i32 -> !pto.mask, i32 - %lhs = pto.vlds %ub_lhs[%offset] : !pto.ptr -> !pto.vreg<64xf32> - %rhs = pto.vlds %ub_rhs[%offset] : !pto.ptr -> !pto.vreg<64xf32> - %prod = pto.vmul %lhs, %rhs, %mask : !pto.vreg<64xf32>, !pto.vreg<64xf32>, !pto.mask -> !pto.vreg<64xf32> - pto.vsts %prod, %ub_out[%offset], %mask : !pto.vreg<64xf32>, !pto.ptr, !pto.mask - scf.yield %next_remaining : i32 - } - } - - pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.mte_ub_gm %ub_out, %arg2, %c128_i64 - nburst(%c32_i64, %c128_i64, %c128_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64 - pto.barrier #pto.pipe - return - } -} diff --git a/test/vpto/cases/micro-op/binary-vector/vmul/launch.cpp b/test/vpto/cases/micro-op/binary-vector/vmul/launch.cpp deleted file mode 100644 index 21ee4384c..000000000 --- a/test/vpto/cases/micro-op/binary-vector/vmul/launch.cpp +++ /dev/null @@ -1,46 +0,0 @@ -// Copyright (c) 2026 Huawei Technologies Co., Ltd. -// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). -// Please refer to the License for details. You may not use this file except in compliance with the License. 
-// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -// See LICENSE in the root of the software repository for the full text of the License. - -#ifndef __VEC_SCOPE__ -#define __VEC_SCOPE__ -#endif - -#if defined(__CCE_AICORE__) && defined(__NPU_ARCH__) && (__NPU_ARCH__ == 2201) -typedef struct { unsigned char v; } hifloat8_t; -typedef struct { unsigned char v; } float8_e4m3_t; -typedef struct { unsigned char v; } float8_e5m2_t; -typedef struct { unsigned char v; } float8_e8m0_t; -typedef struct { unsigned char v; } float4_e1m2x2_t; -typedef struct { unsigned char v; } float4_e2m1x2_t; -#endif -#include -#if defined(__CCE_AICORE__) && defined(PTOAS_ENABLE_CCE_PRINT) -#include -#endif - -#if !defined(__CCE_AICORE__) && !defined(TMRGSORT_HPP) -struct MrgSortExecutedNumList { - uint16_t mrgSortList0; - uint16_t mrgSortList1; - uint16_t mrgSortList2; - uint16_t mrgSortList3; -}; -#endif -#ifndef __CPU_SIM -#include "acl/acl.h" -#endif - -extern "C" __global__ [aicore] void mul_kernel_2d(__gm__ float *v1, - __gm__ float *v2, - __gm__ float *v3); - -void LaunchMul_kernel_2d(float *v1, float *v2, float *v3, void *stream) { - mul_kernel_2d<<<1, nullptr, stream>>>((__gm__ float *)v1, - (__gm__ float *)v2, - (__gm__ float *)v3); -} diff --git a/test/vpto/cases/micro-op/binary-vector/vmul/main.cpp b/test/vpto/cases/micro-op/binary-vector/vmul/main.cpp deleted file mode 100644 index 711269796..000000000 --- a/test/vpto/cases/micro-op/binary-vector/vmul/main.cpp +++ /dev/null @@ -1,94 +0,0 @@ -// Copyright (c) 2026 Huawei Technologies Co., Ltd. -// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). -// Please refer to the License for details. 
You may not use this file except in compliance with the License. -// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -// See LICENSE in the root of the software repository for the full text of the License. - -/** -Copyright (c) 2025 Huawei Technologies Co., Ltd. -*/ - -#include "test_common.h" -#include "acl/acl.h" -#include -#include - -using namespace PtoTestCommon; - -#define ACL_CHECK(expr) \ - do { \ - const aclError _ret = (expr); \ - if (_ret != ACL_SUCCESS) { \ - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", #expr, \ - (int)_ret, __FILE__, __LINE__); \ - rc = 1; \ - goto cleanup; \ - } \ - } while (0) - -void LaunchMul_kernel_2d(float *v1, float *v2, float *v3, void *stream); - -int main() { - size_t elemCount_v1 = 1024; - size_t fileSize_v1 = elemCount_v1 * sizeof(float); - size_t elemCount_v2 = 1024; - size_t fileSize_v2 = elemCount_v2 * sizeof(float); - size_t elemCount_v3 = 1024; - size_t fileSize_v3 = elemCount_v3 * sizeof(float); - float *v1Host = nullptr; - float *v1Device = nullptr; - float *v2Host = nullptr; - float *v2Device = nullptr; - float *v3Host = nullptr; - float *v3Device = nullptr; - int rc = 0; - bool aclInited = false; - bool deviceSet = false; - int deviceId = 0; - aclrtStream stream = nullptr; - - ACL_CHECK(aclInit(nullptr)); - aclInited = true; - if (const char *envDevice = std::getenv("ACL_DEVICE_ID")) - deviceId = std::atoi(envDevice); - ACL_CHECK(aclrtSetDevice(deviceId)); - deviceSet = true; - ACL_CHECK(aclrtCreateStream(&stream)); - ACL_CHECK(aclrtMallocHost((void **)(&v1Host), fileSize_v1)); - ACL_CHECK(aclrtMallocHost((void **)(&v2Host), fileSize_v2)); - ACL_CHECK(aclrtMallocHost((void **)(&v3Host), fileSize_v3)); - ACL_CHECK(aclrtMalloc((void **)&v1Device, fileSize_v1, ACL_MEM_MALLOC_HUGE_FIRST)); - ACL_CHECK(aclrtMalloc((void **)&v2Device, fileSize_v2, 
ACL_MEM_MALLOC_HUGE_FIRST)); - ACL_CHECK(aclrtMalloc((void **)&v3Device, fileSize_v3, ACL_MEM_MALLOC_HUGE_FIRST)); - ReadFile("./v1.bin", fileSize_v1, v1Host, fileSize_v1); - ReadFile("./v2.bin", fileSize_v2, v2Host, fileSize_v2); - ReadFile("./v3.bin", fileSize_v3, v3Host, fileSize_v3); - ACL_CHECK(aclrtMemcpy(v1Device, fileSize_v1, v1Host, fileSize_v1, - ACL_MEMCPY_HOST_TO_DEVICE)); - ACL_CHECK(aclrtMemcpy(v2Device, fileSize_v2, v2Host, fileSize_v2, - ACL_MEMCPY_HOST_TO_DEVICE)); - ACL_CHECK(aclrtMemcpy(v3Device, fileSize_v3, v3Host, fileSize_v3, - ACL_MEMCPY_HOST_TO_DEVICE)); - LaunchMul_kernel_2d(v1Device, v2Device, v3Device, stream); - ACL_CHECK(aclrtSynchronizeStream(stream)); - ACL_CHECK(aclrtMemcpy(v3Host, fileSize_v3, v3Device, fileSize_v3, - ACL_MEMCPY_DEVICE_TO_HOST)); - WriteFile("./v3.bin", v3Host, fileSize_v3); - -cleanup: - aclrtFree(v1Device); - aclrtFree(v2Device); - aclrtFree(v3Device); - aclrtFreeHost(v1Host); - aclrtFreeHost(v2Host); - aclrtFreeHost(v3Host); - if (stream != nullptr) - aclrtDestroyStream(stream); - if (deviceSet) - aclrtResetDevice(deviceId); - if (aclInited) - aclFinalize(); - return rc; -} diff --git a/test/vpto/cases/micro-op/binary-vector/vor-f16/compare.py b/test/vpto/cases/micro-op/binary-vector/vor-f16/compare.py deleted file mode 100755 index 78bd43ef6..000000000 --- a/test/vpto/cases/micro-op/binary-vector/vor-f16/compare.py +++ /dev/null @@ -1,44 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) 2026 Huawei Technologies Co., Ltd. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. 
-# See LICENSE in the root of the software repository for the full text of the License. - -# case: micro-op/binary-vector/vor-f16 -# family: binary-vector -# target_ops: pto.vor -# scenarios: core-f16, full-mask -# NOTE: bulk-generated coverage skeleton. - -import os -import sys -import numpy as np - - -def compare_bin_prefix(golden_path, output_path, dtype, eps, count): - if not os.path.exists(golden_path) or not os.path.exists(output_path): - return False - golden = np.fromfile(golden_path, dtype=dtype, count=count) - output = np.fromfile(output_path, dtype=dtype, count=count) - return golden.shape == output.shape and np.allclose( - golden, output, atol=eps, rtol=eps, equal_nan=True - ) - - -def main(): - strict = os.getenv("COMPARE_STRICT", "1") != "0" - ok = compare_bin_prefix("golden_v3.bin", "v3.bin", np.uint16, 0, 1024) - if not ok: - if strict: - print("[ERROR] compare failed") - sys.exit(2) - print("[WARN] compare failed (non-gating)") - return - print("[INFO] compare passed") - - -if __name__ == "__main__": - main() diff --git a/test/vpto/cases/micro-op/binary-vector/vor-f16/golden.py b/test/vpto/cases/micro-op/binary-vector/vor-f16/golden.py deleted file mode 100755 index 471da5094..000000000 --- a/test/vpto/cases/micro-op/binary-vector/vor-f16/golden.py +++ /dev/null @@ -1,60 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) 2026 Huawei Technologies Co., Ltd. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. 
- -# case: micro-op/binary-vector/vor-f16 -# family: binary-vector -# target_ops: pto.vor -# scenarios: core-f16, full-mask -# NOTE: bulk-generated coverage skeleton. -# coding=utf-8 - -import argparse -from pathlib import Path - -import numpy as np - - -ELEMS = 1024 -SEED = 19 - - -def generate(output_dir: Path, seed: int) -> None: - rng = np.random.default_rng(seed) - bits1 = rng.integers(0, 0x10000, size=ELEMS, dtype=np.uint16) - bits2 = rng.integers(0, 0x10000, size=ELEMS, dtype=np.uint16) - bits1[:8] = np.array( - [0x0000, 0x8000, 0x3c00, 0xbc00, 0x7c00, 0xfc00, 0x7e00, 0x3555], - dtype=np.uint16, - ) - bits2[:8] = np.array( - [0x0001, 0x0001, 0x4000, 0x2000, 0x0001, 0x0001, 0x0100, 0x0aaa], - dtype=np.uint16, - ) - v1 = bits1.view(np.float16) - v2 = bits2.view(np.float16) - v3 = np.zeros(ELEMS, dtype=np.float16) - golden_v3 = np.bitwise_or(bits1, bits2).view(np.float16) - - output_dir.mkdir(parents=True, exist_ok=True) - v1.tofile(output_dir / "v1.bin") - v2.tofile(output_dir / "v2.bin") - v3.tofile(output_dir / "v3.bin") - golden_v3.tofile(output_dir / "golden_v3.bin") - - -def main() -> None: - parser = argparse.ArgumentParser() - parser.add_argument("--output-dir", type=Path, default=Path(".")) - parser.add_argument("--seed", type=int, default=SEED) - args = parser.parse_args() - generate(args.output_dir, args.seed) - - -if __name__ == "__main__": - main() diff --git a/test/vpto/cases/micro-op/binary-vector/vor-f16/kernel.pto b/test/vpto/cases/micro-op/binary-vector/vor-f16/kernel.pto deleted file mode 100644 index d5b6351d9..000000000 --- a/test/vpto/cases/micro-op/binary-vector/vor-f16/kernel.pto +++ /dev/null @@ -1,55 +0,0 @@ -// ----------------------------------------------------------------------------- -// case: micro-op/binary-vector/vor-f16 -// family: binary-vector -// target_ops: pto.vor -// scenarios: core-f16, full-mask -// NOTE: bulk-generated coverage skeleton. 
Parser/verifier/lowering failure is -// still a valid test conclusion in the current coverage-first phase. -// ----------------------------------------------------------------------------- -module attributes {pto.target_arch = "a5", pto.kernel_kind = #pto.kernel_kind} { - func.func @vor_f16_kernel(%arg0: !pto.ptr, %arg1: !pto.ptr, - %arg2: !pto.ptr) attributes {pto.kernel} { - %c0 = arith.constant 0 : index - %c1024 = arith.constant 1024 : index - %c128 = arith.constant 128 : index - %c0_i64 = arith.constant 0 : i64 - %c1_i64 = arith.constant 1 : i64 - %c32_i64 = arith.constant 32 : i64 - %c64_i64 = arith.constant 64 : i64 - %c2048_i64 = arith.constant 2048 : i64 - %c4096_i64 = arith.constant 4096 : i64 - - %ub_lhs = pto.castptr %c0_i64 : i64 -> !pto.ptr - %ub_rhs = pto.castptr %c2048_i64 : i64 -> !pto.ptr - %ub_out = pto.castptr %c4096_i64 : i64 -> !pto.ptr - - %false = arith.constant false - pto.mte_gm_ub %arg0, %ub_lhs, %c0_i64, %c64_i64 - nburst(%c32_i64, %c64_i64, %c64_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 - pto.mte_gm_ub %arg1, %ub_rhs, %c0_i64, %c64_i64 - nburst(%c32_i64, %c64_i64, %c64_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 - - pto.set_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] - pto.wait_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] - - pto.vecscope { - %mask = pto.pset_b16 "PAT_ALL" : !pto.mask - scf.for %offset = %c0 to %c1024 step %c128 { - %lhs = pto.vlds %ub_lhs[%offset] : !pto.ptr -> !pto.vreg<128xf16> - %rhs = pto.vlds %ub_rhs[%offset] : !pto.ptr -> !pto.vreg<128xf16> - %out = pto.vor %lhs, %rhs, %mask : !pto.vreg<128xf16>, !pto.vreg<128xf16>, !pto.mask -> !pto.vreg<128xf16> - pto.vsts %out, %ub_out[%offset], %mask : !pto.vreg<128xf16>, !pto.ptr, !pto.mask - } - } - - pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.mte_ub_gm %ub_out, %arg2, %c64_i64 - nburst(%c32_i64, %c64_i64, %c64_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64 - pto.barrier #pto.pipe - return - } -} 
diff --git a/test/vpto/cases/micro-op/binary-vector/vor-f16/launch.cpp b/test/vpto/cases/micro-op/binary-vector/vor-f16/launch.cpp deleted file mode 100644 index 45b41406e..000000000 --- a/test/vpto/cases/micro-op/binary-vector/vor-f16/launch.cpp +++ /dev/null @@ -1,54 +0,0 @@ -// Copyright (c) 2026 Huawei Technologies Co., Ltd. -// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). -// Please refer to the License for details. You may not use this file except in compliance with the License. -// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -// See LICENSE in the root of the software repository for the full text of the License. - -// ----------------------------------------------------------------------------- -// case: micro-op/binary-vector/vor-f16 -// family: binary-vector -// target_ops: pto.vor -// scenarios: core-f16, full-mask -// NOTE: bulk-generated coverage skeleton. Parser/verifier/lowering failure is -// still a valid test conclusion in the current coverage-first phase. 
-// ----------------------------------------------------------------------------- -#ifndef __VEC_SCOPE__ -#define __VEC_SCOPE__ -#endif - -#if defined(__CCE_AICORE__) && defined(__NPU_ARCH__) && (__NPU_ARCH__ == 2201) -typedef struct { unsigned char v; } hifloat8_t; -typedef struct { unsigned char v; } float8_e4m3_t; -typedef struct { unsigned char v; } float8_e5m2_t; -typedef struct { unsigned char v; } float8_e8m0_t; -typedef struct { unsigned char v; } float4_e1m2x2_t; -typedef struct { unsigned char v; } float4_e2m1x2_t; -#endif -#include -#if defined(__CCE_AICORE__) && defined(PTOAS_ENABLE_CCE_PRINT) -#include -#endif - -#if !defined(__CCE_AICORE__) && !defined(TMRGSORT_HPP) -struct MrgSortExecutedNumList { - uint16_t mrgSortList0; - uint16_t mrgSortList1; - uint16_t mrgSortList2; - uint16_t mrgSortList3; -}; -#endif -#ifndef __CPU_SIM -#include "acl/acl.h" -#endif - -extern "C" __global__ [aicore] void vor_f16_kernel(__gm__ half *v1, - __gm__ half *v2, - __gm__ half *v3); - -void LaunchVor_f16_kernel(uint16_t *v1, uint16_t *v2, uint16_t *v3, - void *stream) { - vor_f16_kernel<<<1, nullptr, stream>>>((__gm__ half *)v1, (__gm__ half *)v2, - (__gm__ half *)v3); -} diff --git a/test/vpto/cases/micro-op/binary-vector/vor-f16/main.cpp b/test/vpto/cases/micro-op/binary-vector/vor-f16/main.cpp deleted file mode 100644 index 826735922..000000000 --- a/test/vpto/cases/micro-op/binary-vector/vor-f16/main.cpp +++ /dev/null @@ -1,103 +0,0 @@ -// Copyright (c) 2026 Huawei Technologies Co., Ltd. -// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). -// Please refer to the License for details. You may not use this file except in compliance with the License. 
-// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -// See LICENSE in the root of the software repository for the full text of the License. - -// ----------------------------------------------------------------------------- -// case: micro-op/binary-vector/vor-f16 -// family: binary-vector -// target_ops: pto.vor -// scenarios: core-f16, full-mask -// NOTE: bulk-generated coverage skeleton. Parser/verifier/lowering failure is -// still a valid test conclusion in the current coverage-first phase. -// ----------------------------------------------------------------------------- -/** -Copyright (c) 2025 Huawei Technologies Co., Ltd. -*/ - -#include "test_common.h" -#include "acl/acl.h" -#include -#include - -using namespace PtoTestCommon; - -#define ACL_CHECK(expr) \ - do { \ - const aclError _ret = (expr); \ - if (_ret != ACL_SUCCESS) { \ - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", #expr, \ - (int)_ret, __FILE__, __LINE__); \ - rc = 1; \ - goto cleanup; \ - } \ - } while (0) - -void LaunchVor_f16_kernel(uint16_t *v1, uint16_t *v2, uint16_t *v3, - void *stream); - -int main() { - size_t elemCount_v1 = 1024; - size_t fileSize_v1 = elemCount_v1 * sizeof(uint16_t); - size_t elemCount_v2 = 1024; - size_t fileSize_v2 = elemCount_v2 * sizeof(uint16_t); - size_t elemCount_v3 = 1024; - size_t fileSize_v3 = elemCount_v3 * sizeof(uint16_t); - uint16_t *v1Host = nullptr; - uint16_t *v1Device = nullptr; - uint16_t *v2Host = nullptr; - uint16_t *v2Device = nullptr; - uint16_t *v3Host = nullptr; - uint16_t *v3Device = nullptr; - int rc = 0; - bool aclInited = false; - bool deviceSet = false; - int deviceId = 0; - aclrtStream stream = nullptr; - - ACL_CHECK(aclInit(nullptr)); - aclInited = true; - if (const char *envDevice = std::getenv("ACL_DEVICE_ID")) - deviceId = std::atoi(envDevice); - 
ACL_CHECK(aclrtSetDevice(deviceId)); - deviceSet = true; - ACL_CHECK(aclrtCreateStream(&stream)); - ACL_CHECK(aclrtMallocHost((void **)(&v1Host), fileSize_v1)); - ACL_CHECK(aclrtMallocHost((void **)(&v2Host), fileSize_v2)); - ACL_CHECK(aclrtMallocHost((void **)(&v3Host), fileSize_v3)); - ACL_CHECK(aclrtMalloc((void **)&v1Device, fileSize_v1, ACL_MEM_MALLOC_HUGE_FIRST)); - ACL_CHECK(aclrtMalloc((void **)&v2Device, fileSize_v2, ACL_MEM_MALLOC_HUGE_FIRST)); - ACL_CHECK(aclrtMalloc((void **)&v3Device, fileSize_v3, ACL_MEM_MALLOC_HUGE_FIRST)); - ReadFile("./v1.bin", fileSize_v1, v1Host, fileSize_v1); - ReadFile("./v2.bin", fileSize_v2, v2Host, fileSize_v2); - ReadFile("./v3.bin", fileSize_v3, v3Host, fileSize_v3); - ACL_CHECK(aclrtMemcpy(v1Device, fileSize_v1, v1Host, fileSize_v1, - ACL_MEMCPY_HOST_TO_DEVICE)); - ACL_CHECK(aclrtMemcpy(v2Device, fileSize_v2, v2Host, fileSize_v2, - ACL_MEMCPY_HOST_TO_DEVICE)); - ACL_CHECK(aclrtMemcpy(v3Device, fileSize_v3, v3Host, fileSize_v3, - ACL_MEMCPY_HOST_TO_DEVICE)); - LaunchVor_f16_kernel(v1Device, v2Device, v3Device, stream); - ACL_CHECK(aclrtSynchronizeStream(stream)); - ACL_CHECK(aclrtMemcpy(v3Host, fileSize_v3, v3Device, fileSize_v3, - ACL_MEMCPY_DEVICE_TO_HOST)); - WriteFile("./v3.bin", v3Host, fileSize_v3); - -cleanup: - aclrtFree(v1Device); - aclrtFree(v2Device); - aclrtFree(v3Device); - aclrtFreeHost(v1Host); - aclrtFreeHost(v2Host); - aclrtFreeHost(v3Host); - if (stream != nullptr) - aclrtDestroyStream(stream); - if (deviceSet) - aclrtResetDevice(deviceId); - if (aclInited) - aclFinalize(); - return rc; -} diff --git a/test/vpto/cases/micro-op/binary-vector/vor-mask-edge/compare.py b/test/vpto/cases/micro-op/binary-vector/vor-mask-edge/compare.py deleted file mode 100755 index 58ac20c66..000000000 --- a/test/vpto/cases/micro-op/binary-vector/vor-mask-edge/compare.py +++ /dev/null @@ -1,44 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) 2026 Huawei Technologies Co., Ltd. 
-# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. - -# case: micro-op/binary-vector/vor-mask-edge -# family: binary-vector -# target_ops: pto.vor -# scenarios: core-i16-unsigned, full-mask -# NOTE: bulk-generated coverage skeleton. - -import os -import sys -import numpy as np - - -def compare_bin_prefix(golden_path, output_path, dtype, eps, count): - if not os.path.exists(golden_path) or not os.path.exists(output_path): - return False - golden = np.fromfile(golden_path, dtype=dtype, count=count) - output = np.fromfile(output_path, dtype=dtype, count=count) - return golden.shape == output.shape and np.allclose( - golden, output, atol=eps, rtol=eps, equal_nan=True - ) - - -def main(): - strict = os.getenv("COMPARE_STRICT", "1") != "0" - ok = compare_bin_prefix("golden_v3.bin", "v3.bin", np.uint16, 0, 1024) - if not ok: - if strict: - print("[ERROR] compare failed") - sys.exit(2) - print("[WARN] compare failed (non-gating)") - return - print("[INFO] compare passed") - - -if __name__ == "__main__": - main() diff --git a/test/vpto/cases/micro-op/binary-vector/vor-mask-edge/golden.py b/test/vpto/cases/micro-op/binary-vector/vor-mask-edge/golden.py deleted file mode 100755 index 3c28fa036..000000000 --- a/test/vpto/cases/micro-op/binary-vector/vor-mask-edge/golden.py +++ /dev/null @@ -1,50 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) 2026 Huawei Technologies Co., Ltd. 
-# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. - -# case: micro-op/binary-vector/vor-mask-edge -# family: binary-vector -# target_ops: pto.vor -# scenarios: core-i16-unsigned, full-mask -# NOTE: bulk-generated coverage skeleton. -# coding=utf-8 - -import argparse -from pathlib import Path - -import numpy as np - - -ELEMS = 1024 -SEED = 19 - - -def generate(output_dir: Path, seed: int) -> None: - idx = np.arange(ELEMS, dtype=np.uint16) - v1 = np.where((idx & 1) == 0, np.uint16(0xAAAA), np.uint16(0x0F0F)).astype(np.uint16, copy=False) - v2 = np.where((idx & 2) == 0, np.uint16(0x5555), np.uint16(0x3333)).astype(np.uint16, copy=False) - v3 = np.zeros(ELEMS, dtype=np.uint16) - golden_v3 = np.bitwise_or(v1, v2).astype(np.uint16, copy=False) - - output_dir.mkdir(parents=True, exist_ok=True) - v1.tofile(output_dir / "v1.bin") - v2.tofile(output_dir / "v2.bin") - v3.tofile(output_dir / "v3.bin") - golden_v3.tofile(output_dir / "golden_v3.bin") - - -def main() -> None: - parser = argparse.ArgumentParser() - parser.add_argument("--output-dir", type=Path, default=Path(".")) - parser.add_argument("--seed", type=int, default=SEED) - args = parser.parse_args() - generate(args.output_dir, args.seed) - - -if __name__ == "__main__": - main() diff --git a/test/vpto/cases/micro-op/binary-vector/vor-mask-edge/kernel.pto b/test/vpto/cases/micro-op/binary-vector/vor-mask-edge/kernel.pto deleted file mode 100644 index c964b2940..000000000 --- 
a/test/vpto/cases/micro-op/binary-vector/vor-mask-edge/kernel.pto +++ /dev/null @@ -1,55 +0,0 @@ -// ----------------------------------------------------------------------------- -// case: micro-op/binary-vector/vor-mask-edge -// family: binary-vector -// target_ops: pto.vor -// scenarios: core-i16-unsigned, full-mask -// NOTE: bulk-generated coverage skeleton. Parser/verifier/lowering failure is -// still a valid test conclusion in the current coverage-first phase. -// ----------------------------------------------------------------------------- -module attributes {pto.target_arch = "a5", pto.kernel_kind = #pto.kernel_kind} { - func.func @vor_mask_edge_kernel(%arg0: !pto.ptr, %arg1: !pto.ptr, - %arg2: !pto.ptr) attributes {pto.kernel} { - %c0 = arith.constant 0 : index - %c0_i64 = arith.constant 0 : i64 - %c1_i64 = arith.constant 1 : i64 - %c32_i64 = arith.constant 32 : i64 - %c64_i64 = arith.constant 64 : i64 - %c1024 = arith.constant 1024 : index - %c128 = arith.constant 128 : index - %c2048_i64 = arith.constant 2048 : i64 - %c4096_i64 = arith.constant 4096 : i64 - - %ub_lhs = pto.castptr %c0_i64 : i64 -> !pto.ptr - %ub_rhs = pto.castptr %c2048_i64 : i64 -> !pto.ptr - %ub_out = pto.castptr %c4096_i64 : i64 -> !pto.ptr - - %false = arith.constant false - pto.mte_gm_ub %arg0, %ub_lhs, %c0_i64, %c64_i64 - nburst(%c32_i64, %c64_i64, %c64_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 - pto.mte_gm_ub %arg1, %ub_rhs, %c0_i64, %c64_i64 - nburst(%c32_i64, %c64_i64, %c64_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 - - pto.set_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] - pto.wait_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] - - pto.vecscope { - %mask = pto.pset_b16 "PAT_ALL" : !pto.mask - scf.for %offset = %c0 to %c1024 step %c128 { - %lhs = pto.vlds %ub_lhs[%offset] : !pto.ptr -> !pto.vreg<128xui16> - %rhs = pto.vlds %ub_rhs[%offset] : !pto.ptr -> !pto.vreg<128xui16> - %out = pto.vor %lhs, %rhs, %mask : !pto.vreg<128xui16>, !pto.vreg<128xui16>, !pto.mask -> 
!pto.vreg<128xui16> - pto.vsts %out, %ub_out[%offset], %mask : !pto.vreg<128xui16>, !pto.ptr, !pto.mask - } - } - - pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.mte_ub_gm %ub_out, %arg2, %c64_i64 - nburst(%c32_i64, %c64_i64, %c64_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64 - pto.barrier #pto.pipe - return - } -} diff --git a/test/vpto/cases/micro-op/binary-vector/vor-mask-edge/launch.cpp b/test/vpto/cases/micro-op/binary-vector/vor-mask-edge/launch.cpp deleted file mode 100644 index c9e9411c6..000000000 --- a/test/vpto/cases/micro-op/binary-vector/vor-mask-edge/launch.cpp +++ /dev/null @@ -1,55 +0,0 @@ -// Copyright (c) 2026 Huawei Technologies Co., Ltd. -// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). -// Please refer to the License for details. You may not use this file except in compliance with the License. -// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -// See LICENSE in the root of the software repository for the full text of the License. - -// ----------------------------------------------------------------------------- -// case: micro-op/binary-vector/vor-mask-edge -// family: binary-vector -// target_ops: pto.vor -// scenarios: core-i16-unsigned, full-mask -// NOTE: bulk-generated coverage skeleton. Parser/verifier/lowering failure is -// still a valid test conclusion in the current coverage-first phase. 
-// ----------------------------------------------------------------------------- -#ifndef __VEC_SCOPE__ -#define __VEC_SCOPE__ -#endif - -#if defined(__CCE_AICORE__) && defined(__NPU_ARCH__) && (__NPU_ARCH__ == 2201) -typedef struct { unsigned char v; } hifloat8_t; -typedef struct { unsigned char v; } float8_e4m3_t; -typedef struct { unsigned char v; } float8_e5m2_t; -typedef struct { unsigned char v; } float8_e8m0_t; -typedef struct { unsigned char v; } float4_e1m2x2_t; -typedef struct { unsigned char v; } float4_e2m1x2_t; -#endif -#include -#if defined(__CCE_AICORE__) && defined(PTOAS_ENABLE_CCE_PRINT) -#include -#endif - -#if !defined(__CCE_AICORE__) && !defined(TMRGSORT_HPP) -struct MrgSortExecutedNumList { - uint16_t mrgSortList0; - uint16_t mrgSortList1; - uint16_t mrgSortList2; - uint16_t mrgSortList3; -}; -#endif -#ifndef __CPU_SIM -#include "acl/acl.h" -#endif - -extern "C" __global__ [aicore] void vor_mask_edge_kernel(__gm__ uint16_t *v1, - __gm__ uint16_t *v2, - __gm__ uint16_t *v3); - -void LaunchVor_mask_edge_kernel(uint16_t *v1, uint16_t *v2, uint16_t *v3, - void *stream) { - vor_mask_edge_kernel<<<1, nullptr, stream>>>((__gm__ uint16_t *)v1, - (__gm__ uint16_t *)v2, - (__gm__ uint16_t *)v3); -} diff --git a/test/vpto/cases/micro-op/binary-vector/vor-mask-edge/main.cpp b/test/vpto/cases/micro-op/binary-vector/vor-mask-edge/main.cpp deleted file mode 100644 index 634ad7664..000000000 --- a/test/vpto/cases/micro-op/binary-vector/vor-mask-edge/main.cpp +++ /dev/null @@ -1,103 +0,0 @@ -// Copyright (c) 2026 Huawei Technologies Co., Ltd. -// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). -// Please refer to the License for details. You may not use this file except in compliance with the License. 
-// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -// See LICENSE in the root of the software repository for the full text of the License. - -// ----------------------------------------------------------------------------- -// case: micro-op/binary-vector/vor-mask-edge -// family: binary-vector -// target_ops: pto.vor -// scenarios: core-i16-unsigned, full-mask -// NOTE: bulk-generated coverage skeleton. Parser/verifier/lowering failure is -// still a valid test conclusion in the current coverage-first phase. -// ----------------------------------------------------------------------------- -/** -Copyright (c) 2025 Huawei Technologies Co., Ltd. -*/ - -#include "test_common.h" -#include "acl/acl.h" -#include -#include - -using namespace PtoTestCommon; - -#define ACL_CHECK(expr) \ - do { \ - const aclError _ret = (expr); \ - if (_ret != ACL_SUCCESS) { \ - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", #expr, \ - (int)_ret, __FILE__, __LINE__); \ - rc = 1; \ - goto cleanup; \ - } \ - } while (0) - -void LaunchVor_mask_edge_kernel(uint16_t *v1, uint16_t *v2, uint16_t *v3, - void *stream); - -int main() { - size_t elemCount_v1 = 1024; - size_t fileSize_v1 = elemCount_v1 * sizeof(uint16_t); - size_t elemCount_v2 = 1024; - size_t fileSize_v2 = elemCount_v2 * sizeof(uint16_t); - size_t elemCount_v3 = 1024; - size_t fileSize_v3 = elemCount_v3 * sizeof(uint16_t); - uint16_t *v1Host = nullptr; - uint16_t *v1Device = nullptr; - uint16_t *v2Host = nullptr; - uint16_t *v2Device = nullptr; - uint16_t *v3Host = nullptr; - uint16_t *v3Device = nullptr; - int rc = 0; - bool aclInited = false; - bool deviceSet = false; - int deviceId = 0; - aclrtStream stream = nullptr; - - ACL_CHECK(aclInit(nullptr)); - aclInited = true; - if (const char *envDevice = std::getenv("ACL_DEVICE_ID")) - deviceId = 
std::atoi(envDevice); - ACL_CHECK(aclrtSetDevice(deviceId)); - deviceSet = true; - ACL_CHECK(aclrtCreateStream(&stream)); - ACL_CHECK(aclrtMallocHost((void **)(&v1Host), fileSize_v1)); - ACL_CHECK(aclrtMallocHost((void **)(&v2Host), fileSize_v2)); - ACL_CHECK(aclrtMallocHost((void **)(&v3Host), fileSize_v3)); - ACL_CHECK(aclrtMalloc((void **)&v1Device, fileSize_v1, ACL_MEM_MALLOC_HUGE_FIRST)); - ACL_CHECK(aclrtMalloc((void **)&v2Device, fileSize_v2, ACL_MEM_MALLOC_HUGE_FIRST)); - ACL_CHECK(aclrtMalloc((void **)&v3Device, fileSize_v3, ACL_MEM_MALLOC_HUGE_FIRST)); - ReadFile("./v1.bin", fileSize_v1, v1Host, fileSize_v1); - ReadFile("./v2.bin", fileSize_v2, v2Host, fileSize_v2); - ReadFile("./v3.bin", fileSize_v3, v3Host, fileSize_v3); - ACL_CHECK(aclrtMemcpy(v1Device, fileSize_v1, v1Host, fileSize_v1, - ACL_MEMCPY_HOST_TO_DEVICE)); - ACL_CHECK(aclrtMemcpy(v2Device, fileSize_v2, v2Host, fileSize_v2, - ACL_MEMCPY_HOST_TO_DEVICE)); - ACL_CHECK(aclrtMemcpy(v3Device, fileSize_v3, v3Host, fileSize_v3, - ACL_MEMCPY_HOST_TO_DEVICE)); - LaunchVor_mask_edge_kernel(v1Device, v2Device, v3Device, stream); - ACL_CHECK(aclrtSynchronizeStream(stream)); - ACL_CHECK(aclrtMemcpy(v3Host, fileSize_v3, v3Device, fileSize_v3, - ACL_MEMCPY_DEVICE_TO_HOST)); - WriteFile("./v3.bin", v3Host, fileSize_v3); - -cleanup: - aclrtFree(v1Device); - aclrtFree(v2Device); - aclrtFree(v3Device); - aclrtFreeHost(v1Host); - aclrtFreeHost(v2Host); - aclrtFreeHost(v3Host); - if (stream != nullptr) - aclrtDestroyStream(stream); - if (deviceSet) - aclrtResetDevice(deviceId); - if (aclInited) - aclFinalize(); - return rc; -} diff --git a/test/vpto/cases/micro-op/binary-vector/vor/compare.py b/test/vpto/cases/micro-op/binary-vector/vor/compare.py index 8b38a30b8..9474aaa97 100755 --- a/test/vpto/cases/micro-op/binary-vector/vor/compare.py +++ b/test/vpto/cases/micro-op/binary-vector/vor/compare.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python3 +#!/usr/bin/python3 # Copyright (c) 2026 Huawei Technologies Co., Ltd. 
# This program is free software, you can redistribute it and/or modify it under the terms and conditions of # CANN Open Software License Agreement Version 2.0 (the "License"). @@ -7,38 +7,47 @@ # INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. # See LICENSE in the root of the software repository for the full text of the License. -# case: micro-op/binary-vector/vor -# family: binary-vector -# target_ops: pto.vor -# scenarios: core-i16-unsigned, full-mask -# NOTE: bulk-generated coverage skeleton. +# Merged vor test case. -import os -import sys +import os,sys import numpy as np +def _cmp(golden,output,dtype,eps,count=-1): + if not os.path.exists(golden) or not os.path.exists(output): return False + kw={} if count<0 else {"count":count} + g=np.fromfile(golden,dtype=dtype,**kw) + o=np.fromfile(output,dtype=dtype,**kw) + return g.shape==o.shape and np.allclose(g,o,atol=eps,rtol=eps,equal_nan=True) -def compare_bin_prefix(golden_path, output_path, dtype, eps, count): - if not os.path.exists(golden_path) or not os.path.exists(output_path): - return False - golden = np.fromfile(golden_path, dtype=dtype, count=count) - output = np.fromfile(output_path, dtype=dtype, count=count) - return golden.shape == output.shape and np.allclose( - golden, output, atol=eps, rtol=eps, equal_nan=True - ) - +def _cmpeq(golden,output,dtype): + if not os.path.exists(golden) or not os.path.exists(output): return False + g=np.fromfile(golden,dtype=dtype) + o=np.fromfile(output,dtype=dtype) + return g.shape==o.shape and np.array_equal(g,o) def main(): - strict = os.getenv("COMPARE_STRICT", "1") != "0" - ok = compare_bin_prefix("golden_v3.bin", "v3.bin", np.uint16, 0, 1024) - if not ok: - if strict: - print("[ERROR] compare failed") - sys.exit(2) - print("[WARN] compare failed (non-gating)") + strict=os.getenv('COMPARE_STRICT','1')!='0' + failed=[] + if not (_cmp("golden_v3.bin","v3.bin",np.uint16,0,1024)): + failed.append('f32') + 
print('[ERROR] compare failed: f32') + else: + print('[INFO] f32: passed') + if not (_cmp("golden_v3_f16.bin","v3_f16.bin",np.uint16,0,1024)): + failed.append('f16') + print('[ERROR] compare failed: f16') + else: + print('[INFO] f16: passed') + if not (_cmpeq("golden_v3_mask_edge.bin","v3_mask_edge.bin",np.uint16)): + failed.append('mask_edge') + print('[ERROR] compare failed: mask_edge') + else: + print('[INFO] mask_edge: passed') + if failed: + if strict: print(f"[ERROR] {len(failed)} variant(s) failed"); sys.exit(2) + print(f"[WARN] {len(failed)} variant(s) failed (non-gating)") return - print("[INFO] compare passed") - + print("[INFO] compare passed (all 3 variants)") -if __name__ == "__main__": +if __name__=="__main__": main() diff --git a/test/vpto/cases/micro-op/binary-vector/vor/golden.py b/test/vpto/cases/micro-op/binary-vector/vor/golden.py index c0d7ce117..89720bc15 100755 --- a/test/vpto/cases/micro-op/binary-vector/vor/golden.py +++ b/test/vpto/cases/micro-op/binary-vector/vor/golden.py @@ -7,44 +7,74 @@ # INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. # See LICENSE in the root of the software repository for the full text of the License. -# case: micro-op/binary-vector/vor -# family: binary-vector -# target_ops: pto.vor -# scenarios: core-i16-unsigned, full-mask -# NOTE: bulk-generated coverage skeleton. -# coding=utf-8 +# Merged vor test case. 
import argparse from pathlib import Path - import numpy as np - -ELEMS = 1024 +ROWS = 32 +COLS = 32 SEED = 19 +LOGICAL_ELEMS = 1000 +OUT_SENTINEL = np.float32(-123.25) + +def f32_to_bf16_bits(v): + w=v.astype(np.float32,copy=False).view(np.uint32) + r=np.uint32(0x7FFF)+((w>>16)&np.uint32(1)) + return ((w+r)>>16).astype(np.uint16) +def bf16_bits_to_f32(b): + return (b.astype(np.uint32)<<16).view(np.float32) -def generate(output_dir: Path, seed: int) -> None: - rng = np.random.default_rng(seed) - v1 = rng.integers(0, 0x10000, size=ELEMS, dtype=np.uint16) - v2 = rng.integers(0, 0x10000, size=ELEMS, dtype=np.uint16) - v3 = np.zeros(ELEMS, dtype=np.uint16) - golden_v3 = np.bitwise_or(v1, v2).astype(np.uint16, copy=False) +# ---- f32 ---- +def gen_f32(out, rng): + v1=rng.integers(0,0x10000,size=(ROWS,COLS),dtype=np.uint16) + v2=rng.integers(0,0x10000,size=(ROWS,COLS),dtype=np.uint16) + g=np.bitwise_or(v1,v2) + v3=np.zeros((ROWS,COLS),dtype=np.uint16) + v1.reshape(-1).tofile(out/"v1.bin") + v2.reshape(-1).tofile(out/"v2.bin") + v3.reshape(-1).tofile(out/"v3.bin") + g.reshape(-1).tofile(out/"golden_v3.bin") - output_dir.mkdir(parents=True, exist_ok=True) - v1.tofile(output_dir / "v1.bin") - v2.tofile(output_dir / "v2.bin") - v3.tofile(output_dir / "v3.bin") - golden_v3.tofile(output_dir / "golden_v3.bin") +# ---- f16 ---- +def gen_f16(out, rng): + v1=rng.integers(0,0x10000,size=(ROWS,COLS),dtype=np.uint16) + v2=rng.integers(0,0x10000,size=(ROWS,COLS),dtype=np.uint16) + g=np.bitwise_or(v1,v2) + v3=np.zeros((ROWS,COLS),dtype=np.uint16) + v1.reshape(-1).tofile(out/"v1_f16.bin") + v2.reshape(-1).tofile(out/"v2_f16.bin") + v3.reshape(-1).tofile(out/"v3_f16.bin") + g.reshape(-1).tofile(out/"golden_v3_f16.bin") +# ---- mask_edge ---- +def gen_mask_edge(out, rng): + v1=rng.integers(0,0x10000,size=(ROWS,COLS),dtype=np.uint16) + v2=rng.integers(0,0x10000,size=(ROWS,COLS),dtype=np.uint16) + g=np.bitwise_or(v1,v2) + v3=np.zeros((ROWS,COLS),dtype=np.uint16) + 
v1.reshape(-1).tofile(out/"v1_mask_edge.bin") + v2.reshape(-1).tofile(out/"v2_mask_edge.bin") + v3.reshape(-1).tofile(out/"v3_mask_edge.bin") + g.reshape(-1).tofile(out/"golden_v3_mask_edge.bin") -def main() -> None: - parser = argparse.ArgumentParser() - parser.add_argument("--output-dir", type=Path, default=Path(".")) - parser.add_argument("--seed", type=int, default=SEED) - args = parser.parse_args() - generate(args.output_dir, args.seed) +GENERATORS = [ + gen_f32, + gen_f16, + gen_mask_edge, +] +def main(): + p=argparse.ArgumentParser() + p.add_argument("--output-dir",type=Path,default=Path(".")) + p.add_argument("--seed",type=int,default=SEED) + a=p.parse_args() + rng=np.random.default_rng(a.seed) + out=a.output_dir; out.mkdir(parents=True,exist_ok=True) + for gen in GENERATORS: + gen(out,rng) -if __name__ == "__main__": +if __name__=="__main__": main() diff --git a/test/vpto/cases/micro-op/binary-vector/vor/kernel.pto b/test/vpto/cases/micro-op/binary-vector/vor/kernel.pto index 3ce40ada7..c807d07df 100644 --- a/test/vpto/cases/micro-op/binary-vector/vor/kernel.pto +++ b/test/vpto/cases/micro-op/binary-vector/vor/kernel.pto @@ -1,55 +1,137 @@ -// ----------------------------------------------------------------------------- -// case: micro-op/binary-vector/vor -// family: binary-vector -// target_ops: pto.vor -// scenarios: core-i16-unsigned, full-mask -// NOTE: bulk-generated coverage skeleton. Parser/verifier/lowering failure is -// still a valid test conclusion in the current coverage-first phase. 
-// ----------------------------------------------------------------------------- module attributes {pto.target_arch = "a5", pto.kernel_kind = #pto.kernel_kind} { - func.func @vor_i16_unsigned_kernel(%arg0: !pto.ptr, %arg1: !pto.ptr, - %arg2: !pto.ptr) attributes {pto.kernel} { - %c0 = arith.constant 0 : index - %c1024 = arith.constant 1024 : index - %c128 = arith.constant 128 : index - %c0_i64 = arith.constant 0 : i64 - %c1_i64 = arith.constant 1 : i64 - %c32_i64 = arith.constant 32 : i64 - %c64_i64 = arith.constant 64 : i64 - %c2048_i64 = arith.constant 2048 : i64 - %c4096_i64 = arith.constant 4096 : i64 - - %ub_lhs = pto.castptr %c0_i64 : i64 -> !pto.ptr - %ub_rhs = pto.castptr %c2048_i64 : i64 -> !pto.ptr - %ub_out = pto.castptr %c4096_i64 : i64 -> !pto.ptr - - %false = arith.constant false - pto.mte_gm_ub %arg0, %ub_lhs, %c0_i64, %c64_i64 - nburst(%c32_i64, %c64_i64, %c64_i64) + func.func @vor_deep_merged_kernel(%arg0: !pto.ptr, %arg1: !pto.ptr, %arg2: !pto.ptr, %arg3: !pto.ptr, %arg4: !pto.ptr, %arg5: !pto.ptr, %arg6: !pto.ptr, %arg7: !pto.ptr, %arg8: !pto.ptr) attributes {pto.kernel} { + // merged from vor_i16_unsigned_kernel via LaunchVor_i16_unsigned_kernel + + %c0_m0 = arith.constant 0 : index + %c1024_m0 = arith.constant 1024 : index + %c128_m0 = arith.constant 128 : index + %c0_i64_m0 = arith.constant 0 : i64 + %c1_i64_m0 = arith.constant 1 : i64 + %c32_i64_m0 = arith.constant 32 : i64 + %c64_i64_m0 = arith.constant 64 : i64 + %c2048_i64_m0 = arith.constant 2048 : i64 + %c4096_i64_m0 = arith.constant 4096 : i64 + + %ub_lhs_m0 = pto.castptr %c0_i64_m0 : i64 -> !pto.ptr + %ub_rhs_m0 = pto.castptr %c2048_i64_m0 : i64 -> !pto.ptr + %ub_out_m0 = pto.castptr %c4096_i64_m0 : i64 -> !pto.ptr + + %false_m0 = arith.constant false + pto.mte_gm_ub %arg0, %ub_lhs_m0, %c0_i64_m0, %c64_i64_m0 + nburst(%c32_i64_m0, %c64_i64_m0, %c64_i64_m0) : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 - pto.mte_gm_ub %arg1, %ub_rhs, %c0_i64, %c64_i64 - nburst(%c32_i64, %c64_i64, 
%c64_i64) + pto.mte_gm_ub %arg1, %ub_rhs_m0, %c0_i64_m0, %c64_i64_m0 + nburst(%c32_i64_m0, %c64_i64_m0, %c64_i64_m0) : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 pto.set_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] pto.wait_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] pto.vecscope { - %mask = pto.pset_b16 "PAT_ALL" : !pto.mask - scf.for %offset = %c0 to %c1024 step %c128 { - %lhs = pto.vlds %ub_lhs[%offset] : !pto.ptr -> !pto.vreg<128xui16> - %rhs = pto.vlds %ub_rhs[%offset] : !pto.ptr -> !pto.vreg<128xui16> - %out = pto.vor %lhs, %rhs, %mask : !pto.vreg<128xui16>, !pto.vreg<128xui16>, !pto.mask -> !pto.vreg<128xui16> - pto.vsts %out, %ub_out[%offset], %mask : !pto.vreg<128xui16>, !pto.ptr, !pto.mask + %mask_m0 = pto.pset_b16 "PAT_ALL" : !pto.mask + scf.for %offset_m0 = %c0_m0 to %c1024_m0 step %c128_m0 { + %lhs_m0 = pto.vlds %ub_lhs_m0[%offset_m0] : !pto.ptr -> !pto.vreg<128xui16> + %rhs_m0 = pto.vlds %ub_rhs_m0[%offset_m0] : !pto.ptr -> !pto.vreg<128xui16> + %out_m0 = pto.vor %lhs_m0, %rhs_m0, %mask_m0 : !pto.vreg<128xui16>, !pto.vreg<128xui16>, !pto.mask -> !pto.vreg<128xui16> + pto.vsts %out_m0, %ub_out_m0[%offset_m0], %mask_m0 : !pto.vreg<128xui16>, !pto.ptr, !pto.mask } } pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.mte_ub_gm %ub_out, %arg2, %c64_i64 - nburst(%c32_i64, %c64_i64, %c64_i64) + pto.mte_ub_gm %ub_out_m0, %arg2, %c64_i64_m0 + nburst(%c32_i64_m0, %c64_i64_m0, %c64_i64_m0) : !pto.ptr, !pto.ptr, i64, i64, i64, i64 pto.barrier #pto.pipe + + // merged from vor_f16_kernel via LaunchVor_f16_kernel + + %c0_m1 = arith.constant 0 : index + %c1024_m1 = arith.constant 1024 : index + %c128_m1 = arith.constant 128 : index + %c0_i64_m1 = arith.constant 0 : i64 + %c1_i64_m1 = arith.constant 1 : i64 + %c32_i64_m1 = arith.constant 32 : i64 + %c64_i64_m1 = arith.constant 64 : i64 + %c2048_i64_m1 = arith.constant 2048 : i64 + %c4096_i64_m1 = arith.constant 4096 : i64 + + %ub_lhs_m1 = pto.castptr %c0_i64_m1 : i64 -> 
!pto.ptr + %ub_rhs_m1 = pto.castptr %c2048_i64_m1 : i64 -> !pto.ptr + %ub_out_m1 = pto.castptr %c4096_i64_m1 : i64 -> !pto.ptr + + %false_m1 = arith.constant false + pto.mte_gm_ub %arg3, %ub_lhs_m1, %c0_i64_m1, %c64_i64_m1 + nburst(%c32_i64_m1, %c64_i64_m1, %c64_i64_m1) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 + pto.mte_gm_ub %arg4, %ub_rhs_m1, %c0_i64_m1, %c64_i64_m1 + nburst(%c32_i64_m1, %c64_i64_m1, %c64_i64_m1) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 + + pto.set_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] + pto.wait_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] + + pto.vecscope { + %mask_m1 = pto.pset_b16 "PAT_ALL" : !pto.mask + scf.for %offset_m1 = %c0_m1 to %c1024_m1 step %c128_m1 { + %lhs_m1 = pto.vlds %ub_lhs_m1[%offset_m1] : !pto.ptr -> !pto.vreg<128xf16> + %rhs_m1 = pto.vlds %ub_rhs_m1[%offset_m1] : !pto.ptr -> !pto.vreg<128xf16> + %out_m1 = pto.vor %lhs_m1, %rhs_m1, %mask_m1 : !pto.vreg<128xf16>, !pto.vreg<128xf16>, !pto.mask -> !pto.vreg<128xf16> + pto.vsts %out_m1, %ub_out_m1[%offset_m1], %mask_m1 : !pto.vreg<128xf16>, !pto.ptr, !pto.mask + } + } + + pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] + pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] + pto.mte_ub_gm %ub_out_m1, %arg5, %c64_i64_m1 + nburst(%c32_i64_m1, %c64_i64_m1, %c64_i64_m1) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64 + pto.barrier #pto.pipe + + // merged from vor_mask_edge_kernel via LaunchVor_mask_edge_kernel + + %c0_m2 = arith.constant 0 : index + %c0_i64_m2 = arith.constant 0 : i64 + %c1_i64_m2 = arith.constant 1 : i64 + %c32_i64_m2 = arith.constant 32 : i64 + %c64_i64_m2 = arith.constant 64 : i64 + %c1024_m2 = arith.constant 1024 : index + %c128_m2 = arith.constant 128 : index + %c2048_i64_m2 = arith.constant 2048 : i64 + %c4096_i64_m2 = arith.constant 4096 : i64 + + %ub_lhs_m2 = pto.castptr %c0_i64_m2 : i64 -> !pto.ptr + %ub_rhs_m2 = pto.castptr %c2048_i64_m2 : i64 -> !pto.ptr + %ub_out_m2 = pto.castptr %c4096_i64_m2 : i64 -> !pto.ptr + + %false_m2 = arith.constant false + 
pto.mte_gm_ub %arg6, %ub_lhs_m2, %c0_i64_m2, %c64_i64_m2 + nburst(%c32_i64_m2, %c64_i64_m2, %c64_i64_m2) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 + pto.mte_gm_ub %arg7, %ub_rhs_m2, %c0_i64_m2, %c64_i64_m2 + nburst(%c32_i64_m2, %c64_i64_m2, %c64_i64_m2) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 + + pto.set_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] + pto.wait_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] + + pto.vecscope { + %mask_m2 = pto.pset_b16 "PAT_ALL" : !pto.mask + scf.for %offset_m2 = %c0_m2 to %c1024_m2 step %c128_m2 { + %lhs_m2 = pto.vlds %ub_lhs_m2[%offset_m2] : !pto.ptr -> !pto.vreg<128xui16> + %rhs_m2 = pto.vlds %ub_rhs_m2[%offset_m2] : !pto.ptr -> !pto.vreg<128xui16> + %out_m2 = pto.vor %lhs_m2, %rhs_m2, %mask_m2 : !pto.vreg<128xui16>, !pto.vreg<128xui16>, !pto.mask -> !pto.vreg<128xui16> + pto.vsts %out_m2, %ub_out_m2[%offset_m2], %mask_m2 : !pto.vreg<128xui16>, !pto.ptr, !pto.mask + } + } + + pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] + pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] + pto.mte_ub_gm %ub_out_m2, %arg8, %c64_i64_m2 + nburst(%c32_i64_m2, %c64_i64_m2, %c64_i64_m2) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64 + pto.barrier #pto.pipe + return } } diff --git a/test/vpto/cases/micro-op/binary-vector/vor/launch.cpp b/test/vpto/cases/micro-op/binary-vector/vor/launch.cpp index 416e5354f..4de3d4b30 100644 --- a/test/vpto/cases/micro-op/binary-vector/vor/launch.cpp +++ b/test/vpto/cases/micro-op/binary-vector/vor/launch.cpp @@ -5,19 +5,10 @@ // THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, // INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. // See LICENSE in the root of the software repository for the full text of the License. 
- -// ----------------------------------------------------------------------------- -// case: micro-op/binary-vector/vor -// family: binary-vector -// target_ops: pto.vor -// scenarios: core-i16-unsigned, full-mask -// NOTE: bulk-generated coverage skeleton. Parser/verifier/lowering failure is -// still a valid test conclusion in the current coverage-first phase. -// ----------------------------------------------------------------------------- +// Merged launch wrappers #ifndef __VEC_SCOPE__ #define __VEC_SCOPE__ #endif - #if defined(__CCE_AICORE__) && defined(__NPU_ARCH__) && (__NPU_ARCH__ == 2201) typedef struct { unsigned char v; } hifloat8_t; typedef struct { unsigned char v; } float8_e4m3_t; @@ -30,14 +21,8 @@ typedef struct { unsigned char v; } float4_e2m1x2_t; #if defined(__CCE_AICORE__) && defined(PTOAS_ENABLE_CCE_PRINT) #include #endif - #if !defined(__CCE_AICORE__) && !defined(TMRGSORT_HPP) -struct MrgSortExecutedNumList { - uint16_t mrgSortList0; - uint16_t mrgSortList1; - uint16_t mrgSortList2; - uint16_t mrgSortList3; -}; +struct MrgSortExecutedNumList { uint16_t mrgSortList0,mrgSortList1,mrgSortList2,mrgSortList3; }; #endif #ifndef __CPU_SIM #include "acl/acl.h" @@ -47,9 +32,26 @@ extern "C" __global__ [aicore] void vor_i16_unsigned_kernel(__gm__ uint16_t *v1, __gm__ uint16_t *v2, __gm__ uint16_t *v3); -void LaunchVor_i16_unsigned_kernel(uint16_t *v1, uint16_t *v2, uint16_t *v3, - void *stream) { - vor_i16_unsigned_kernel<<<1, nullptr, stream>>>((__gm__ uint16_t *)v1, - (__gm__ uint16_t *)v2, - (__gm__ uint16_t *)v3); +extern "C" __global__ [aicore] void vor_deep_merged_kernel( + __gm__ uint16_t * arg0, + __gm__ uint16_t * arg1, + __gm__ uint16_t * arg2, + __gm__ half * arg3, + __gm__ half * arg4, + __gm__ half * arg5, + __gm__ uint16_t * arg6, + __gm__ uint16_t * arg7, + __gm__ uint16_t * arg8); + +void LaunchVorDeepMerged(uint16_t * p0, uint16_t * p1, uint16_t * p2, uint16_t * p3, uint16_t * p4, uint16_t * p5, uint16_t * p6, uint16_t * p7, uint16_t 
* p8, void *stream) { + vor_deep_merged_kernel<<<1, nullptr, stream>>>( + (__gm__ uint16_t *)p0, + (__gm__ uint16_t *)p1, + (__gm__ uint16_t *)p2, + (__gm__ half *)p3, + (__gm__ half *)p4, + (__gm__ half *)p5, + (__gm__ uint16_t *)p6, + (__gm__ uint16_t *)p7, + (__gm__ uint16_t *)p8); } diff --git a/test/vpto/cases/micro-op/binary-vector/vor/main.cpp b/test/vpto/cases/micro-op/binary-vector/vor/main.cpp index 0ebb0d781..013cb81c1 100644 --- a/test/vpto/cases/micro-op/binary-vector/vor/main.cpp +++ b/test/vpto/cases/micro-op/binary-vector/vor/main.cpp @@ -5,99 +5,118 @@ // THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, // INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. // See LICENSE in the root of the software repository for the full text of the License. - -// ----------------------------------------------------------------------------- -// case: micro-op/binary-vector/vor -// family: binary-vector -// target_ops: pto.vor -// scenarios: core-i16-unsigned, full-mask -// NOTE: bulk-generated coverage skeleton. Parser/verifier/lowering failure is -// still a valid test conclusion in the current coverage-first phase. -// ----------------------------------------------------------------------------- -/** -Copyright (c) 2025 Huawei Technologies Co., Ltd. -*/ +// Merged vor test case. 
#include "test_common.h" #include "acl/acl.h" #include #include - +#include using namespace PtoTestCommon; -#define ACL_CHECK(expr) \ - do { \ - const aclError _ret = (expr); \ - if (_ret != ACL_SUCCESS) { \ - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", #expr, \ - (int)_ret, __FILE__, __LINE__); \ - rc = 1; \ - goto cleanup; \ - } \ - } while (0) +#define ACL_CHECK(expr) do { const aclError _r=(expr); if(_r!=ACL_SUCCESS){std::fprintf(stderr,"[ERROR] %s:%d acle=%d\n",#expr,__LINE__,(int)_r);rc=1;goto cleanup;} }while(0) +#define FCK(expr,path) do{if(!(expr)){std::fprintf(stderr,"[ERROR] file:%s\n",path);rc=1;goto cleanup;}}while(0) + + -void LaunchVor_i16_unsigned_kernel(uint16_t *v1, uint16_t *v2, uint16_t *v3, - void *stream); +void LaunchVorDeepMerged(uint16_t * p0, uint16_t * p1, uint16_t * p2, uint16_t * p3, uint16_t * p4, uint16_t * p5, uint16_t * p6, uint16_t * p7, uint16_t * p8, void *stream); int main() { - size_t elemCount_v1 = 1024; - size_t fileSize_v1 = elemCount_v1 * sizeof(uint16_t); - size_t elemCount_v2 = 1024; - size_t fileSize_v2 = elemCount_v2 * sizeof(uint16_t); - size_t elemCount_v3 = 1024; - size_t fileSize_v3 = elemCount_v3 * sizeof(uint16_t); - uint16_t *v1Host = nullptr; - uint16_t *v1Device = nullptr; - uint16_t *v2Host = nullptr; - uint16_t *v2Device = nullptr; - uint16_t *v3Host = nullptr; - uint16_t *v3Device = nullptr; - int rc = 0; - bool aclInited = false; - bool deviceSet = false; - int deviceId = 0; - aclrtStream stream = nullptr; + constexpr size_t SZ_f32 = 2048; + constexpr size_t SZ_f16 = 2048; + constexpr size_t SZ_mask_edge = 2048; - ACL_CHECK(aclInit(nullptr)); - aclInited = true; - if (const char *envDevice = std::getenv("ACL_DEVICE_ID")) - deviceId = std::atoi(envDevice); - ACL_CHECK(aclrtSetDevice(deviceId)); - deviceSet = true; + uint16_t *h_f32_v1=nullptr, *d_f32_v1=nullptr; + uint16_t *h_f32_v2=nullptr, *d_f32_v2=nullptr; + uint16_t *h_f32_v3=nullptr, *d_f32_v3=nullptr; + uint16_t *h_f16_v1=nullptr, 
*d_f16_v1=nullptr; + uint16_t *h_f16_v2=nullptr, *d_f16_v2=nullptr; + uint16_t *h_f16_v3=nullptr, *d_f16_v3=nullptr; + uint16_t *h_mask_edge_v1=nullptr, *d_mask_edge_v1=nullptr; + uint16_t *h_mask_edge_v2=nullptr, *d_mask_edge_v2=nullptr; + uint16_t *h_mask_edge_v3=nullptr, *d_mask_edge_v3=nullptr; + int rc=0; bool aclInited=false,deviceSet=false; int deviceId=0; aclrtStream stream=nullptr; size_t fsz=0; + ACL_CHECK(aclInit(nullptr)); aclInited=true; + if(const char*e=std::getenv("ACL_DEVICE_ID")) deviceId=std::atoi(e); + ACL_CHECK(aclrtSetDevice(deviceId)); deviceSet=true; ACL_CHECK(aclrtCreateStream(&stream)); - ACL_CHECK(aclrtMallocHost((void **)(&v1Host), fileSize_v1)); - ACL_CHECK(aclrtMallocHost((void **)(&v2Host), fileSize_v2)); - ACL_CHECK(aclrtMallocHost((void **)(&v3Host), fileSize_v3)); - ACL_CHECK(aclrtMalloc((void **)&v1Device, fileSize_v1, ACL_MEM_MALLOC_HUGE_FIRST)); - ACL_CHECK(aclrtMalloc((void **)&v2Device, fileSize_v2, ACL_MEM_MALLOC_HUGE_FIRST)); - ACL_CHECK(aclrtMalloc((void **)&v3Device, fileSize_v3, ACL_MEM_MALLOC_HUGE_FIRST)); - ReadFile("./v1.bin", fileSize_v1, v1Host, fileSize_v1); - ReadFile("./v2.bin", fileSize_v2, v2Host, fileSize_v2); - ReadFile("./v3.bin", fileSize_v3, v3Host, fileSize_v3); - ACL_CHECK(aclrtMemcpy(v1Device, fileSize_v1, v1Host, fileSize_v1, - ACL_MEMCPY_HOST_TO_DEVICE)); - ACL_CHECK(aclrtMemcpy(v2Device, fileSize_v2, v2Host, fileSize_v2, - ACL_MEMCPY_HOST_TO_DEVICE)); - ACL_CHECK(aclrtMemcpy(v3Device, fileSize_v3, v3Host, fileSize_v3, - ACL_MEMCPY_HOST_TO_DEVICE)); - LaunchVor_i16_unsigned_kernel(v1Device, v2Device, v3Device, stream); + ACL_CHECK(aclrtMallocHost((void**)&h_f32_v1,SZ_f32)); + ACL_CHECK(aclrtMallocHost((void**)&h_f32_v2,SZ_f32)); + ACL_CHECK(aclrtMallocHost((void**)&h_f32_v3,SZ_f32)); + ACL_CHECK(aclrtMallocHost((void**)&h_f16_v1,SZ_f16)); + ACL_CHECK(aclrtMallocHost((void**)&h_f16_v2,SZ_f16)); + ACL_CHECK(aclrtMallocHost((void**)&h_f16_v3,SZ_f16)); + 
ACL_CHECK(aclrtMallocHost((void**)&h_mask_edge_v1,SZ_mask_edge)); + ACL_CHECK(aclrtMallocHost((void**)&h_mask_edge_v2,SZ_mask_edge)); + ACL_CHECK(aclrtMallocHost((void**)&h_mask_edge_v3,SZ_mask_edge)); + ACL_CHECK(aclrtMalloc((void**)&d_f32_v1,SZ_f32,ACL_MEM_MALLOC_HUGE_FIRST)); + ACL_CHECK(aclrtMalloc((void**)&d_f32_v2,SZ_f32,ACL_MEM_MALLOC_HUGE_FIRST)); + ACL_CHECK(aclrtMalloc((void**)&d_f32_v3,SZ_f32,ACL_MEM_MALLOC_HUGE_FIRST)); + ACL_CHECK(aclrtMalloc((void**)&d_f16_v1,SZ_f16,ACL_MEM_MALLOC_HUGE_FIRST)); + ACL_CHECK(aclrtMalloc((void**)&d_f16_v2,SZ_f16,ACL_MEM_MALLOC_HUGE_FIRST)); + ACL_CHECK(aclrtMalloc((void**)&d_f16_v3,SZ_f16,ACL_MEM_MALLOC_HUGE_FIRST)); + ACL_CHECK(aclrtMalloc((void**)&d_mask_edge_v1,SZ_mask_edge,ACL_MEM_MALLOC_HUGE_FIRST)); + ACL_CHECK(aclrtMalloc((void**)&d_mask_edge_v2,SZ_mask_edge,ACL_MEM_MALLOC_HUGE_FIRST)); + ACL_CHECK(aclrtMalloc((void**)&d_mask_edge_v3,SZ_mask_edge,ACL_MEM_MALLOC_HUGE_FIRST)); + fsz=SZ_f32; FCK(ReadFile("v1.bin",fsz,h_f32_v1,SZ_f32)&&fsz==SZ_f32,"v1.bin"); + fsz=SZ_f32; FCK(ReadFile("v2.bin",fsz,h_f32_v2,SZ_f32)&&fsz==SZ_f32,"v2.bin"); + fsz=SZ_f32; FCK(ReadFile("v3.bin",fsz,h_f32_v3,SZ_f32)&&fsz==SZ_f32,"v3.bin"); + fsz=SZ_f16; FCK(ReadFile("v1_f16.bin",fsz,h_f16_v1,SZ_f16)&&fsz==SZ_f16,"v1_f16.bin"); + fsz=SZ_f16; FCK(ReadFile("v2_f16.bin",fsz,h_f16_v2,SZ_f16)&&fsz==SZ_f16,"v2_f16.bin"); + fsz=SZ_f16; FCK(ReadFile("v3_f16.bin",fsz,h_f16_v3,SZ_f16)&&fsz==SZ_f16,"v3_f16.bin"); + fsz=SZ_mask_edge; FCK(ReadFile("v1_mask_edge.bin",fsz,h_mask_edge_v1,SZ_mask_edge)&&fsz==SZ_mask_edge,"v1_mask_edge.bin"); + fsz=SZ_mask_edge; FCK(ReadFile("v2_mask_edge.bin",fsz,h_mask_edge_v2,SZ_mask_edge)&&fsz==SZ_mask_edge,"v2_mask_edge.bin"); + fsz=SZ_mask_edge; FCK(ReadFile("v3_mask_edge.bin",fsz,h_mask_edge_v3,SZ_mask_edge)&&fsz==SZ_mask_edge,"v3_mask_edge.bin"); + ACL_CHECK(aclrtMemcpy(d_f32_v1,SZ_f32,h_f32_v1,SZ_f32,ACL_MEMCPY_HOST_TO_DEVICE)); + ACL_CHECK(aclrtMemcpy(d_f32_v2,SZ_f32,h_f32_v2,SZ_f32,ACL_MEMCPY_HOST_TO_DEVICE)); + 
ACL_CHECK(aclrtMemcpy(d_f32_v3,SZ_f32,h_f32_v3,SZ_f32,ACL_MEMCPY_HOST_TO_DEVICE)); + ACL_CHECK(aclrtMemcpy(d_f16_v1,SZ_f16,h_f16_v1,SZ_f16,ACL_MEMCPY_HOST_TO_DEVICE)); + ACL_CHECK(aclrtMemcpy(d_f16_v2,SZ_f16,h_f16_v2,SZ_f16,ACL_MEMCPY_HOST_TO_DEVICE)); + ACL_CHECK(aclrtMemcpy(d_f16_v3,SZ_f16,h_f16_v3,SZ_f16,ACL_MEMCPY_HOST_TO_DEVICE)); + ACL_CHECK(aclrtMemcpy(d_mask_edge_v1,SZ_mask_edge,h_mask_edge_v1,SZ_mask_edge,ACL_MEMCPY_HOST_TO_DEVICE)); + ACL_CHECK(aclrtMemcpy(d_mask_edge_v2,SZ_mask_edge,h_mask_edge_v2,SZ_mask_edge,ACL_MEMCPY_HOST_TO_DEVICE)); + ACL_CHECK(aclrtMemcpy(d_mask_edge_v3,SZ_mask_edge,h_mask_edge_v3,SZ_mask_edge,ACL_MEMCPY_HOST_TO_DEVICE)); + LaunchVorDeepMerged( + d_f32_v1, + d_f32_v2, + d_f32_v3, + d_f16_v1, + d_f16_v2, + d_f16_v3, + d_mask_edge_v1, + d_mask_edge_v2, + d_mask_edge_v3, + stream + ); ACL_CHECK(aclrtSynchronizeStream(stream)); - ACL_CHECK(aclrtMemcpy(v3Host, fileSize_v3, v3Device, fileSize_v3, - ACL_MEMCPY_DEVICE_TO_HOST)); - WriteFile("./v3.bin", v3Host, fileSize_v3); + ACL_CHECK(aclrtMemcpy(h_f32_v3,SZ_f32,d_f32_v3,SZ_f32,ACL_MEMCPY_DEVICE_TO_HOST)); + ACL_CHECK(aclrtMemcpy(h_f16_v3,SZ_f16,d_f16_v3,SZ_f16,ACL_MEMCPY_DEVICE_TO_HOST)); + ACL_CHECK(aclrtMemcpy(h_mask_edge_v3,SZ_mask_edge,d_mask_edge_v3,SZ_mask_edge,ACL_MEMCPY_DEVICE_TO_HOST)); + FCK(WriteFile("v3.bin",h_f32_v3,SZ_f32),"v3.bin"); + FCK(WriteFile("v3_f16.bin",h_f16_v3,SZ_f16),"v3_f16.bin"); + FCK(WriteFile("v3_mask_edge.bin",h_mask_edge_v3,SZ_mask_edge),"v3_mask_edge.bin"); cleanup: - aclrtFree(v1Device); - aclrtFree(v2Device); - aclrtFree(v3Device); - aclrtFreeHost(v1Host); - aclrtFreeHost(v2Host); - aclrtFreeHost(v3Host); - if (stream != nullptr) - aclrtDestroyStream(stream); - if (deviceSet) - aclrtResetDevice(deviceId); - if (aclInited) - aclFinalize(); + aclrtFree(d_f32_v1); + aclrtFree(d_f32_v2); + aclrtFree(d_f32_v3); + aclrtFree(d_f16_v1); + aclrtFree(d_f16_v2); + aclrtFree(d_f16_v3); + aclrtFree(d_mask_edge_v1); + aclrtFree(d_mask_edge_v2); + 
aclrtFree(d_mask_edge_v3); + aclrtFreeHost(h_f32_v1); + aclrtFreeHost(h_f32_v2); + aclrtFreeHost(h_f32_v3); + aclrtFreeHost(h_f16_v1); + aclrtFreeHost(h_f16_v2); + aclrtFreeHost(h_f16_v3); + aclrtFreeHost(h_mask_edge_v1); + aclrtFreeHost(h_mask_edge_v2); + aclrtFreeHost(h_mask_edge_v3); + if(stream) aclrtDestroyStream(stream); + if(deviceSet) aclrtResetDevice(deviceId); + if(aclInited) aclFinalize(); return rc; } diff --git a/test/vpto/cases/micro-op/binary-vector/vshl-i32-unsigned/compare.py b/test/vpto/cases/micro-op/binary-vector/vshl-i32-unsigned/compare.py deleted file mode 100755 index fcf304f6f..000000000 --- a/test/vpto/cases/micro-op/binary-vector/vshl-i32-unsigned/compare.py +++ /dev/null @@ -1,44 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) 2026 Huawei Technologies Co., Ltd. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. - -# case: micro-op/binary-vector/vshl-i32-unsigned -# family: binary-vector -# target_ops: pto.vshl -# scenarios: core-i32-unsigned, full-mask -# NOTE: bulk-generated coverage skeleton. 
- -import os -import sys -import numpy as np - - -def compare_bin_prefix(golden_path, output_path, dtype, eps, count): - if not os.path.exists(golden_path) or not os.path.exists(output_path): - return False - golden = np.fromfile(golden_path, dtype=dtype, count=count) - output = np.fromfile(output_path, dtype=dtype, count=count) - return golden.shape == output.shape and np.allclose( - golden, output, atol=eps, rtol=eps, equal_nan=True - ) - - -def main(): - strict = os.getenv("COMPARE_STRICT", "1") != "0" - ok = compare_bin_prefix("golden_v3.bin", "v3.bin", np.uint32, 0, 1024) - if not ok: - if strict: - print("[ERROR] compare failed") - sys.exit(2) - print("[WARN] compare failed (non-gating)") - return - print("[INFO] compare passed") - - -if __name__ == "__main__": - main() diff --git a/test/vpto/cases/micro-op/binary-vector/vshl-i32-unsigned/golden.py b/test/vpto/cases/micro-op/binary-vector/vshl-i32-unsigned/golden.py deleted file mode 100755 index cefd36ee1..000000000 --- a/test/vpto/cases/micro-op/binary-vector/vshl-i32-unsigned/golden.py +++ /dev/null @@ -1,50 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) 2026 Huawei Technologies Co., Ltd. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. - -# case: micro-op/binary-vector/vshl-i32-unsigned -# family: binary-vector -# target_ops: pto.vshl -# scenarios: core-i32-unsigned, full-mask -# NOTE: bulk-generated coverage skeleton. 
-# coding=utf-8 - -import argparse -from pathlib import Path - -import numpy as np - - -ELEMS = 1024 -SEED = 19 - - -def generate(output_dir: Path, seed: int) -> None: - rng = np.random.default_rng(seed) - v1 = rng.integers(0, 1 << 32, size=ELEMS, dtype=np.uint32) - v2 = rng.integers(0, 32, size=ELEMS, dtype=np.uint32) - v3 = np.zeros(ELEMS, dtype=np.uint32) - golden_v3 = np.left_shift(v1, v2 & np.uint32(31)).astype(np.uint32, copy=False) - - output_dir.mkdir(parents=True, exist_ok=True) - v1.tofile(output_dir / "v1.bin") - v2.tofile(output_dir / "v2.bin") - v3.tofile(output_dir / "v3.bin") - golden_v3.tofile(output_dir / "golden_v3.bin") - - -def main() -> None: - parser = argparse.ArgumentParser() - parser.add_argument("--output-dir", type=Path, default=Path(".")) - parser.add_argument("--seed", type=int, default=SEED) - args = parser.parse_args() - generate(args.output_dir, args.seed) - - -if __name__ == "__main__": - main() diff --git a/test/vpto/cases/micro-op/binary-vector/vshl-i32-unsigned/kernel.pto b/test/vpto/cases/micro-op/binary-vector/vshl-i32-unsigned/kernel.pto deleted file mode 100644 index 50ef19cc1..000000000 --- a/test/vpto/cases/micro-op/binary-vector/vshl-i32-unsigned/kernel.pto +++ /dev/null @@ -1,57 +0,0 @@ -// ----------------------------------------------------------------------------- -// case: micro-op/binary-vector/vshl-i32-unsigned -// family: binary-vector -// target_ops: pto.vshl -// scenarios: core-i32-unsigned, full-mask -// NOTE: bulk-generated coverage skeleton. Parser/verifier/lowering failure is -// still a valid test conclusion in the current coverage-first phase. 
-// ----------------------------------------------------------------------------- -module attributes {pto.target_arch = "a5", pto.kernel_kind = #pto.kernel_kind} { - func.func @vshl_i32_unsigned_kernel(%arg0: !pto.ptr, %arg1: !pto.ptr, - %arg2: !pto.ptr) attributes {pto.kernel} { - %c0 = arith.constant 0 : index - %c0_i64 = arith.constant 0 : i64 - %c1_i64 = arith.constant 1 : i64 - %c32_i64 = arith.constant 32 : i64 - %c64_i64 = arith.constant 64 : i64 - %c128_i64 = arith.constant 128 : i64 - %c64 = arith.constant 64 : index - %c1024 = arith.constant 1024 : index - %c128 = arith.constant 128 : index - %c4096_i64 = arith.constant 4096 : i64 - %c8192_i64 = arith.constant 8192 : i64 - - %ub_lhs = pto.castptr %c0_i64 : i64 -> !pto.ptr - %ub_rhs = pto.castptr %c4096_i64 : i64 -> !pto.ptr - %ub_out = pto.castptr %c8192_i64 : i64 -> !pto.ptr - - %false = arith.constant false - pto.mte_gm_ub %arg0, %ub_lhs, %c0_i64, %c128_i64 - nburst(%c32_i64, %c128_i64, %c128_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 - pto.mte_gm_ub %arg1, %ub_rhs, %c0_i64, %c128_i64 - nburst(%c32_i64, %c128_i64, %c128_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 - - pto.set_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] - pto.wait_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] - - pto.vecscope { - %mask = pto.pset_b32 "PAT_ALL" : !pto.mask - scf.for %offset = %c0 to %c1024 step %c64 { - %lhs = pto.vlds %ub_lhs[%offset] : !pto.ptr -> !pto.vreg<64xui32> - %rhs = pto.vlds %ub_rhs[%offset] : !pto.ptr -> !pto.vreg<64xui32> - %out = pto.vshl %lhs, %rhs, %mask : !pto.vreg<64xui32>, !pto.vreg<64xui32>, !pto.mask -> !pto.vreg<64xui32> - pto.vsts %out, %ub_out[%offset], %mask : !pto.vreg<64xui32>, !pto.ptr, !pto.mask - } - } - - pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.mte_ub_gm %ub_out, %arg2, %c128_i64 - nburst(%c32_i64, %c128_i64, %c128_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64 - pto.barrier #pto.pipe - return - } -} diff --git 
a/test/vpto/cases/micro-op/binary-vector/vshl-i32-unsigned/launch.cpp b/test/vpto/cases/micro-op/binary-vector/vshl-i32-unsigned/launch.cpp deleted file mode 100644 index ba6ef9dca..000000000 --- a/test/vpto/cases/micro-op/binary-vector/vshl-i32-unsigned/launch.cpp +++ /dev/null @@ -1,55 +0,0 @@ -// Copyright (c) 2026 Huawei Technologies Co., Ltd. -// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). -// Please refer to the License for details. You may not use this file except in compliance with the License. -// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -// See LICENSE in the root of the software repository for the full text of the License. - -// ----------------------------------------------------------------------------- -// case: micro-op/binary-vector/vshl-i32-unsigned -// family: binary-vector -// target_ops: pto.vshl -// scenarios: core-i32-unsigned, full-mask -// NOTE: bulk-generated coverage skeleton. Parser/verifier/lowering failure is -// still a valid test conclusion in the current coverage-first phase. 
-// ----------------------------------------------------------------------------- -#ifndef __VEC_SCOPE__ -#define __VEC_SCOPE__ -#endif - -#if defined(__CCE_AICORE__) && defined(__NPU_ARCH__) && (__NPU_ARCH__ == 2201) -typedef struct { unsigned char v; } hifloat8_t; -typedef struct { unsigned char v; } float8_e4m3_t; -typedef struct { unsigned char v; } float8_e5m2_t; -typedef struct { unsigned char v; } float8_e8m0_t; -typedef struct { unsigned char v; } float4_e1m2x2_t; -typedef struct { unsigned char v; } float4_e2m1x2_t; -#endif -#include -#if defined(__CCE_AICORE__) && defined(PTOAS_ENABLE_CCE_PRINT) -#include -#endif - -#if !defined(__CCE_AICORE__) && !defined(TMRGSORT_HPP) -struct MrgSortExecutedNumList { - uint16_t mrgSortList0; - uint16_t mrgSortList1; - uint16_t mrgSortList2; - uint16_t mrgSortList3; -}; -#endif -#ifndef __CPU_SIM -#include "acl/acl.h" -#endif - -extern "C" __global__ [aicore] void vshl_i32_unsigned_kernel(__gm__ uint32_t *v1, - __gm__ uint32_t *v2, - __gm__ uint32_t *v3); - -void LaunchVshl_i32_unsigned_kernel(uint32_t *v1, uint32_t *v2, uint32_t *v3, - void *stream) { - vshl_i32_unsigned_kernel<<<1, nullptr, stream>>>((__gm__ uint32_t *)v1, - (__gm__ uint32_t *)v2, - (__gm__ uint32_t *)v3); -} diff --git a/test/vpto/cases/micro-op/binary-vector/vshl-i32-unsigned/main.cpp b/test/vpto/cases/micro-op/binary-vector/vshl-i32-unsigned/main.cpp deleted file mode 100644 index df4bc2e9a..000000000 --- a/test/vpto/cases/micro-op/binary-vector/vshl-i32-unsigned/main.cpp +++ /dev/null @@ -1,103 +0,0 @@ -// Copyright (c) 2026 Huawei Technologies Co., Ltd. -// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). -// Please refer to the License for details. You may not use this file except in compliance with the License. 
-// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -// See LICENSE in the root of the software repository for the full text of the License. - -// ----------------------------------------------------------------------------- -// case: micro-op/binary-vector/vshl-i32-unsigned -// family: binary-vector -// target_ops: pto.vshl -// scenarios: core-i32-unsigned, full-mask -// NOTE: bulk-generated coverage skeleton. Parser/verifier/lowering failure is -// still a valid test conclusion in the current coverage-first phase. -// ----------------------------------------------------------------------------- -/** -Copyright (c) 2025 Huawei Technologies Co., Ltd. -*/ - -#include "test_common.h" -#include "acl/acl.h" -#include -#include - -using namespace PtoTestCommon; - -#define ACL_CHECK(expr) \ - do { \ - const aclError _ret = (expr); \ - if (_ret != ACL_SUCCESS) { \ - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", #expr, \ - (int)_ret, __FILE__, __LINE__); \ - rc = 1; \ - goto cleanup; \ - } \ - } while (0) - -void LaunchVshl_i32_unsigned_kernel(uint32_t *v1, uint32_t *v2, uint32_t *v3, - void *stream); - -int main() { - size_t elemCount_v1 = 1024; - size_t fileSize_v1 = elemCount_v1 * sizeof(uint32_t); - size_t elemCount_v2 = 1024; - size_t fileSize_v2 = elemCount_v2 * sizeof(uint32_t); - size_t elemCount_v3 = 1024; - size_t fileSize_v3 = elemCount_v3 * sizeof(uint32_t); - uint32_t *v1Host = nullptr; - uint32_t *v1Device = nullptr; - uint32_t *v2Host = nullptr; - uint32_t *v2Device = nullptr; - uint32_t *v3Host = nullptr; - uint32_t *v3Device = nullptr; - int rc = 0; - bool aclInited = false; - bool deviceSet = false; - int deviceId = 0; - aclrtStream stream = nullptr; - - ACL_CHECK(aclInit(nullptr)); - aclInited = true; - if (const char *envDevice = std::getenv("ACL_DEVICE_ID")) - deviceId = 
std::atoi(envDevice); - ACL_CHECK(aclrtSetDevice(deviceId)); - deviceSet = true; - ACL_CHECK(aclrtCreateStream(&stream)); - ACL_CHECK(aclrtMallocHost((void **)(&v1Host), fileSize_v1)); - ACL_CHECK(aclrtMallocHost((void **)(&v2Host), fileSize_v2)); - ACL_CHECK(aclrtMallocHost((void **)(&v3Host), fileSize_v3)); - ACL_CHECK(aclrtMalloc((void **)&v1Device, fileSize_v1, ACL_MEM_MALLOC_HUGE_FIRST)); - ACL_CHECK(aclrtMalloc((void **)&v2Device, fileSize_v2, ACL_MEM_MALLOC_HUGE_FIRST)); - ACL_CHECK(aclrtMalloc((void **)&v3Device, fileSize_v3, ACL_MEM_MALLOC_HUGE_FIRST)); - ReadFile("./v1.bin", fileSize_v1, v1Host, fileSize_v1); - ReadFile("./v2.bin", fileSize_v2, v2Host, fileSize_v2); - ReadFile("./v3.bin", fileSize_v3, v3Host, fileSize_v3); - ACL_CHECK(aclrtMemcpy(v1Device, fileSize_v1, v1Host, fileSize_v1, - ACL_MEMCPY_HOST_TO_DEVICE)); - ACL_CHECK(aclrtMemcpy(v2Device, fileSize_v2, v2Host, fileSize_v2, - ACL_MEMCPY_HOST_TO_DEVICE)); - ACL_CHECK(aclrtMemcpy(v3Device, fileSize_v3, v3Host, fileSize_v3, - ACL_MEMCPY_HOST_TO_DEVICE)); - LaunchVshl_i32_unsigned_kernel(v1Device, v2Device, v3Device, stream); - ACL_CHECK(aclrtSynchronizeStream(stream)); - ACL_CHECK(aclrtMemcpy(v3Host, fileSize_v3, v3Device, fileSize_v3, - ACL_MEMCPY_DEVICE_TO_HOST)); - WriteFile("./v3.bin", v3Host, fileSize_v3); - -cleanup: - aclrtFree(v1Device); - aclrtFree(v2Device); - aclrtFree(v3Device); - aclrtFreeHost(v1Host); - aclrtFreeHost(v2Host); - aclrtFreeHost(v3Host); - if (stream != nullptr) - aclrtDestroyStream(stream); - if (deviceSet) - aclrtResetDevice(deviceId); - if (aclInited) - aclFinalize(); - return rc; -} diff --git a/test/vpto/cases/micro-op/binary-vector/vshl-shift-boundary/compare.py b/test/vpto/cases/micro-op/binary-vector/vshl-shift-boundary/compare.py deleted file mode 100755 index 2ef28c2cf..000000000 --- a/test/vpto/cases/micro-op/binary-vector/vshl-shift-boundary/compare.py +++ /dev/null @@ -1,44 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) 2026 Huawei Technologies Co., Ltd. 
-# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. - -# case: micro-op/binary-vector/vshl-shift-boundary -# family: binary-vector -# target_ops: pto.vshl -# scenarios: core-i16-unsigned, full-mask -# NOTE: bulk-generated coverage skeleton. - -import os -import sys -import numpy as np - - -def compare_bin_prefix(golden_path, output_path, dtype, eps, count): - if not os.path.exists(golden_path) or not os.path.exists(output_path): - return False - golden = np.fromfile(golden_path, dtype=dtype, count=count) - output = np.fromfile(output_path, dtype=dtype, count=count) - return golden.shape == output.shape and np.allclose( - golden, output, atol=eps, rtol=eps, equal_nan=True - ) - - -def main(): - strict = os.getenv("COMPARE_STRICT", "1") != "0" - ok = compare_bin_prefix("golden_v3.bin", "v3.bin", np.uint16, 0, 1024) - if not ok: - if strict: - print("[ERROR] compare failed") - sys.exit(2) - print("[WARN] compare failed (non-gating)") - return - print("[INFO] compare passed") - - -if __name__ == "__main__": - main() diff --git a/test/vpto/cases/micro-op/binary-vector/vshl-shift-boundary/golden.py b/test/vpto/cases/micro-op/binary-vector/vshl-shift-boundary/golden.py deleted file mode 100755 index 15261e271..000000000 --- a/test/vpto/cases/micro-op/binary-vector/vshl-shift-boundary/golden.py +++ /dev/null @@ -1,51 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) 2026 Huawei Technologies Co., Ltd. 
-# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. - -# case: micro-op/binary-vector/vshl-shift-boundary -# family: binary-vector -# target_ops: pto.vshl -# scenarios: core-i16-unsigned, full-mask -# NOTE: bulk-generated coverage skeleton. -# coding=utf-8 - -import argparse -from pathlib import Path - -import numpy as np - - -ELEMS = 1024 -SEED = 19 - - -def generate(output_dir: Path, seed: int) -> None: - rng = np.random.default_rng(seed) - v1 = rng.integers(0, 1 << 16, size=ELEMS, dtype=np.uint16) - shift_cycle = np.array([0, 1, 14, 15, 15, 14, 1, 0], dtype=np.uint16) - v2 = np.resize(shift_cycle, ELEMS).astype(np.uint16, copy=False) - v3 = np.zeros(ELEMS, dtype=np.uint16) - golden_v3 = np.left_shift(v1, v2 & np.uint16(15)).astype(np.uint16, copy=False) - - output_dir.mkdir(parents=True, exist_ok=True) - v1.tofile(output_dir / "v1.bin") - v2.tofile(output_dir / "v2.bin") - v3.tofile(output_dir / "v3.bin") - golden_v3.tofile(output_dir / "golden_v3.bin") - - -def main() -> None: - parser = argparse.ArgumentParser() - parser.add_argument("--output-dir", type=Path, default=Path(".")) - parser.add_argument("--seed", type=int, default=SEED) - args = parser.parse_args() - generate(args.output_dir, args.seed) - - -if __name__ == "__main__": - main() diff --git a/test/vpto/cases/micro-op/binary-vector/vshl-shift-boundary/kernel.pto b/test/vpto/cases/micro-op/binary-vector/vshl-shift-boundary/kernel.pto deleted file mode 100644 index 
d10c6a05b..000000000 --- a/test/vpto/cases/micro-op/binary-vector/vshl-shift-boundary/kernel.pto +++ /dev/null @@ -1,55 +0,0 @@ -// ----------------------------------------------------------------------------- -// case: micro-op/binary-vector/vshl-shift-boundary -// family: binary-vector -// target_ops: pto.vshl -// scenarios: core-i16-unsigned, full-mask -// NOTE: bulk-generated coverage skeleton. Parser/verifier/lowering failure is -// still a valid test conclusion in the current coverage-first phase. -// ----------------------------------------------------------------------------- -module attributes {pto.target_arch = "a5", pto.kernel_kind = #pto.kernel_kind} { - func.func @vshl_shift_boundary_kernel(%arg0: !pto.ptr, %arg1: !pto.ptr, - %arg2: !pto.ptr) attributes {pto.kernel} { - %c0 = arith.constant 0 : index - %c0_i64 = arith.constant 0 : i64 - %c1_i64 = arith.constant 1 : i64 - %c32_i64 = arith.constant 32 : i64 - %c64_i64 = arith.constant 64 : i64 - %c1024 = arith.constant 1024 : index - %c128 = arith.constant 128 : index - %c2048_i64 = arith.constant 2048 : i64 - %c4096_i64 = arith.constant 4096 : i64 - - %ub_lhs = pto.castptr %c0_i64 : i64 -> !pto.ptr - %ub_rhs = pto.castptr %c2048_i64 : i64 -> !pto.ptr - %ub_out = pto.castptr %c4096_i64 : i64 -> !pto.ptr - - %false = arith.constant false - pto.mte_gm_ub %arg0, %ub_lhs, %c0_i64, %c64_i64 - nburst(%c32_i64, %c64_i64, %c64_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 - pto.mte_gm_ub %arg1, %ub_rhs, %c0_i64, %c64_i64 - nburst(%c32_i64, %c64_i64, %c64_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 - - pto.set_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] - pto.wait_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] - - pto.vecscope { - %mask = pto.pset_b16 "PAT_ALL" : !pto.mask - scf.for %offset = %c0 to %c1024 step %c128 { - %lhs = pto.vlds %ub_lhs[%offset] : !pto.ptr -> !pto.vreg<128xui16> - %rhs = pto.vlds %ub_rhs[%offset] : !pto.ptr -> !pto.vreg<128xui16> - %out = pto.vshl %lhs, %rhs, %mask : 
!pto.vreg<128xui16>, !pto.vreg<128xui16>, !pto.mask -> !pto.vreg<128xui16> - pto.vsts %out, %ub_out[%offset], %mask : !pto.vreg<128xui16>, !pto.ptr, !pto.mask - } - } - - pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.mte_ub_gm %ub_out, %arg2, %c64_i64 - nburst(%c32_i64, %c64_i64, %c64_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64 - pto.barrier #pto.pipe - return - } -} diff --git a/test/vpto/cases/micro-op/binary-vector/vshl-shift-boundary/launch.cpp b/test/vpto/cases/micro-op/binary-vector/vshl-shift-boundary/launch.cpp deleted file mode 100644 index bbf9c75f5..000000000 --- a/test/vpto/cases/micro-op/binary-vector/vshl-shift-boundary/launch.cpp +++ /dev/null @@ -1,55 +0,0 @@ -// Copyright (c) 2026 Huawei Technologies Co., Ltd. -// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). -// Please refer to the License for details. You may not use this file except in compliance with the License. -// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -// See LICENSE in the root of the software repository for the full text of the License. - -// ----------------------------------------------------------------------------- -// case: micro-op/binary-vector/vshl-shift-boundary -// family: binary-vector -// target_ops: pto.vshl -// scenarios: core-i16-unsigned, full-mask -// NOTE: bulk-generated coverage skeleton. Parser/verifier/lowering failure is -// still a valid test conclusion in the current coverage-first phase. 
-// ----------------------------------------------------------------------------- -#ifndef __VEC_SCOPE__ -#define __VEC_SCOPE__ -#endif - -#if defined(__CCE_AICORE__) && defined(__NPU_ARCH__) && (__NPU_ARCH__ == 2201) -typedef struct { unsigned char v; } hifloat8_t; -typedef struct { unsigned char v; } float8_e4m3_t; -typedef struct { unsigned char v; } float8_e5m2_t; -typedef struct { unsigned char v; } float8_e8m0_t; -typedef struct { unsigned char v; } float4_e1m2x2_t; -typedef struct { unsigned char v; } float4_e2m1x2_t; -#endif -#include -#if defined(__CCE_AICORE__) && defined(PTOAS_ENABLE_CCE_PRINT) -#include -#endif - -#if !defined(__CCE_AICORE__) && !defined(TMRGSORT_HPP) -struct MrgSortExecutedNumList { - uint16_t mrgSortList0; - uint16_t mrgSortList1; - uint16_t mrgSortList2; - uint16_t mrgSortList3; -}; -#endif -#ifndef __CPU_SIM -#include "acl/acl.h" -#endif - -extern "C" __global__ [aicore] void vshl_shift_boundary_kernel(__gm__ uint16_t *v1, - __gm__ uint16_t *v2, - __gm__ uint16_t *v3); - -void LaunchVshl_shift_boundary_kernel(uint16_t *v1, uint16_t *v2, uint16_t *v3, - void *stream) { - vshl_shift_boundary_kernel<<<1, nullptr, stream>>>((__gm__ uint16_t *)v1, - (__gm__ uint16_t *)v2, - (__gm__ uint16_t *)v3); -} diff --git a/test/vpto/cases/micro-op/binary-vector/vshl-shift-boundary/main.cpp b/test/vpto/cases/micro-op/binary-vector/vshl-shift-boundary/main.cpp deleted file mode 100644 index 13d114f38..000000000 --- a/test/vpto/cases/micro-op/binary-vector/vshl-shift-boundary/main.cpp +++ /dev/null @@ -1,103 +0,0 @@ -// Copyright (c) 2026 Huawei Technologies Co., Ltd. -// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). -// Please refer to the License for details. You may not use this file except in compliance with the License. 
-// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -// See LICENSE in the root of the software repository for the full text of the License. - -// ----------------------------------------------------------------------------- -// case: micro-op/binary-vector/vshl-shift-boundary -// family: binary-vector -// target_ops: pto.vshl -// scenarios: core-i16-unsigned, full-mask -// NOTE: bulk-generated coverage skeleton. Parser/verifier/lowering failure is -// still a valid test conclusion in the current coverage-first phase. -// ----------------------------------------------------------------------------- -/** -Copyright (c) 2025 Huawei Technologies Co., Ltd. -*/ - -#include "test_common.h" -#include "acl/acl.h" -#include -#include - -using namespace PtoTestCommon; - -#define ACL_CHECK(expr) \ - do { \ - const aclError _ret = (expr); \ - if (_ret != ACL_SUCCESS) { \ - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", #expr, \ - (int)_ret, __FILE__, __LINE__); \ - rc = 1; \ - goto cleanup; \ - } \ - } while (0) - -void LaunchVshl_shift_boundary_kernel(uint16_t *v1, uint16_t *v2, uint16_t *v3, - void *stream); - -int main() { - size_t elemCount_v1 = 1024; - size_t fileSize_v1 = elemCount_v1 * sizeof(uint16_t); - size_t elemCount_v2 = 1024; - size_t fileSize_v2 = elemCount_v2 * sizeof(uint16_t); - size_t elemCount_v3 = 1024; - size_t fileSize_v3 = elemCount_v3 * sizeof(uint16_t); - uint16_t *v1Host = nullptr; - uint16_t *v1Device = nullptr; - uint16_t *v2Host = nullptr; - uint16_t *v2Device = nullptr; - uint16_t *v3Host = nullptr; - uint16_t *v3Device = nullptr; - int rc = 0; - bool aclInited = false; - bool deviceSet = false; - int deviceId = 0; - aclrtStream stream = nullptr; - - ACL_CHECK(aclInit(nullptr)); - aclInited = true; - if (const char *envDevice = std::getenv("ACL_DEVICE_ID")) - deviceId = 
std::atoi(envDevice); - ACL_CHECK(aclrtSetDevice(deviceId)); - deviceSet = true; - ACL_CHECK(aclrtCreateStream(&stream)); - ACL_CHECK(aclrtMallocHost((void **)(&v1Host), fileSize_v1)); - ACL_CHECK(aclrtMallocHost((void **)(&v2Host), fileSize_v2)); - ACL_CHECK(aclrtMallocHost((void **)(&v3Host), fileSize_v3)); - ACL_CHECK(aclrtMalloc((void **)&v1Device, fileSize_v1, ACL_MEM_MALLOC_HUGE_FIRST)); - ACL_CHECK(aclrtMalloc((void **)&v2Device, fileSize_v2, ACL_MEM_MALLOC_HUGE_FIRST)); - ACL_CHECK(aclrtMalloc((void **)&v3Device, fileSize_v3, ACL_MEM_MALLOC_HUGE_FIRST)); - ReadFile("./v1.bin", fileSize_v1, v1Host, fileSize_v1); - ReadFile("./v2.bin", fileSize_v2, v2Host, fileSize_v2); - ReadFile("./v3.bin", fileSize_v3, v3Host, fileSize_v3); - ACL_CHECK(aclrtMemcpy(v1Device, fileSize_v1, v1Host, fileSize_v1, - ACL_MEMCPY_HOST_TO_DEVICE)); - ACL_CHECK(aclrtMemcpy(v2Device, fileSize_v2, v2Host, fileSize_v2, - ACL_MEMCPY_HOST_TO_DEVICE)); - ACL_CHECK(aclrtMemcpy(v3Device, fileSize_v3, v3Host, fileSize_v3, - ACL_MEMCPY_HOST_TO_DEVICE)); - LaunchVshl_shift_boundary_kernel(v1Device, v2Device, v3Device, stream); - ACL_CHECK(aclrtSynchronizeStream(stream)); - ACL_CHECK(aclrtMemcpy(v3Host, fileSize_v3, v3Device, fileSize_v3, - ACL_MEMCPY_DEVICE_TO_HOST)); - WriteFile("./v3.bin", v3Host, fileSize_v3); - -cleanup: - aclrtFree(v1Device); - aclrtFree(v2Device); - aclrtFree(v3Device); - aclrtFreeHost(v1Host); - aclrtFreeHost(v2Host); - aclrtFreeHost(v3Host); - if (stream != nullptr) - aclrtDestroyStream(stream); - if (deviceSet) - aclrtResetDevice(deviceId); - if (aclInited) - aclFinalize(); - return rc; -} diff --git a/test/vpto/cases/micro-op/binary-vector/vshl/compare.py b/test/vpto/cases/micro-op/binary-vector/vshl/compare.py index 7006bca77..16d3d5566 100755 --- a/test/vpto/cases/micro-op/binary-vector/vshl/compare.py +++ b/test/vpto/cases/micro-op/binary-vector/vshl/compare.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python3 +#!/usr/bin/python3 # Copyright (c) 2026 Huawei Technologies Co., 
Ltd. # This program is free software, you can redistribute it and/or modify it under the terms and conditions of # CANN Open Software License Agreement Version 2.0 (the "License"). @@ -7,38 +7,47 @@ # INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. # See LICENSE in the root of the software repository for the full text of the License. -# case: micro-op/binary-vector/vshl -# family: binary-vector -# target_ops: pto.vshl -# scenarios: core-i16-unsigned, full-mask -# NOTE: bulk-generated coverage skeleton. +# Merged vshl test case. -import os -import sys +import os,sys import numpy as np +def _cmp(golden,output,dtype,eps,count=-1): + if not os.path.exists(golden) or not os.path.exists(output): return False + kw={} if count<0 else {"count":count} + g=np.fromfile(golden,dtype=dtype,**kw) + o=np.fromfile(output,dtype=dtype,**kw) + return g.shape==o.shape and np.allclose(g,o,atol=eps,rtol=eps,equal_nan=True) -def compare_bin_prefix(golden_path, output_path, dtype, eps, count): - if not os.path.exists(golden_path) or not os.path.exists(output_path): - return False - golden = np.fromfile(golden_path, dtype=dtype, count=count) - output = np.fromfile(output_path, dtype=dtype, count=count) - return golden.shape == output.shape and np.allclose( - golden, output, atol=eps, rtol=eps, equal_nan=True - ) - +def _cmpeq(golden,output,dtype): + if not os.path.exists(golden) or not os.path.exists(output): return False + g=np.fromfile(golden,dtype=dtype) + o=np.fromfile(output,dtype=dtype) + return g.shape==o.shape and np.array_equal(g,o) def main(): - strict = os.getenv("COMPARE_STRICT", "1") != "0" - ok = compare_bin_prefix("golden_v3.bin", "v3.bin", np.uint16, 0, 1024) - if not ok: - if strict: - print("[ERROR] compare failed") - sys.exit(2) - print("[WARN] compare failed (non-gating)") + strict=os.getenv('COMPARE_STRICT','1')!='0' + failed=[] + if not (_cmp("golden_v3.bin","v3.bin",np.uint16,0,1024)): + failed.append('f32') + 
print('[ERROR] compare failed: f32') + else: + print('[INFO] f32: passed') + if not (_cmp("golden_v3_i32_unsigned.bin","v3_i32_unsigned.bin",np.uint32,0,1024)): + failed.append('i32_unsigned') + print('[ERROR] compare failed: i32_unsigned') + else: + print('[INFO] i32_unsigned: passed') + if not (_cmpeq("golden_v3_shift_boundary.bin","v3_shift_boundary.bin",np.uint16)): + failed.append('shift_boundary') + print('[ERROR] compare failed: shift_boundary') + else: + print('[INFO] shift_boundary: passed') + if failed: + if strict: print(f"[ERROR] {len(failed)} variant(s) failed"); sys.exit(2) + print(f"[WARN] {len(failed)} variant(s) failed (non-gating)") return - print("[INFO] compare passed") - + print("[INFO] compare passed (all 3 variants)") -if __name__ == "__main__": +if __name__=="__main__": main() diff --git a/test/vpto/cases/micro-op/binary-vector/vshl/golden.py b/test/vpto/cases/micro-op/binary-vector/vshl/golden.py index ed6ca93bc..27dc00d6b 100755 --- a/test/vpto/cases/micro-op/binary-vector/vshl/golden.py +++ b/test/vpto/cases/micro-op/binary-vector/vshl/golden.py @@ -7,44 +7,78 @@ # INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. # See LICENSE in the root of the software repository for the full text of the License. -# case: micro-op/binary-vector/vshl -# family: binary-vector -# target_ops: pto.vshl -# scenarios: core-i16-unsigned, full-mask -# NOTE: bulk-generated coverage skeleton. -# coding=utf-8 +# Merged vshl test case. 
import argparse from pathlib import Path - import numpy as np - -ELEMS = 1024 +ROWS = 32 +COLS = 32 SEED = 19 +LOGICAL_ELEMS = 1000 +OUT_SENTINEL = np.float32(-123.25) + +def f32_to_bf16_bits(v): + w=v.astype(np.float32,copy=False).view(np.uint32) + r=np.uint32(0x7FFF)+((w>>16)&np.uint32(1)) + return ((w+r)>>16).astype(np.uint16) +def bf16_bits_to_f32(b): + return (b.astype(np.uint32)<<16).view(np.float32) -def generate(output_dir: Path, seed: int) -> None: - rng = np.random.default_rng(seed) - v1 = rng.integers(0, 0x10000, size=ELEMS, dtype=np.uint16) - v2 = rng.integers(0, 16, size=ELEMS, dtype=np.uint16) - v3 = np.zeros(ELEMS, dtype=np.uint16) - golden_v3 = np.left_shift(v1, v2 & np.uint16(15)).astype(np.uint16, copy=False) +# ---- f32 ---- +def gen_f32(out, rng): + v1=rng.integers(0,0x10000,size=(ROWS,COLS),dtype=np.uint16) + v2=rng.integers(0,16,size=(ROWS,COLS),dtype=np.uint16) + g=np.left_shift(v1.astype(np.uint16),v2.astype(np.uint16)) + v3=np.zeros((ROWS,COLS),dtype=np.uint16) + v1.reshape(-1).tofile(out/"v1.bin") + v2.reshape(-1).tofile(out/"v2.bin") + v3.reshape(-1).tofile(out/"v3.bin") + g.reshape(-1).tofile(out/"golden_v3.bin") - output_dir.mkdir(parents=True, exist_ok=True) - v1.tofile(output_dir / "v1.bin") - v2.tofile(output_dir / "v2.bin") - v3.tofile(output_dir / "v3.bin") - golden_v3.tofile(output_dir / "golden_v3.bin") +# ---- i32_unsigned ---- +def gen_i32_unsigned(out, rng): + v1=rng.integers(0,0x10000,size=(ROWS,COLS),dtype=np.uint32) + v2=rng.integers(0,32,size=(ROWS,COLS),dtype=np.uint32) + g=np.left_shift(v1.astype(np.uint32),v2.astype(np.uint32)) + v3=np.zeros((ROWS,COLS),dtype=np.uint32) + v1.reshape(-1).tofile(out/"v1_i32_unsigned.bin") + v2.reshape(-1).tofile(out/"v2_i32_unsigned.bin") + v3.reshape(-1).tofile(out/"v3_i32_unsigned.bin") + g.reshape(-1).tofile(out/"golden_v3_i32_unsigned.bin") +# ---- shift_boundary ---- +def gen_shift_boundary(out, rng): + elems=ROWS*COLS + 
lhs_pat=np.array([0,1,15,16,255,256,4095,4096,32767,32768,65535,0],dtype=np.uint16) + rhs_pat=np.array([0,1,1,1,2,2,3,3,4,5,6,15],dtype=np.uint16) + reps=elems//lhs_pat.size + v1=np.resize(lhs_pat,elems) + v2=np.resize(rhs_pat,elems) + g=np.left_shift(v1.astype(np.uint16),v2.astype(np.uint16)) + v3=np.zeros(elems,dtype=np.uint16) + v1.tofile(out/"v1_shift_boundary.bin") + v2.tofile(out/"v2_shift_boundary.bin") + v3.tofile(out/"v3_shift_boundary.bin") + g.tofile(out/"golden_v3_shift_boundary.bin") -def main() -> None: - parser = argparse.ArgumentParser() - parser.add_argument("--output-dir", type=Path, default=Path(".")) - parser.add_argument("--seed", type=int, default=SEED) - args = parser.parse_args() - generate(args.output_dir, args.seed) +GENERATORS = [ + gen_f32, + gen_i32_unsigned, + gen_shift_boundary, +] +def main(): + p=argparse.ArgumentParser() + p.add_argument("--output-dir",type=Path,default=Path(".")) + p.add_argument("--seed",type=int,default=SEED) + a=p.parse_args() + rng=np.random.default_rng(a.seed) + out=a.output_dir; out.mkdir(parents=True,exist_ok=True) + for gen in GENERATORS: + gen(out,rng) -if __name__ == "__main__": +if __name__=="__main__": main() diff --git a/test/vpto/cases/micro-op/binary-vector/vshl/kernel.pto b/test/vpto/cases/micro-op/binary-vector/vshl/kernel.pto index ebe4448bd..cdd57ddf1 100644 --- a/test/vpto/cases/micro-op/binary-vector/vshl/kernel.pto +++ b/test/vpto/cases/micro-op/binary-vector/vshl/kernel.pto @@ -1,55 +1,139 @@ -// ----------------------------------------------------------------------------- -// case: micro-op/binary-vector/vshl -// family: binary-vector -// target_ops: pto.vshl -// scenarios: core-i16-unsigned, full-mask -// NOTE: bulk-generated coverage skeleton. Parser/verifier/lowering failure is -// still a valid test conclusion in the current coverage-first phase. 
-// ----------------------------------------------------------------------------- module attributes {pto.target_arch = "a5", pto.kernel_kind = #pto.kernel_kind} { - func.func @vshl_i16_unsigned_kernel(%arg0: !pto.ptr, %arg1: !pto.ptr, - %arg2: !pto.ptr) attributes {pto.kernel} { - %c0 = arith.constant 0 : index - %c0_i64 = arith.constant 0 : i64 - %c1_i64 = arith.constant 1 : i64 - %c32_i64 = arith.constant 32 : i64 - %c64_i64 = arith.constant 64 : i64 - %c1024 = arith.constant 1024 : index - %c128 = arith.constant 128 : index - %c2048_i64 = arith.constant 2048 : i64 - %c4096_i64 = arith.constant 4096 : i64 - - %ub_lhs = pto.castptr %c0_i64 : i64 -> !pto.ptr - %ub_rhs = pto.castptr %c2048_i64 : i64 -> !pto.ptr - %ub_out = pto.castptr %c4096_i64 : i64 -> !pto.ptr - - %false = arith.constant false - pto.mte_gm_ub %arg0, %ub_lhs, %c0_i64, %c64_i64 - nburst(%c32_i64, %c64_i64, %c64_i64) + func.func @vshl_deep_merged_kernel(%arg0: !pto.ptr, %arg1: !pto.ptr, %arg2: !pto.ptr, %arg3: !pto.ptr, %arg4: !pto.ptr, %arg5: !pto.ptr, %arg6: !pto.ptr, %arg7: !pto.ptr, %arg8: !pto.ptr) attributes {pto.kernel} { + // merged from vshl_i16_unsigned_kernel via LaunchVshl_i16_unsigned_kernel + + %c0_m0 = arith.constant 0 : index + %c0_i64_m0 = arith.constant 0 : i64 + %c1_i64_m0 = arith.constant 1 : i64 + %c32_i64_m0 = arith.constant 32 : i64 + %c64_i64_m0 = arith.constant 64 : i64 + %c1024_m0 = arith.constant 1024 : index + %c128_m0 = arith.constant 128 : index + %c2048_i64_m0 = arith.constant 2048 : i64 + %c4096_i64_m0 = arith.constant 4096 : i64 + + %ub_lhs_m0 = pto.castptr %c0_i64_m0 : i64 -> !pto.ptr + %ub_rhs_m0 = pto.castptr %c2048_i64_m0 : i64 -> !pto.ptr + %ub_out_m0 = pto.castptr %c4096_i64_m0 : i64 -> !pto.ptr + + %false_m0 = arith.constant false + pto.mte_gm_ub %arg0, %ub_lhs_m0, %c0_i64_m0, %c64_i64_m0 + nburst(%c32_i64_m0, %c64_i64_m0, %c64_i64_m0) : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 - pto.mte_gm_ub %arg1, %ub_rhs, %c0_i64, %c64_i64 - nburst(%c32_i64, %c64_i64, 
%c64_i64) + pto.mte_gm_ub %arg1, %ub_rhs_m0, %c0_i64_m0, %c64_i64_m0 + nburst(%c32_i64_m0, %c64_i64_m0, %c64_i64_m0) : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 pto.set_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] pto.wait_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] pto.vecscope { - %mask = pto.pset_b16 "PAT_ALL" : !pto.mask - scf.for %offset = %c0 to %c1024 step %c128 { - %lhs = pto.vlds %ub_lhs[%offset] : !pto.ptr -> !pto.vreg<128xui16> - %rhs = pto.vlds %ub_rhs[%offset] : !pto.ptr -> !pto.vreg<128xui16> - %out = pto.vshl %lhs, %rhs, %mask : !pto.vreg<128xui16>, !pto.vreg<128xui16>, !pto.mask -> !pto.vreg<128xui16> - pto.vsts %out, %ub_out[%offset], %mask : !pto.vreg<128xui16>, !pto.ptr, !pto.mask + %mask_m0 = pto.pset_b16 "PAT_ALL" : !pto.mask + scf.for %offset_m0 = %c0_m0 to %c1024_m0 step %c128_m0 { + %lhs_m0 = pto.vlds %ub_lhs_m0[%offset_m0] : !pto.ptr -> !pto.vreg<128xui16> + %rhs_m0 = pto.vlds %ub_rhs_m0[%offset_m0] : !pto.ptr -> !pto.vreg<128xui16> + %out_m0 = pto.vshl %lhs_m0, %rhs_m0, %mask_m0 : !pto.vreg<128xui16>, !pto.vreg<128xui16>, !pto.mask -> !pto.vreg<128xui16> + pto.vsts %out_m0, %ub_out_m0[%offset_m0], %mask_m0 : !pto.vreg<128xui16>, !pto.ptr, !pto.mask } } pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.mte_ub_gm %ub_out, %arg2, %c64_i64 - nburst(%c32_i64, %c64_i64, %c64_i64) + pto.mte_ub_gm %ub_out_m0, %arg2, %c64_i64_m0 + nburst(%c32_i64_m0, %c64_i64_m0, %c64_i64_m0) : !pto.ptr, !pto.ptr, i64, i64, i64, i64 pto.barrier #pto.pipe + + // merged from vshl_i32_unsigned_kernel via LaunchVshl_i32_unsigned_kernel + + %c0_m1 = arith.constant 0 : index + %c0_i64_m1 = arith.constant 0 : i64 + %c1_i64_m1 = arith.constant 1 : i64 + %c32_i64_m1 = arith.constant 32 : i64 + %c64_i64_m1 = arith.constant 64 : i64 + %c128_i64_m1 = arith.constant 128 : i64 + %c64_m1 = arith.constant 64 : index + %c1024_m1 = arith.constant 1024 : index + %c128_m1 = arith.constant 128 : index + %c4096_i64_m1 = arith.constant 
4096 : i64 + %c8192_i64_m1 = arith.constant 8192 : i64 + + %ub_lhs_m1 = pto.castptr %c0_i64_m1 : i64 -> !pto.ptr + %ub_rhs_m1 = pto.castptr %c4096_i64_m1 : i64 -> !pto.ptr + %ub_out_m1 = pto.castptr %c8192_i64_m1 : i64 -> !pto.ptr + + %false_m1 = arith.constant false + pto.mte_gm_ub %arg3, %ub_lhs_m1, %c0_i64_m1, %c128_i64_m1 + nburst(%c32_i64_m1, %c128_i64_m1, %c128_i64_m1) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 + pto.mte_gm_ub %arg4, %ub_rhs_m1, %c0_i64_m1, %c128_i64_m1 + nburst(%c32_i64_m1, %c128_i64_m1, %c128_i64_m1) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 + + pto.set_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] + pto.wait_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] + + pto.vecscope { + %mask_m1 = pto.pset_b32 "PAT_ALL" : !pto.mask + scf.for %offset_m1 = %c0_m1 to %c1024_m1 step %c64_m1 { + %lhs_m1 = pto.vlds %ub_lhs_m1[%offset_m1] : !pto.ptr -> !pto.vreg<64xui32> + %rhs_m1 = pto.vlds %ub_rhs_m1[%offset_m1] : !pto.ptr -> !pto.vreg<64xui32> + %out_m1 = pto.vshl %lhs_m1, %rhs_m1, %mask_m1 : !pto.vreg<64xui32>, !pto.vreg<64xui32>, !pto.mask -> !pto.vreg<64xui32> + pto.vsts %out_m1, %ub_out_m1[%offset_m1], %mask_m1 : !pto.vreg<64xui32>, !pto.ptr, !pto.mask + } + } + + pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] + pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] + pto.mte_ub_gm %ub_out_m1, %arg5, %c128_i64_m1 + nburst(%c32_i64_m1, %c128_i64_m1, %c128_i64_m1) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64 + pto.barrier #pto.pipe + + // merged from vshl_shift_boundary_kernel via LaunchVshl_shift_boundary_kernel + + %c0_m2 = arith.constant 0 : index + %c0_i64_m2 = arith.constant 0 : i64 + %c1_i64_m2 = arith.constant 1 : i64 + %c32_i64_m2 = arith.constant 32 : i64 + %c64_i64_m2 = arith.constant 64 : i64 + %c1024_m2 = arith.constant 1024 : index + %c128_m2 = arith.constant 128 : index + %c2048_i64_m2 = arith.constant 2048 : i64 + %c4096_i64_m2 = arith.constant 4096 : i64 + + %ub_lhs_m2 = pto.castptr %c0_i64_m2 : i64 -> !pto.ptr + %ub_rhs_m2 = pto.castptr 
%c2048_i64_m2 : i64 -> !pto.ptr + %ub_out_m2 = pto.castptr %c4096_i64_m2 : i64 -> !pto.ptr + + %false_m2 = arith.constant false + pto.mte_gm_ub %arg6, %ub_lhs_m2, %c0_i64_m2, %c64_i64_m2 + nburst(%c32_i64_m2, %c64_i64_m2, %c64_i64_m2) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 + pto.mte_gm_ub %arg7, %ub_rhs_m2, %c0_i64_m2, %c64_i64_m2 + nburst(%c32_i64_m2, %c64_i64_m2, %c64_i64_m2) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 + + pto.set_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] + pto.wait_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] + + pto.vecscope { + %mask_m2 = pto.pset_b16 "PAT_ALL" : !pto.mask + scf.for %offset_m2 = %c0_m2 to %c1024_m2 step %c128_m2 { + %lhs_m2 = pto.vlds %ub_lhs_m2[%offset_m2] : !pto.ptr -> !pto.vreg<128xui16> + %rhs_m2 = pto.vlds %ub_rhs_m2[%offset_m2] : !pto.ptr -> !pto.vreg<128xui16> + %out_m2 = pto.vshl %lhs_m2, %rhs_m2, %mask_m2 : !pto.vreg<128xui16>, !pto.vreg<128xui16>, !pto.mask -> !pto.vreg<128xui16> + pto.vsts %out_m2, %ub_out_m2[%offset_m2], %mask_m2 : !pto.vreg<128xui16>, !pto.ptr, !pto.mask + } + } + + pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] + pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] + pto.mte_ub_gm %ub_out_m2, %arg8, %c64_i64_m2 + nburst(%c32_i64_m2, %c64_i64_m2, %c64_i64_m2) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64 + pto.barrier #pto.pipe + return } } diff --git a/test/vpto/cases/micro-op/binary-vector/vshl/launch.cpp b/test/vpto/cases/micro-op/binary-vector/vshl/launch.cpp index 3abb0a4bf..265d3cfa5 100644 --- a/test/vpto/cases/micro-op/binary-vector/vshl/launch.cpp +++ b/test/vpto/cases/micro-op/binary-vector/vshl/launch.cpp @@ -5,19 +5,10 @@ // THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, // INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. // See LICENSE in the root of the software repository for the full text of the License. 
- -// ----------------------------------------------------------------------------- -// case: micro-op/binary-vector/vshl -// family: binary-vector -// target_ops: pto.vshl -// scenarios: core-i16-unsigned, full-mask -// NOTE: bulk-generated coverage skeleton. Parser/verifier/lowering failure is -// still a valid test conclusion in the current coverage-first phase. -// ----------------------------------------------------------------------------- +// Merged launch wrappers #ifndef __VEC_SCOPE__ #define __VEC_SCOPE__ #endif - #if defined(__CCE_AICORE__) && defined(__NPU_ARCH__) && (__NPU_ARCH__ == 2201) typedef struct { unsigned char v; } hifloat8_t; typedef struct { unsigned char v; } float8_e4m3_t; @@ -30,14 +21,8 @@ typedef struct { unsigned char v; } float4_e2m1x2_t; #if defined(__CCE_AICORE__) && defined(PTOAS_ENABLE_CCE_PRINT) #include #endif - #if !defined(__CCE_AICORE__) && !defined(TMRGSORT_HPP) -struct MrgSortExecutedNumList { - uint16_t mrgSortList0; - uint16_t mrgSortList1; - uint16_t mrgSortList2; - uint16_t mrgSortList3; -}; +struct MrgSortExecutedNumList { uint16_t mrgSortList0,mrgSortList1,mrgSortList2,mrgSortList3; }; #endif #ifndef __CPU_SIM #include "acl/acl.h" @@ -47,9 +32,26 @@ extern "C" __global__ [aicore] void vshl_i16_unsigned_kernel(__gm__ uint16_t *v1 __gm__ uint16_t *v2, __gm__ uint16_t *v3); -void LaunchVshl_i16_unsigned_kernel(uint16_t *v1, uint16_t *v2, uint16_t *v3, - void *stream) { - vshl_i16_unsigned_kernel<<<1, nullptr, stream>>>((__gm__ uint16_t *)v1, - (__gm__ uint16_t *)v2, - (__gm__ uint16_t *)v3); +extern "C" __global__ [aicore] void vshl_deep_merged_kernel( + __gm__ uint16_t * arg0, + __gm__ uint16_t * arg1, + __gm__ uint16_t * arg2, + __gm__ uint32_t * arg3, + __gm__ uint32_t * arg4, + __gm__ uint32_t * arg5, + __gm__ uint16_t * arg6, + __gm__ uint16_t * arg7, + __gm__ uint16_t * arg8); + +void LaunchVshlDeepMerged(uint16_t * p0, uint16_t * p1, uint16_t * p2, uint32_t * p3, uint32_t * p4, uint32_t * p5, uint16_t * p6, 
uint16_t * p7, uint16_t * p8, void *stream) { + vshl_deep_merged_kernel<<<1, nullptr, stream>>>( + (__gm__ uint16_t *)p0, + (__gm__ uint16_t *)p1, + (__gm__ uint16_t *)p2, + (__gm__ uint32_t *)p3, + (__gm__ uint32_t *)p4, + (__gm__ uint32_t *)p5, + (__gm__ uint16_t *)p6, + (__gm__ uint16_t *)p7, + (__gm__ uint16_t *)p8); } diff --git a/test/vpto/cases/micro-op/binary-vector/vshl/main.cpp b/test/vpto/cases/micro-op/binary-vector/vshl/main.cpp index ce45e4cf1..bc367c9ee 100644 --- a/test/vpto/cases/micro-op/binary-vector/vshl/main.cpp +++ b/test/vpto/cases/micro-op/binary-vector/vshl/main.cpp @@ -5,99 +5,117 @@ // THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, // INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. // See LICENSE in the root of the software repository for the full text of the License. - -// ----------------------------------------------------------------------------- -// case: micro-op/binary-vector/vshl -// family: binary-vector -// target_ops: pto.vshl -// scenarios: core-i16-unsigned, full-mask -// NOTE: bulk-generated coverage skeleton. Parser/verifier/lowering failure is -// still a valid test conclusion in the current coverage-first phase. -// ----------------------------------------------------------------------------- -/** -Copyright (c) 2025 Huawei Technologies Co., Ltd. -*/ +// Merged vshl test case. 
#include "test_common.h" #include "acl/acl.h" #include #include - +#include using namespace PtoTestCommon; -#define ACL_CHECK(expr) \ - do { \ - const aclError _ret = (expr); \ - if (_ret != ACL_SUCCESS) { \ - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", #expr, \ - (int)_ret, __FILE__, __LINE__); \ - rc = 1; \ - goto cleanup; \ - } \ - } while (0) +#define ACL_CHECK(expr) do { const aclError _r=(expr); if(_r!=ACL_SUCCESS){std::fprintf(stderr,"[ERROR] %s:%d acle=%d\n",#expr,__LINE__,(int)_r);rc=1;goto cleanup;} }while(0) +#define FCK(expr,path) do{if(!(expr)){std::fprintf(stderr,"[ERROR] file:%s\n",path);rc=1;goto cleanup;}}while(0) + -void LaunchVshl_i16_unsigned_kernel(uint16_t *v1, uint16_t *v2, uint16_t *v3, - void *stream); +void LaunchVshlDeepMerged(uint16_t * p0, uint16_t * p1, uint16_t * p2, uint32_t * p3, uint32_t * p4, uint32_t * p5, uint16_t * p6, uint16_t * p7, uint16_t * p8, void *stream); int main() { - size_t elemCount_v1 = 1024; - size_t fileSize_v1 = elemCount_v1 * sizeof(uint16_t); - size_t elemCount_v2 = 1024; - size_t fileSize_v2 = elemCount_v2 * sizeof(uint16_t); - size_t elemCount_v3 = 1024; - size_t fileSize_v3 = elemCount_v3 * sizeof(uint16_t); - uint16_t *v1Host = nullptr; - uint16_t *v1Device = nullptr; - uint16_t *v2Host = nullptr; - uint16_t *v2Device = nullptr; - uint16_t *v3Host = nullptr; - uint16_t *v3Device = nullptr; - int rc = 0; - bool aclInited = false; - bool deviceSet = false; - int deviceId = 0; - aclrtStream stream = nullptr; + constexpr size_t SZ_f32 = 2048; + constexpr size_t SZ_i32_unsigned = 4096; + constexpr size_t SZ_shift_boundary = 2048; - ACL_CHECK(aclInit(nullptr)); - aclInited = true; - if (const char *envDevice = std::getenv("ACL_DEVICE_ID")) - deviceId = std::atoi(envDevice); - ACL_CHECK(aclrtSetDevice(deviceId)); - deviceSet = true; + uint16_t *h_f32_v1=nullptr, *d_f32_v1=nullptr; + uint16_t *h_f32_v2=nullptr, *d_f32_v2=nullptr; + uint16_t *h_f32_v3=nullptr, *d_f32_v3=nullptr; + uint32_t 
*h_i32_unsigned_v1=nullptr, *d_i32_unsigned_v1=nullptr; + uint32_t *h_i32_unsigned_v2=nullptr, *d_i32_unsigned_v2=nullptr; + uint32_t *h_i32_unsigned_v3=nullptr, *d_i32_unsigned_v3=nullptr; + uint16_t *h_shift_boundary_v1=nullptr, *d_shift_boundary_v1=nullptr; + uint16_t *h_shift_boundary_v2=nullptr, *d_shift_boundary_v2=nullptr; + uint16_t *h_shift_boundary_v3=nullptr, *d_shift_boundary_v3=nullptr; + int rc=0; bool aclInited=false,deviceSet=false; int deviceId=0; aclrtStream stream=nullptr; size_t fsz=0; + ACL_CHECK(aclInit(nullptr)); aclInited=true; + if(const char*e=std::getenv("ACL_DEVICE_ID")) deviceId=std::atoi(e); + ACL_CHECK(aclrtSetDevice(deviceId)); deviceSet=true; ACL_CHECK(aclrtCreateStream(&stream)); - ACL_CHECK(aclrtMallocHost((void **)(&v1Host), fileSize_v1)); - ACL_CHECK(aclrtMallocHost((void **)(&v2Host), fileSize_v2)); - ACL_CHECK(aclrtMallocHost((void **)(&v3Host), fileSize_v3)); - ACL_CHECK(aclrtMalloc((void **)&v1Device, fileSize_v1, ACL_MEM_MALLOC_HUGE_FIRST)); - ACL_CHECK(aclrtMalloc((void **)&v2Device, fileSize_v2, ACL_MEM_MALLOC_HUGE_FIRST)); - ACL_CHECK(aclrtMalloc((void **)&v3Device, fileSize_v3, ACL_MEM_MALLOC_HUGE_FIRST)); - ReadFile("./v1.bin", fileSize_v1, v1Host, fileSize_v1); - ReadFile("./v2.bin", fileSize_v2, v2Host, fileSize_v2); - ReadFile("./v3.bin", fileSize_v3, v3Host, fileSize_v3); - ACL_CHECK(aclrtMemcpy(v1Device, fileSize_v1, v1Host, fileSize_v1, - ACL_MEMCPY_HOST_TO_DEVICE)); - ACL_CHECK(aclrtMemcpy(v2Device, fileSize_v2, v2Host, fileSize_v2, - ACL_MEMCPY_HOST_TO_DEVICE)); - ACL_CHECK(aclrtMemcpy(v3Device, fileSize_v3, v3Host, fileSize_v3, - ACL_MEMCPY_HOST_TO_DEVICE)); - LaunchVshl_i16_unsigned_kernel(v1Device, v2Device, v3Device, stream); + ACL_CHECK(aclrtMallocHost((void**)&h_f32_v1,SZ_f32)); + ACL_CHECK(aclrtMallocHost((void**)&h_f32_v2,SZ_f32)); + ACL_CHECK(aclrtMallocHost((void**)&h_f32_v3,SZ_f32)); + ACL_CHECK(aclrtMallocHost((void**)&h_i32_unsigned_v1,SZ_i32_unsigned)); + 
ACL_CHECK(aclrtMallocHost((void**)&h_i32_unsigned_v2,SZ_i32_unsigned)); + ACL_CHECK(aclrtMallocHost((void**)&h_i32_unsigned_v3,SZ_i32_unsigned)); + ACL_CHECK(aclrtMallocHost((void**)&h_shift_boundary_v1,SZ_shift_boundary)); + ACL_CHECK(aclrtMallocHost((void**)&h_shift_boundary_v2,SZ_shift_boundary)); + ACL_CHECK(aclrtMallocHost((void**)&h_shift_boundary_v3,SZ_shift_boundary)); + ACL_CHECK(aclrtMalloc((void**)&d_f32_v1,SZ_f32,ACL_MEM_MALLOC_HUGE_FIRST)); + ACL_CHECK(aclrtMalloc((void**)&d_f32_v2,SZ_f32,ACL_MEM_MALLOC_HUGE_FIRST)); + ACL_CHECK(aclrtMalloc((void**)&d_f32_v3,SZ_f32,ACL_MEM_MALLOC_HUGE_FIRST)); + ACL_CHECK(aclrtMalloc((void**)&d_i32_unsigned_v1,SZ_i32_unsigned,ACL_MEM_MALLOC_HUGE_FIRST)); + ACL_CHECK(aclrtMalloc((void**)&d_i32_unsigned_v2,SZ_i32_unsigned,ACL_MEM_MALLOC_HUGE_FIRST)); + ACL_CHECK(aclrtMalloc((void**)&d_i32_unsigned_v3,SZ_i32_unsigned,ACL_MEM_MALLOC_HUGE_FIRST)); + ACL_CHECK(aclrtMalloc((void**)&d_shift_boundary_v1,SZ_shift_boundary,ACL_MEM_MALLOC_HUGE_FIRST)); + ACL_CHECK(aclrtMalloc((void**)&d_shift_boundary_v2,SZ_shift_boundary,ACL_MEM_MALLOC_HUGE_FIRST)); + ACL_CHECK(aclrtMalloc((void**)&d_shift_boundary_v3,SZ_shift_boundary,ACL_MEM_MALLOC_HUGE_FIRST)); + fsz=SZ_f32; FCK(ReadFile("v1.bin",fsz,h_f32_v1,SZ_f32)&&fsz==SZ_f32,"v1.bin"); + fsz=SZ_f32; FCK(ReadFile("v2.bin",fsz,h_f32_v2,SZ_f32)&&fsz==SZ_f32,"v2.bin"); + fsz=SZ_f32; FCK(ReadFile("v3.bin",fsz,h_f32_v3,SZ_f32)&&fsz==SZ_f32,"v3.bin"); + fsz=SZ_i32_unsigned; FCK(ReadFile("v1_i32_unsigned.bin",fsz,h_i32_unsigned_v1,SZ_i32_unsigned)&&fsz==SZ_i32_unsigned,"v1_i32_unsigned.bin"); + fsz=SZ_i32_unsigned; FCK(ReadFile("v2_i32_unsigned.bin",fsz,h_i32_unsigned_v2,SZ_i32_unsigned)&&fsz==SZ_i32_unsigned,"v2_i32_unsigned.bin"); + fsz=SZ_i32_unsigned; FCK(ReadFile("v3_i32_unsigned.bin",fsz,h_i32_unsigned_v3,SZ_i32_unsigned)&&fsz==SZ_i32_unsigned,"v3_i32_unsigned.bin"); + fsz=SZ_shift_boundary; 
FCK(ReadFile("v1_shift_boundary.bin",fsz,h_shift_boundary_v1,SZ_shift_boundary)&&fsz==SZ_shift_boundary,"v1_shift_boundary.bin"); + fsz=SZ_shift_boundary; FCK(ReadFile("v2_shift_boundary.bin",fsz,h_shift_boundary_v2,SZ_shift_boundary)&&fsz==SZ_shift_boundary,"v2_shift_boundary.bin"); + fsz=SZ_shift_boundary; FCK(ReadFile("v3_shift_boundary.bin",fsz,h_shift_boundary_v3,SZ_shift_boundary)&&fsz==SZ_shift_boundary,"v3_shift_boundary.bin"); + ACL_CHECK(aclrtMemcpy(d_f32_v1,SZ_f32,h_f32_v1,SZ_f32,ACL_MEMCPY_HOST_TO_DEVICE)); + ACL_CHECK(aclrtMemcpy(d_f32_v2,SZ_f32,h_f32_v2,SZ_f32,ACL_MEMCPY_HOST_TO_DEVICE)); + ACL_CHECK(aclrtMemcpy(d_f32_v3,SZ_f32,h_f32_v3,SZ_f32,ACL_MEMCPY_HOST_TO_DEVICE)); + ACL_CHECK(aclrtMemcpy(d_i32_unsigned_v1,SZ_i32_unsigned,h_i32_unsigned_v1,SZ_i32_unsigned,ACL_MEMCPY_HOST_TO_DEVICE)); + ACL_CHECK(aclrtMemcpy(d_i32_unsigned_v2,SZ_i32_unsigned,h_i32_unsigned_v2,SZ_i32_unsigned,ACL_MEMCPY_HOST_TO_DEVICE)); + ACL_CHECK(aclrtMemcpy(d_i32_unsigned_v3,SZ_i32_unsigned,h_i32_unsigned_v3,SZ_i32_unsigned,ACL_MEMCPY_HOST_TO_DEVICE)); + ACL_CHECK(aclrtMemcpy(d_shift_boundary_v1,SZ_shift_boundary,h_shift_boundary_v1,SZ_shift_boundary,ACL_MEMCPY_HOST_TO_DEVICE)); + ACL_CHECK(aclrtMemcpy(d_shift_boundary_v2,SZ_shift_boundary,h_shift_boundary_v2,SZ_shift_boundary,ACL_MEMCPY_HOST_TO_DEVICE)); + ACL_CHECK(aclrtMemcpy(d_shift_boundary_v3,SZ_shift_boundary,h_shift_boundary_v3,SZ_shift_boundary,ACL_MEMCPY_HOST_TO_DEVICE)); + LaunchVshlDeepMerged( + d_f32_v1, + d_f32_v2, + d_f32_v3, + d_i32_unsigned_v1, + d_i32_unsigned_v2, + d_i32_unsigned_v3, + d_shift_boundary_v1, + d_shift_boundary_v2, + d_shift_boundary_v3, + stream + ); ACL_CHECK(aclrtSynchronizeStream(stream)); - ACL_CHECK(aclrtMemcpy(v3Host, fileSize_v3, v3Device, fileSize_v3, - ACL_MEMCPY_DEVICE_TO_HOST)); - WriteFile("./v3.bin", v3Host, fileSize_v3); + ACL_CHECK(aclrtMemcpy(h_f32_v3,SZ_f32,d_f32_v3,SZ_f32,ACL_MEMCPY_DEVICE_TO_HOST)); + 
ACL_CHECK(aclrtMemcpy(h_i32_unsigned_v3,SZ_i32_unsigned,d_i32_unsigned_v3,SZ_i32_unsigned,ACL_MEMCPY_DEVICE_TO_HOST)); + ACL_CHECK(aclrtMemcpy(h_shift_boundary_v3,SZ_shift_boundary,d_shift_boundary_v3,SZ_shift_boundary,ACL_MEMCPY_DEVICE_TO_HOST)); + FCK(WriteFile("v3.bin",h_f32_v3,SZ_f32),"v3.bin"); + FCK(WriteFile("v3_i32_unsigned.bin",h_i32_unsigned_v3,SZ_i32_unsigned),"v3_i32_unsigned.bin"); + FCK(WriteFile("v3_shift_boundary.bin",h_shift_boundary_v3,SZ_shift_boundary),"v3_shift_boundary.bin"); cleanup: - aclrtFree(v1Device); - aclrtFree(v2Device); - aclrtFree(v3Device); - aclrtFreeHost(v1Host); - aclrtFreeHost(v2Host); - aclrtFreeHost(v3Host); - if (stream != nullptr) - aclrtDestroyStream(stream); - if (deviceSet) - aclrtResetDevice(deviceId); - if (aclInited) - aclFinalize(); + aclrtFree(d_f32_v1); + aclrtFree(d_f32_v2); + aclrtFree(d_f32_v3); + aclrtFree(d_i32_unsigned_v1); + aclrtFree(d_i32_unsigned_v2); + aclrtFree(d_i32_unsigned_v3); + aclrtFree(d_shift_boundary_v1); + aclrtFree(d_shift_boundary_v2); + aclrtFree(d_shift_boundary_v3); + aclrtFreeHost(h_f32_v1); + aclrtFreeHost(h_f32_v2); + aclrtFreeHost(h_f32_v3); + aclrtFreeHost(h_i32_unsigned_v1); + aclrtFreeHost(h_i32_unsigned_v2); + aclrtFreeHost(h_i32_unsigned_v3); + aclrtFreeHost(h_shift_boundary_v1); + aclrtFreeHost(h_shift_boundary_v2); + aclrtFreeHost(h_shift_boundary_v3); + if(stream) aclrtDestroyStream(stream); + if(deviceSet) aclrtResetDevice(deviceId); + if(aclInited) aclFinalize(); return rc; } diff --git a/test/vpto/cases/micro-op/binary-vector/vshr-i16-signed/compare.py b/test/vpto/cases/micro-op/binary-vector/vshr-i16-signed/compare.py deleted file mode 100755 index a8ee34d1b..000000000 --- a/test/vpto/cases/micro-op/binary-vector/vshr-i16-signed/compare.py +++ /dev/null @@ -1,44 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) 2026 Huawei Technologies Co., Ltd. 
-# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. - -# case: micro-op/binary-vector/vshr-i16-signed -# family: binary-vector -# target_ops: pto.vshr -# scenarios: core-i16-signed, full-mask -# NOTE: bulk-generated coverage skeleton. - -import os -import sys -import numpy as np - - -def compare_bin_prefix(golden_path, output_path, dtype, eps, count): - if not os.path.exists(golden_path) or not os.path.exists(output_path): - return False - golden = np.fromfile(golden_path, dtype=dtype, count=count) - output = np.fromfile(output_path, dtype=dtype, count=count) - return golden.shape == output.shape and np.allclose( - golden, output, atol=eps, rtol=eps, equal_nan=True - ) - - -def main(): - strict = os.getenv("COMPARE_STRICT", "1") != "0" - ok = compare_bin_prefix("golden_v3.bin", "v3.bin", np.int16, 0, 1024) - if not ok: - if strict: - print("[ERROR] compare failed") - sys.exit(2) - print("[WARN] compare failed (non-gating)") - return - print("[INFO] compare passed") - - -if __name__ == "__main__": - main() diff --git a/test/vpto/cases/micro-op/binary-vector/vshr-i16-signed/golden.py b/test/vpto/cases/micro-op/binary-vector/vshr-i16-signed/golden.py deleted file mode 100755 index 4262e419f..000000000 --- a/test/vpto/cases/micro-op/binary-vector/vshr-i16-signed/golden.py +++ /dev/null @@ -1,50 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) 2026 Huawei Technologies Co., Ltd. 
-# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. - -# case: micro-op/binary-vector/vshr-i16-signed -# family: binary-vector -# target_ops: pto.vshr -# scenarios: core-i16-signed, full-mask -# NOTE: bulk-generated coverage skeleton. -# coding=utf-8 - -import argparse -from pathlib import Path - -import numpy as np - - -ELEMS = 1024 -SEED = 19 - - -def generate(output_dir: Path, seed: int) -> None: - rng = np.random.default_rng(seed) - v1 = rng.integers(-0x8000, 0x8000, size=ELEMS, dtype=np.int16) - v2 = rng.integers(0, 16, size=ELEMS, dtype=np.int16) - v3 = np.zeros(ELEMS, dtype=np.int16) - golden_v3 = np.right_shift(v1, v2 & np.int16(15)).astype(np.int16, copy=False) - - output_dir.mkdir(parents=True, exist_ok=True) - v1.tofile(output_dir / "v1.bin") - v2.tofile(output_dir / "v2.bin") - v3.tofile(output_dir / "v3.bin") - golden_v3.tofile(output_dir / "golden_v3.bin") - - -def main() -> None: - parser = argparse.ArgumentParser() - parser.add_argument("--output-dir", type=Path, default=Path(".")) - parser.add_argument("--seed", type=int, default=SEED) - args = parser.parse_args() - generate(args.output_dir, args.seed) - - -if __name__ == "__main__": - main() diff --git a/test/vpto/cases/micro-op/binary-vector/vshr-i16-signed/kernel.pto b/test/vpto/cases/micro-op/binary-vector/vshr-i16-signed/kernel.pto deleted file mode 100644 index d4e53b05f..000000000 --- a/test/vpto/cases/micro-op/binary-vector/vshr-i16-signed/kernel.pto +++ /dev/null 
@@ -1,55 +0,0 @@ -// ----------------------------------------------------------------------------- -// case: micro-op/binary-vector/vshr-i16-signed -// family: binary-vector -// target_ops: pto.vshr -// scenarios: core-i16-signed, full-mask -// NOTE: bulk-generated coverage skeleton. Parser/verifier/lowering failure is -// still a valid test conclusion in the current coverage-first phase. -// ----------------------------------------------------------------------------- -module attributes {pto.target_arch = "a5", pto.kernel_kind = #pto.kernel_kind} { - func.func @vshr_i16_signed_kernel(%arg0: !pto.ptr, %arg1: !pto.ptr, - %arg2: !pto.ptr) attributes {pto.kernel} { - %c0 = arith.constant 0 : index - %c1024 = arith.constant 1024 : index - %c128 = arith.constant 128 : index - %c0_i64 = arith.constant 0 : i64 - %c1_i64 = arith.constant 1 : i64 - %c32_i64 = arith.constant 32 : i64 - %c64_i64 = arith.constant 64 : i64 - %c2048_i64 = arith.constant 2048 : i64 - %c4096_i64 = arith.constant 4096 : i64 - - %ub_lhs = pto.castptr %c0_i64 : i64 -> !pto.ptr - %ub_rhs = pto.castptr %c2048_i64 : i64 -> !pto.ptr - %ub_out = pto.castptr %c4096_i64 : i64 -> !pto.ptr - - %false = arith.constant false - pto.mte_gm_ub %arg0, %ub_lhs, %c0_i64, %c64_i64 - nburst(%c32_i64, %c64_i64, %c64_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 - pto.mte_gm_ub %arg1, %ub_rhs, %c0_i64, %c64_i64 - nburst(%c32_i64, %c64_i64, %c64_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 - - pto.set_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] - pto.wait_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] - - pto.vecscope { - %mask = pto.pset_b16 "PAT_ALL" : !pto.mask - scf.for %offset = %c0 to %c1024 step %c128 { - %lhs = pto.vlds %ub_lhs[%offset] : !pto.ptr -> !pto.vreg<128xsi16> - %rhs = pto.vlds %ub_rhs[%offset] : !pto.ptr -> !pto.vreg<128xsi16> - %out = pto.vshr %lhs, %rhs, %mask : !pto.vreg<128xsi16>, !pto.vreg<128xsi16>, !pto.mask -> !pto.vreg<128xsi16> - pto.vsts %out, %ub_out[%offset], %mask : 
!pto.vreg<128xsi16>, !pto.ptr, !pto.mask - } - } - - pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.mte_ub_gm %ub_out, %arg2, %c64_i64 - nburst(%c32_i64, %c64_i64, %c64_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64 - pto.barrier #pto.pipe - return - } -} diff --git a/test/vpto/cases/micro-op/binary-vector/vshr-i16-signed/launch.cpp b/test/vpto/cases/micro-op/binary-vector/vshr-i16-signed/launch.cpp deleted file mode 100644 index 6b62ae139..000000000 --- a/test/vpto/cases/micro-op/binary-vector/vshr-i16-signed/launch.cpp +++ /dev/null @@ -1,55 +0,0 @@ -// Copyright (c) 2026 Huawei Technologies Co., Ltd. -// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). -// Please refer to the License for details. You may not use this file except in compliance with the License. -// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -// See LICENSE in the root of the software repository for the full text of the License. - -// ----------------------------------------------------------------------------- -// case: micro-op/binary-vector/vshr-i16-signed -// family: binary-vector -// target_ops: pto.vshr -// scenarios: core-i16-signed, full-mask -// NOTE: bulk-generated coverage skeleton. Parser/verifier/lowering failure is -// still a valid test conclusion in the current coverage-first phase. 
-// ----------------------------------------------------------------------------- -#ifndef __VEC_SCOPE__ -#define __VEC_SCOPE__ -#endif - -#if defined(__CCE_AICORE__) && defined(__NPU_ARCH__) && (__NPU_ARCH__ == 2201) -typedef struct { unsigned char v; } hifloat8_t; -typedef struct { unsigned char v; } float8_e4m3_t; -typedef struct { unsigned char v; } float8_e5m2_t; -typedef struct { unsigned char v; } float8_e8m0_t; -typedef struct { unsigned char v; } float4_e1m2x2_t; -typedef struct { unsigned char v; } float4_e2m1x2_t; -#endif -#include -#if defined(__CCE_AICORE__) && defined(PTOAS_ENABLE_CCE_PRINT) -#include -#endif - -#if !defined(__CCE_AICORE__) && !defined(TMRGSORT_HPP) -struct MrgSortExecutedNumList { - uint16_t mrgSortList0; - uint16_t mrgSortList1; - uint16_t mrgSortList2; - uint16_t mrgSortList3; -}; -#endif -#ifndef __CPU_SIM -#include "acl/acl.h" -#endif - -extern "C" __global__ [aicore] void vshr_i16_signed_kernel(__gm__ int16_t *v1, - __gm__ int16_t *v2, - __gm__ int16_t *v3); - -void LaunchVshr_i16_signed_kernel(int16_t *v1, int16_t *v2, int16_t *v3, - void *stream) { - vshr_i16_signed_kernel<<<1, nullptr, stream>>>((__gm__ int16_t *)v1, - (__gm__ int16_t *)v2, - (__gm__ int16_t *)v3); -} diff --git a/test/vpto/cases/micro-op/binary-vector/vshr-i16-signed/main.cpp b/test/vpto/cases/micro-op/binary-vector/vshr-i16-signed/main.cpp deleted file mode 100644 index d1e3520e4..000000000 --- a/test/vpto/cases/micro-op/binary-vector/vshr-i16-signed/main.cpp +++ /dev/null @@ -1,103 +0,0 @@ -// Copyright (c) 2026 Huawei Technologies Co., Ltd. -// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). -// Please refer to the License for details. You may not use this file except in compliance with the License. 
-// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -// See LICENSE in the root of the software repository for the full text of the License. - -// ----------------------------------------------------------------------------- -// case: micro-op/binary-vector/vshr-i16-signed -// family: binary-vector -// target_ops: pto.vshr -// scenarios: core-i16-signed, full-mask -// NOTE: bulk-generated coverage skeleton. Parser/verifier/lowering failure is -// still a valid test conclusion in the current coverage-first phase. -// ----------------------------------------------------------------------------- -/** -Copyright (c) 2025 Huawei Technologies Co., Ltd. -*/ - -#include "test_common.h" -#include "acl/acl.h" -#include -#include - -using namespace PtoTestCommon; - -#define ACL_CHECK(expr) \ - do { \ - const aclError _ret = (expr); \ - if (_ret != ACL_SUCCESS) { \ - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", #expr, \ - (int)_ret, __FILE__, __LINE__); \ - rc = 1; \ - goto cleanup; \ - } \ - } while (0) - -void LaunchVshr_i16_signed_kernel(int16_t *v1, int16_t *v2, int16_t *v3, - void *stream); - -int main() { - size_t elemCount_v1 = 1024; - size_t fileSize_v1 = elemCount_v1 * sizeof(int16_t); - size_t elemCount_v2 = 1024; - size_t fileSize_v2 = elemCount_v2 * sizeof(int16_t); - size_t elemCount_v3 = 1024; - size_t fileSize_v3 = elemCount_v3 * sizeof(int16_t); - int16_t *v1Host = nullptr; - int16_t *v1Device = nullptr; - int16_t *v2Host = nullptr; - int16_t *v2Device = nullptr; - int16_t *v3Host = nullptr; - int16_t *v3Device = nullptr; - int rc = 0; - bool aclInited = false; - bool deviceSet = false; - int deviceId = 0; - aclrtStream stream = nullptr; - - ACL_CHECK(aclInit(nullptr)); - aclInited = true; - if (const char *envDevice = std::getenv("ACL_DEVICE_ID")) - deviceId = std::atoi(envDevice); - 
ACL_CHECK(aclrtSetDevice(deviceId)); - deviceSet = true; - ACL_CHECK(aclrtCreateStream(&stream)); - ACL_CHECK(aclrtMallocHost((void **)(&v1Host), fileSize_v1)); - ACL_CHECK(aclrtMallocHost((void **)(&v2Host), fileSize_v2)); - ACL_CHECK(aclrtMallocHost((void **)(&v3Host), fileSize_v3)); - ACL_CHECK(aclrtMalloc((void **)&v1Device, fileSize_v1, ACL_MEM_MALLOC_HUGE_FIRST)); - ACL_CHECK(aclrtMalloc((void **)&v2Device, fileSize_v2, ACL_MEM_MALLOC_HUGE_FIRST)); - ACL_CHECK(aclrtMalloc((void **)&v3Device, fileSize_v3, ACL_MEM_MALLOC_HUGE_FIRST)); - ReadFile("./v1.bin", fileSize_v1, v1Host, fileSize_v1); - ReadFile("./v2.bin", fileSize_v2, v2Host, fileSize_v2); - ReadFile("./v3.bin", fileSize_v3, v3Host, fileSize_v3); - ACL_CHECK(aclrtMemcpy(v1Device, fileSize_v1, v1Host, fileSize_v1, - ACL_MEMCPY_HOST_TO_DEVICE)); - ACL_CHECK(aclrtMemcpy(v2Device, fileSize_v2, v2Host, fileSize_v2, - ACL_MEMCPY_HOST_TO_DEVICE)); - ACL_CHECK(aclrtMemcpy(v3Device, fileSize_v3, v3Host, fileSize_v3, - ACL_MEMCPY_HOST_TO_DEVICE)); - LaunchVshr_i16_signed_kernel(v1Device, v2Device, v3Device, stream); - ACL_CHECK(aclrtSynchronizeStream(stream)); - ACL_CHECK(aclrtMemcpy(v3Host, fileSize_v3, v3Device, fileSize_v3, - ACL_MEMCPY_DEVICE_TO_HOST)); - WriteFile("./v3.bin", v3Host, fileSize_v3); - -cleanup: - aclrtFree(v1Device); - aclrtFree(v2Device); - aclrtFree(v3Device); - aclrtFreeHost(v1Host); - aclrtFreeHost(v2Host); - aclrtFreeHost(v3Host); - if (stream != nullptr) - aclrtDestroyStream(stream); - if (deviceSet) - aclrtResetDevice(deviceId); - if (aclInited) - aclFinalize(); - return rc; -} diff --git a/test/vpto/cases/micro-op/binary-vector/vshr-shift-boundary/compare.py b/test/vpto/cases/micro-op/binary-vector/vshr-shift-boundary/compare.py deleted file mode 100755 index f5e74791e..000000000 --- a/test/vpto/cases/micro-op/binary-vector/vshr-shift-boundary/compare.py +++ /dev/null @@ -1,44 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) 2026 Huawei Technologies Co., Ltd. 
-# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. - -# case: micro-op/binary-vector/vshr-shift-boundary -# family: binary-vector -# target_ops: pto.vshr -# scenarios: core-i16-unsigned, full-mask -# NOTE: bulk-generated coverage skeleton. - -import os -import sys -import numpy as np - - -def compare_bin_prefix(golden_path, output_path, dtype, eps, count): - if not os.path.exists(golden_path) or not os.path.exists(output_path): - return False - golden = np.fromfile(golden_path, dtype=dtype, count=count) - output = np.fromfile(output_path, dtype=dtype, count=count) - return golden.shape == output.shape and np.allclose( - golden, output, atol=eps, rtol=eps, equal_nan=True - ) - - -def main(): - strict = os.getenv("COMPARE_STRICT", "1") != "0" - ok = compare_bin_prefix("golden_v3.bin", "v3.bin", np.uint16, 0, 1024) - if not ok: - if strict: - print("[ERROR] compare failed") - sys.exit(2) - print("[WARN] compare failed (non-gating)") - return - print("[INFO] compare passed") - - -if __name__ == "__main__": - main() diff --git a/test/vpto/cases/micro-op/binary-vector/vshr-shift-boundary/golden.py b/test/vpto/cases/micro-op/binary-vector/vshr-shift-boundary/golden.py deleted file mode 100755 index 6c70bedba..000000000 --- a/test/vpto/cases/micro-op/binary-vector/vshr-shift-boundary/golden.py +++ /dev/null @@ -1,51 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) 2026 Huawei Technologies Co., Ltd. 
-# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. - -# case: micro-op/binary-vector/vshr-shift-boundary -# family: binary-vector -# target_ops: pto.vshr -# scenarios: core-i16-unsigned, full-mask -# NOTE: bulk-generated coverage skeleton. -# coding=utf-8 - -import argparse -from pathlib import Path - -import numpy as np - - -ELEMS = 1024 -SEED = 19 - - -def generate(output_dir: Path, seed: int) -> None: - rng = np.random.default_rng(seed) - v1 = rng.integers(0, 1 << 16, size=ELEMS, dtype=np.uint16) - shift_cycle = np.array([0, 1, 14, 15, 15, 14, 1, 0], dtype=np.uint16) - v2 = np.resize(shift_cycle, ELEMS).astype(np.uint16, copy=False) - v3 = np.zeros(ELEMS, dtype=np.uint16) - golden_v3 = np.right_shift(v1, v2 & np.uint16(15)).astype(np.uint16, copy=False) - - output_dir.mkdir(parents=True, exist_ok=True) - v1.tofile(output_dir / "v1.bin") - v2.tofile(output_dir / "v2.bin") - v3.tofile(output_dir / "v3.bin") - golden_v3.tofile(output_dir / "golden_v3.bin") - - -def main() -> None: - parser = argparse.ArgumentParser() - parser.add_argument("--output-dir", type=Path, default=Path(".")) - parser.add_argument("--seed", type=int, default=SEED) - args = parser.parse_args() - generate(args.output_dir, args.seed) - - -if __name__ == "__main__": - main() diff --git a/test/vpto/cases/micro-op/binary-vector/vshr-shift-boundary/kernel.pto b/test/vpto/cases/micro-op/binary-vector/vshr-shift-boundary/kernel.pto deleted file mode 100644 index 
baa1e93c1..000000000 --- a/test/vpto/cases/micro-op/binary-vector/vshr-shift-boundary/kernel.pto +++ /dev/null @@ -1,55 +0,0 @@ -// ----------------------------------------------------------------------------- -// case: micro-op/binary-vector/vshr-shift-boundary -// family: binary-vector -// target_ops: pto.vshr -// scenarios: core-i16-unsigned, full-mask -// NOTE: bulk-generated coverage skeleton. Parser/verifier/lowering failure is -// still a valid test conclusion in the current coverage-first phase. -// ----------------------------------------------------------------------------- -module attributes {pto.target_arch = "a5", pto.kernel_kind = #pto.kernel_kind} { - func.func @vshr_shift_boundary_kernel(%arg0: !pto.ptr, %arg1: !pto.ptr, - %arg2: !pto.ptr) attributes {pto.kernel} { - %c0 = arith.constant 0 : index - %c0_i64 = arith.constant 0 : i64 - %c1_i64 = arith.constant 1 : i64 - %c32_i64 = arith.constant 32 : i64 - %c64_i64 = arith.constant 64 : i64 - %c1024 = arith.constant 1024 : index - %c128 = arith.constant 128 : index - %c2048_i64 = arith.constant 2048 : i64 - %c4096_i64 = arith.constant 4096 : i64 - - %ub_lhs = pto.castptr %c0_i64 : i64 -> !pto.ptr - %ub_rhs = pto.castptr %c2048_i64 : i64 -> !pto.ptr - %ub_out = pto.castptr %c4096_i64 : i64 -> !pto.ptr - - %false = arith.constant false - pto.mte_gm_ub %arg0, %ub_lhs, %c0_i64, %c64_i64 - nburst(%c32_i64, %c64_i64, %c64_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 - pto.mte_gm_ub %arg1, %ub_rhs, %c0_i64, %c64_i64 - nburst(%c32_i64, %c64_i64, %c64_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 - - pto.set_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] - pto.wait_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] - - pto.vecscope { - %mask = pto.pset_b16 "PAT_ALL" : !pto.mask - scf.for %offset = %c0 to %c1024 step %c128 { - %lhs = pto.vlds %ub_lhs[%offset] : !pto.ptr -> !pto.vreg<128xui16> - %rhs = pto.vlds %ub_rhs[%offset] : !pto.ptr -> !pto.vreg<128xui16> - %out = pto.vshr %lhs, %rhs, %mask : 
!pto.vreg<128xui16>, !pto.vreg<128xui16>, !pto.mask -> !pto.vreg<128xui16> - pto.vsts %out, %ub_out[%offset], %mask : !pto.vreg<128xui16>, !pto.ptr, !pto.mask - } - } - - pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.mte_ub_gm %ub_out, %arg2, %c64_i64 - nburst(%c32_i64, %c64_i64, %c64_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64 - pto.barrier #pto.pipe - return - } -} diff --git a/test/vpto/cases/micro-op/binary-vector/vshr-shift-boundary/launch.cpp b/test/vpto/cases/micro-op/binary-vector/vshr-shift-boundary/launch.cpp deleted file mode 100644 index a24d8e08c..000000000 --- a/test/vpto/cases/micro-op/binary-vector/vshr-shift-boundary/launch.cpp +++ /dev/null @@ -1,55 +0,0 @@ -// Copyright (c) 2026 Huawei Technologies Co., Ltd. -// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). -// Please refer to the License for details. You may not use this file except in compliance with the License. -// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -// See LICENSE in the root of the software repository for the full text of the License. - -// ----------------------------------------------------------------------------- -// case: micro-op/binary-vector/vshr-shift-boundary -// family: binary-vector -// target_ops: pto.vshr -// scenarios: core-i16-unsigned, full-mask -// NOTE: bulk-generated coverage skeleton. Parser/verifier/lowering failure is -// still a valid test conclusion in the current coverage-first phase. 
-// ----------------------------------------------------------------------------- -#ifndef __VEC_SCOPE__ -#define __VEC_SCOPE__ -#endif - -#if defined(__CCE_AICORE__) && defined(__NPU_ARCH__) && (__NPU_ARCH__ == 2201) -typedef struct { unsigned char v; } hifloat8_t; -typedef struct { unsigned char v; } float8_e4m3_t; -typedef struct { unsigned char v; } float8_e5m2_t; -typedef struct { unsigned char v; } float8_e8m0_t; -typedef struct { unsigned char v; } float4_e1m2x2_t; -typedef struct { unsigned char v; } float4_e2m1x2_t; -#endif -#include -#if defined(__CCE_AICORE__) && defined(PTOAS_ENABLE_CCE_PRINT) -#include -#endif - -#if !defined(__CCE_AICORE__) && !defined(TMRGSORT_HPP) -struct MrgSortExecutedNumList { - uint16_t mrgSortList0; - uint16_t mrgSortList1; - uint16_t mrgSortList2; - uint16_t mrgSortList3; -}; -#endif -#ifndef __CPU_SIM -#include "acl/acl.h" -#endif - -extern "C" __global__ [aicore] void vshr_shift_boundary_kernel(__gm__ uint16_t *v1, - __gm__ uint16_t *v2, - __gm__ uint16_t *v3); - -void LaunchVshr_shift_boundary_kernel(uint16_t *v1, uint16_t *v2, uint16_t *v3, - void *stream) { - vshr_shift_boundary_kernel<<<1, nullptr, stream>>>((__gm__ uint16_t *)v1, - (__gm__ uint16_t *)v2, - (__gm__ uint16_t *)v3); -} diff --git a/test/vpto/cases/micro-op/binary-vector/vshr-shift-boundary/main.cpp b/test/vpto/cases/micro-op/binary-vector/vshr-shift-boundary/main.cpp deleted file mode 100644 index e8d1e1459..000000000 --- a/test/vpto/cases/micro-op/binary-vector/vshr-shift-boundary/main.cpp +++ /dev/null @@ -1,103 +0,0 @@ -// Copyright (c) 2026 Huawei Technologies Co., Ltd. -// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). -// Please refer to the License for details. You may not use this file except in compliance with the License. 
-// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -// See LICENSE in the root of the software repository for the full text of the License. - -// ----------------------------------------------------------------------------- -// case: micro-op/binary-vector/vshr-shift-boundary -// family: binary-vector -// target_ops: pto.vshr -// scenarios: core-i16-unsigned, full-mask -// NOTE: bulk-generated coverage skeleton. Parser/verifier/lowering failure is -// still a valid test conclusion in the current coverage-first phase. -// ----------------------------------------------------------------------------- -/** -Copyright (c) 2025 Huawei Technologies Co., Ltd. -*/ - -#include "test_common.h" -#include "acl/acl.h" -#include -#include - -using namespace PtoTestCommon; - -#define ACL_CHECK(expr) \ - do { \ - const aclError _ret = (expr); \ - if (_ret != ACL_SUCCESS) { \ - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", #expr, \ - (int)_ret, __FILE__, __LINE__); \ - rc = 1; \ - goto cleanup; \ - } \ - } while (0) - -void LaunchVshr_shift_boundary_kernel(uint16_t *v1, uint16_t *v2, uint16_t *v3, - void *stream); - -int main() { - size_t elemCount_v1 = 1024; - size_t fileSize_v1 = elemCount_v1 * sizeof(uint16_t); - size_t elemCount_v2 = 1024; - size_t fileSize_v2 = elemCount_v2 * sizeof(uint16_t); - size_t elemCount_v3 = 1024; - size_t fileSize_v3 = elemCount_v3 * sizeof(uint16_t); - uint16_t *v1Host = nullptr; - uint16_t *v1Device = nullptr; - uint16_t *v2Host = nullptr; - uint16_t *v2Device = nullptr; - uint16_t *v3Host = nullptr; - uint16_t *v3Device = nullptr; - int rc = 0; - bool aclInited = false; - bool deviceSet = false; - int deviceId = 0; - aclrtStream stream = nullptr; - - ACL_CHECK(aclInit(nullptr)); - aclInited = true; - if (const char *envDevice = std::getenv("ACL_DEVICE_ID")) - deviceId = 
std::atoi(envDevice); - ACL_CHECK(aclrtSetDevice(deviceId)); - deviceSet = true; - ACL_CHECK(aclrtCreateStream(&stream)); - ACL_CHECK(aclrtMallocHost((void **)(&v1Host), fileSize_v1)); - ACL_CHECK(aclrtMallocHost((void **)(&v2Host), fileSize_v2)); - ACL_CHECK(aclrtMallocHost((void **)(&v3Host), fileSize_v3)); - ACL_CHECK(aclrtMalloc((void **)&v1Device, fileSize_v1, ACL_MEM_MALLOC_HUGE_FIRST)); - ACL_CHECK(aclrtMalloc((void **)&v2Device, fileSize_v2, ACL_MEM_MALLOC_HUGE_FIRST)); - ACL_CHECK(aclrtMalloc((void **)&v3Device, fileSize_v3, ACL_MEM_MALLOC_HUGE_FIRST)); - ReadFile("./v1.bin", fileSize_v1, v1Host, fileSize_v1); - ReadFile("./v2.bin", fileSize_v2, v2Host, fileSize_v2); - ReadFile("./v3.bin", fileSize_v3, v3Host, fileSize_v3); - ACL_CHECK(aclrtMemcpy(v1Device, fileSize_v1, v1Host, fileSize_v1, - ACL_MEMCPY_HOST_TO_DEVICE)); - ACL_CHECK(aclrtMemcpy(v2Device, fileSize_v2, v2Host, fileSize_v2, - ACL_MEMCPY_HOST_TO_DEVICE)); - ACL_CHECK(aclrtMemcpy(v3Device, fileSize_v3, v3Host, fileSize_v3, - ACL_MEMCPY_HOST_TO_DEVICE)); - LaunchVshr_shift_boundary_kernel(v1Device, v2Device, v3Device, stream); - ACL_CHECK(aclrtSynchronizeStream(stream)); - ACL_CHECK(aclrtMemcpy(v3Host, fileSize_v3, v3Device, fileSize_v3, - ACL_MEMCPY_DEVICE_TO_HOST)); - WriteFile("./v3.bin", v3Host, fileSize_v3); - -cleanup: - aclrtFree(v1Device); - aclrtFree(v2Device); - aclrtFree(v3Device); - aclrtFreeHost(v1Host); - aclrtFreeHost(v2Host); - aclrtFreeHost(v3Host); - if (stream != nullptr) - aclrtDestroyStream(stream); - if (deviceSet) - aclrtResetDevice(deviceId); - if (aclInited) - aclFinalize(); - return rc; -} diff --git a/test/vpto/cases/micro-op/binary-vector/vshr/compare.py b/test/vpto/cases/micro-op/binary-vector/vshr/compare.py index a2429ec9b..8c674e429 100755 --- a/test/vpto/cases/micro-op/binary-vector/vshr/compare.py +++ b/test/vpto/cases/micro-op/binary-vector/vshr/compare.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python3 +#!/usr/bin/python3 # Copyright (c) 2026 Huawei Technologies Co., 
Ltd. # This program is free software, you can redistribute it and/or modify it under the terms and conditions of # CANN Open Software License Agreement Version 2.0 (the "License"). @@ -7,38 +7,47 @@ # INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. # See LICENSE in the root of the software repository for the full text of the License. -# case: micro-op/binary-vector/vshr -# family: binary-vector -# target_ops: pto.vshr -# scenarios: core-i16-unsigned, full-mask -# NOTE: bulk-generated coverage skeleton. +# Merged vshr test case. -import os -import sys +import os,sys import numpy as np +def _cmp(golden,output,dtype,eps,count=-1): + if not os.path.exists(golden) or not os.path.exists(output): return False + kw={} if count<0 else {"count":count} + g=np.fromfile(golden,dtype=dtype,**kw) + o=np.fromfile(output,dtype=dtype,**kw) + return g.shape==o.shape and np.allclose(g,o,atol=eps,rtol=eps,equal_nan=True) -def compare_bin_prefix(golden_path, output_path, dtype, eps, count): - if not os.path.exists(golden_path) or not os.path.exists(output_path): - return False - golden = np.fromfile(golden_path, dtype=dtype, count=count) - output = np.fromfile(output_path, dtype=dtype, count=count) - return golden.shape == output.shape and np.allclose( - golden, output, atol=eps, rtol=eps, equal_nan=True - ) - +def _cmpeq(golden,output,dtype): + if not os.path.exists(golden) or not os.path.exists(output): return False + g=np.fromfile(golden,dtype=dtype) + o=np.fromfile(output,dtype=dtype) + return g.shape==o.shape and np.array_equal(g,o) def main(): - strict = os.getenv("COMPARE_STRICT", "1") != "0" - ok = compare_bin_prefix("golden_v3.bin", "v3.bin", np.uint16, 0, 1024) - if not ok: - if strict: - print("[ERROR] compare failed") - sys.exit(2) - print("[WARN] compare failed (non-gating)") + strict=os.getenv('COMPARE_STRICT','1')!='0' + failed=[] + if not (_cmp("golden_v3.bin","v3.bin",np.uint16,0,1024)): + failed.append('f32') + 
print('[ERROR] compare failed: f32') + else: + print('[INFO] f32: passed') + if not (_cmp("golden_v3_i16_signed.bin","v3_i16_signed.bin",np.int16,0,1024)): + failed.append('i16_signed') + print('[ERROR] compare failed: i16_signed') + else: + print('[INFO] i16_signed: passed') + if not (_cmpeq("golden_v3_shift_boundary.bin","v3_shift_boundary.bin",np.uint16)): + failed.append('shift_boundary') + print('[ERROR] compare failed: shift_boundary') + else: + print('[INFO] shift_boundary: passed') + if failed: + if strict: print(f"[ERROR] {len(failed)} variant(s) failed"); sys.exit(2) + print(f"[WARN] {len(failed)} variant(s) failed (non-gating)") return - print("[INFO] compare passed") - + print("[INFO] compare passed (all 3 variants)") -if __name__ == "__main__": +if __name__=="__main__": main() diff --git a/test/vpto/cases/micro-op/binary-vector/vshr/golden.py b/test/vpto/cases/micro-op/binary-vector/vshr/golden.py index bd0cda8b9..c2909ebad 100755 --- a/test/vpto/cases/micro-op/binary-vector/vshr/golden.py +++ b/test/vpto/cases/micro-op/binary-vector/vshr/golden.py @@ -7,44 +7,78 @@ # INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. # See LICENSE in the root of the software repository for the full text of the License. -# case: micro-op/binary-vector/vshr -# family: binary-vector -# target_ops: pto.vshr -# scenarios: core-i16-unsigned, full-mask -# NOTE: bulk-generated coverage skeleton. -# coding=utf-8 +# Merged vshr test case. 
import argparse from pathlib import Path - import numpy as np - -ELEMS = 1024 +ROWS = 32 +COLS = 32 SEED = 19 +LOGICAL_ELEMS = 1000 +OUT_SENTINEL = np.float32(-123.25) + +def f32_to_bf16_bits(v): + w=v.astype(np.float32,copy=False).view(np.uint32) + r=np.uint32(0x7FFF)+((w>>16)&np.uint32(1)) + return ((w+r)>>16).astype(np.uint16) +def bf16_bits_to_f32(b): + return (b.astype(np.uint32)<<16).view(np.float32) -def generate(output_dir: Path, seed: int) -> None: - rng = np.random.default_rng(seed) - v1 = rng.integers(0, 0x10000, size=ELEMS, dtype=np.uint16) - v2 = rng.integers(0, 16, size=ELEMS, dtype=np.uint16) - v3 = np.zeros(ELEMS, dtype=np.uint16) - golden_v3 = np.right_shift(v1, v2 & np.uint16(15)).astype(np.uint16, copy=False) +# ---- f32 ---- +def gen_f32(out, rng): + v1=rng.integers(0,0x10000,size=(ROWS,COLS),dtype=np.uint16) + v2=rng.integers(0,16,size=(ROWS,COLS),dtype=np.uint16) + g=np.right_shift(v1.astype(np.uint16),v2.astype(np.uint16)) + v3=np.zeros((ROWS,COLS),dtype=np.uint16) + v1.reshape(-1).tofile(out/"v1.bin") + v2.reshape(-1).tofile(out/"v2.bin") + v3.reshape(-1).tofile(out/"v3.bin") + g.reshape(-1).tofile(out/"golden_v3.bin") - output_dir.mkdir(parents=True, exist_ok=True) - v1.tofile(output_dir / "v1.bin") - v2.tofile(output_dir / "v2.bin") - v3.tofile(output_dir / "v3.bin") - golden_v3.tofile(output_dir / "golden_v3.bin") +# ---- i16_signed ---- +def gen_i16_signed(out, rng): + v1=rng.integers(-1000,1001,size=(ROWS,COLS),dtype=np.int16) + v2=rng.integers(0,16,size=(ROWS,COLS),dtype=np.int16) + g=np.right_shift(v1.astype(np.int32),v2.astype(np.int32)).astype(np.int16) + v3=np.zeros((ROWS,COLS),dtype=np.int16) + v1.reshape(-1).tofile(out/"v1_i16_signed.bin") + v2.reshape(-1).tofile(out/"v2_i16_signed.bin") + v3.reshape(-1).tofile(out/"v3_i16_signed.bin") + g.reshape(-1).tofile(out/"golden_v3_i16_signed.bin") +# ---- shift_boundary ---- +def gen_shift_boundary(out, rng): + elems=ROWS*COLS + 
lhs_pat=np.array([0,1,15,16,255,256,4095,4096,32767,32768,65535,0],dtype=np.uint16) + rhs_pat=np.array([0,1,1,1,2,2,3,3,4,5,6,15],dtype=np.uint16) + reps=elems//lhs_pat.size + v1=np.resize(lhs_pat,elems) + v2=np.resize(rhs_pat,elems) + g=np.right_shift(v1.astype(np.uint16),v2.astype(np.uint16)) + v3=np.zeros(elems,dtype=np.uint16) + v1.tofile(out/"v1_shift_boundary.bin") + v2.tofile(out/"v2_shift_boundary.bin") + v3.tofile(out/"v3_shift_boundary.bin") + g.tofile(out/"golden_v3_shift_boundary.bin") -def main() -> None: - parser = argparse.ArgumentParser() - parser.add_argument("--output-dir", type=Path, default=Path(".")) - parser.add_argument("--seed", type=int, default=SEED) - args = parser.parse_args() - generate(args.output_dir, args.seed) +GENERATORS = [ + gen_f32, + gen_i16_signed, + gen_shift_boundary, +] +def main(): + p=argparse.ArgumentParser() + p.add_argument("--output-dir",type=Path,default=Path(".")) + p.add_argument("--seed",type=int,default=SEED) + a=p.parse_args() + rng=np.random.default_rng(a.seed) + out=a.output_dir; out.mkdir(parents=True,exist_ok=True) + for gen in GENERATORS: + gen(out,rng) -if __name__ == "__main__": +if __name__=="__main__": main() diff --git a/test/vpto/cases/micro-op/binary-vector/vshr/kernel.pto b/test/vpto/cases/micro-op/binary-vector/vshr/kernel.pto index cd7db0019..577378f09 100644 --- a/test/vpto/cases/micro-op/binary-vector/vshr/kernel.pto +++ b/test/vpto/cases/micro-op/binary-vector/vshr/kernel.pto @@ -1,55 +1,137 @@ -// ----------------------------------------------------------------------------- -// case: micro-op/binary-vector/vshr -// family: binary-vector -// target_ops: pto.vshr -// scenarios: core-i16-unsigned, full-mask -// NOTE: bulk-generated coverage skeleton. Parser/verifier/lowering failure is -// still a valid test conclusion in the current coverage-first phase. 
-// ----------------------------------------------------------------------------- module attributes {pto.target_arch = "a5", pto.kernel_kind = #pto.kernel_kind} { - func.func @vshr_i16_unsigned_kernel(%arg0: !pto.ptr, %arg1: !pto.ptr, - %arg2: !pto.ptr) attributes {pto.kernel} { - %c0 = arith.constant 0 : index - %c1024 = arith.constant 1024 : index - %c128 = arith.constant 128 : index - %c0_i64 = arith.constant 0 : i64 - %c1_i64 = arith.constant 1 : i64 - %c32_i64 = arith.constant 32 : i64 - %c64_i64 = arith.constant 64 : i64 - %c2048_i64 = arith.constant 2048 : i64 - %c4096_i64 = arith.constant 4096 : i64 - - %ub_lhs = pto.castptr %c0_i64 : i64 -> !pto.ptr - %ub_rhs = pto.castptr %c2048_i64 : i64 -> !pto.ptr - %ub_out = pto.castptr %c4096_i64 : i64 -> !pto.ptr - - %false = arith.constant false - pto.mte_gm_ub %arg0, %ub_lhs, %c0_i64, %c64_i64 - nburst(%c32_i64, %c64_i64, %c64_i64) + func.func @vshr_deep_merged_kernel(%arg0: !pto.ptr, %arg1: !pto.ptr, %arg2: !pto.ptr, %arg3: !pto.ptr, %arg4: !pto.ptr, %arg5: !pto.ptr, %arg6: !pto.ptr, %arg7: !pto.ptr, %arg8: !pto.ptr) attributes {pto.kernel} { + // merged from vshr_i16_unsigned_kernel via LaunchVshr_i16_unsigned_kernel + + %c0_m0 = arith.constant 0 : index + %c1024_m0 = arith.constant 1024 : index + %c128_m0 = arith.constant 128 : index + %c0_i64_m0 = arith.constant 0 : i64 + %c1_i64_m0 = arith.constant 1 : i64 + %c32_i64_m0 = arith.constant 32 : i64 + %c64_i64_m0 = arith.constant 64 : i64 + %c2048_i64_m0 = arith.constant 2048 : i64 + %c4096_i64_m0 = arith.constant 4096 : i64 + + %ub_lhs_m0 = pto.castptr %c0_i64_m0 : i64 -> !pto.ptr + %ub_rhs_m0 = pto.castptr %c2048_i64_m0 : i64 -> !pto.ptr + %ub_out_m0 = pto.castptr %c4096_i64_m0 : i64 -> !pto.ptr + + %false_m0 = arith.constant false + pto.mte_gm_ub %arg0, %ub_lhs_m0, %c0_i64_m0, %c64_i64_m0 + nburst(%c32_i64_m0, %c64_i64_m0, %c64_i64_m0) : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 - pto.mte_gm_ub %arg1, %ub_rhs, %c0_i64, %c64_i64 - nburst(%c32_i64, %c64_i64, 
%c64_i64) + pto.mte_gm_ub %arg1, %ub_rhs_m0, %c0_i64_m0, %c64_i64_m0 + nburst(%c32_i64_m0, %c64_i64_m0, %c64_i64_m0) : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 pto.set_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] pto.wait_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] pto.vecscope { - %mask = pto.pset_b16 "PAT_ALL" : !pto.mask - scf.for %offset = %c0 to %c1024 step %c128 { - %lhs = pto.vlds %ub_lhs[%offset] : !pto.ptr -> !pto.vreg<128xui16> - %rhs = pto.vlds %ub_rhs[%offset] : !pto.ptr -> !pto.vreg<128xui16> - %out = pto.vshr %lhs, %rhs, %mask : !pto.vreg<128xui16>, !pto.vreg<128xui16>, !pto.mask -> !pto.vreg<128xui16> - pto.vsts %out, %ub_out[%offset], %mask : !pto.vreg<128xui16>, !pto.ptr, !pto.mask + %mask_m0 = pto.pset_b16 "PAT_ALL" : !pto.mask + scf.for %offset_m0 = %c0_m0 to %c1024_m0 step %c128_m0 { + %lhs_m0 = pto.vlds %ub_lhs_m0[%offset_m0] : !pto.ptr -> !pto.vreg<128xui16> + %rhs_m0 = pto.vlds %ub_rhs_m0[%offset_m0] : !pto.ptr -> !pto.vreg<128xui16> + %out_m0 = pto.vshr %lhs_m0, %rhs_m0, %mask_m0 : !pto.vreg<128xui16>, !pto.vreg<128xui16>, !pto.mask -> !pto.vreg<128xui16> + pto.vsts %out_m0, %ub_out_m0[%offset_m0], %mask_m0 : !pto.vreg<128xui16>, !pto.ptr, !pto.mask } } pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.mte_ub_gm %ub_out, %arg2, %c64_i64 - nburst(%c32_i64, %c64_i64, %c64_i64) + pto.mte_ub_gm %ub_out_m0, %arg2, %c64_i64_m0 + nburst(%c32_i64_m0, %c64_i64_m0, %c64_i64_m0) : !pto.ptr, !pto.ptr, i64, i64, i64, i64 pto.barrier #pto.pipe + + // merged from vshr_i16_signed_kernel via LaunchVshr_i16_signed_kernel + + %c0_m1 = arith.constant 0 : index + %c1024_m1 = arith.constant 1024 : index + %c128_m1 = arith.constant 128 : index + %c0_i64_m1 = arith.constant 0 : i64 + %c1_i64_m1 = arith.constant 1 : i64 + %c32_i64_m1 = arith.constant 32 : i64 + %c64_i64_m1 = arith.constant 64 : i64 + %c2048_i64_m1 = arith.constant 2048 : i64 + %c4096_i64_m1 = arith.constant 4096 : i64 + + %ub_lhs_m1 = pto.castptr 
%c0_i64_m1 : i64 -> !pto.ptr + %ub_rhs_m1 = pto.castptr %c2048_i64_m1 : i64 -> !pto.ptr + %ub_out_m1 = pto.castptr %c4096_i64_m1 : i64 -> !pto.ptr + + %false_m1 = arith.constant false + pto.mte_gm_ub %arg3, %ub_lhs_m1, %c0_i64_m1, %c64_i64_m1 + nburst(%c32_i64_m1, %c64_i64_m1, %c64_i64_m1) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 + pto.mte_gm_ub %arg4, %ub_rhs_m1, %c0_i64_m1, %c64_i64_m1 + nburst(%c32_i64_m1, %c64_i64_m1, %c64_i64_m1) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 + + pto.set_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] + pto.wait_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] + + pto.vecscope { + %mask_m1 = pto.pset_b16 "PAT_ALL" : !pto.mask + scf.for %offset_m1 = %c0_m1 to %c1024_m1 step %c128_m1 { + %lhs_m1 = pto.vlds %ub_lhs_m1[%offset_m1] : !pto.ptr -> !pto.vreg<128xsi16> + %rhs_m1 = pto.vlds %ub_rhs_m1[%offset_m1] : !pto.ptr -> !pto.vreg<128xsi16> + %out_m1 = pto.vshr %lhs_m1, %rhs_m1, %mask_m1 : !pto.vreg<128xsi16>, !pto.vreg<128xsi16>, !pto.mask -> !pto.vreg<128xsi16> + pto.vsts %out_m1, %ub_out_m1[%offset_m1], %mask_m1 : !pto.vreg<128xsi16>, !pto.ptr, !pto.mask + } + } + + pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] + pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] + pto.mte_ub_gm %ub_out_m1, %arg5, %c64_i64_m1 + nburst(%c32_i64_m1, %c64_i64_m1, %c64_i64_m1) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64 + pto.barrier #pto.pipe + + // merged from vshr_shift_boundary_kernel via LaunchVshr_shift_boundary_kernel + + %c0_m2 = arith.constant 0 : index + %c0_i64_m2 = arith.constant 0 : i64 + %c1_i64_m2 = arith.constant 1 : i64 + %c32_i64_m2 = arith.constant 32 : i64 + %c64_i64_m2 = arith.constant 64 : i64 + %c1024_m2 = arith.constant 1024 : index + %c128_m2 = arith.constant 128 : index + %c2048_i64_m2 = arith.constant 2048 : i64 + %c4096_i64_m2 = arith.constant 4096 : i64 + + %ub_lhs_m2 = pto.castptr %c0_i64_m2 : i64 -> !pto.ptr + %ub_rhs_m2 = pto.castptr %c2048_i64_m2 : i64 -> !pto.ptr + %ub_out_m2 = pto.castptr %c4096_i64_m2 : i64 -> !pto.ptr 
+ + %false_m2 = arith.constant false + pto.mte_gm_ub %arg6, %ub_lhs_m2, %c0_i64_m2, %c64_i64_m2 + nburst(%c32_i64_m2, %c64_i64_m2, %c64_i64_m2) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 + pto.mte_gm_ub %arg7, %ub_rhs_m2, %c0_i64_m2, %c64_i64_m2 + nburst(%c32_i64_m2, %c64_i64_m2, %c64_i64_m2) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 + + pto.set_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] + pto.wait_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] + + pto.vecscope { + %mask_m2 = pto.pset_b16 "PAT_ALL" : !pto.mask + scf.for %offset_m2 = %c0_m2 to %c1024_m2 step %c128_m2 { + %lhs_m2 = pto.vlds %ub_lhs_m2[%offset_m2] : !pto.ptr -> !pto.vreg<128xui16> + %rhs_m2 = pto.vlds %ub_rhs_m2[%offset_m2] : !pto.ptr -> !pto.vreg<128xui16> + %out_m2 = pto.vshr %lhs_m2, %rhs_m2, %mask_m2 : !pto.vreg<128xui16>, !pto.vreg<128xui16>, !pto.mask -> !pto.vreg<128xui16> + pto.vsts %out_m2, %ub_out_m2[%offset_m2], %mask_m2 : !pto.vreg<128xui16>, !pto.ptr, !pto.mask + } + } + + pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] + pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] + pto.mte_ub_gm %ub_out_m2, %arg8, %c64_i64_m2 + nburst(%c32_i64_m2, %c64_i64_m2, %c64_i64_m2) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64 + pto.barrier #pto.pipe + return } } diff --git a/test/vpto/cases/micro-op/binary-vector/vshr/launch.cpp b/test/vpto/cases/micro-op/binary-vector/vshr/launch.cpp index 08208c24c..26b631c80 100644 --- a/test/vpto/cases/micro-op/binary-vector/vshr/launch.cpp +++ b/test/vpto/cases/micro-op/binary-vector/vshr/launch.cpp @@ -5,19 +5,10 @@ // THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, // INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. // See LICENSE in the root of the software repository for the full text of the License. 
- -// ----------------------------------------------------------------------------- -// case: micro-op/binary-vector/vshr -// family: binary-vector -// target_ops: pto.vshr -// scenarios: core-i16-unsigned, full-mask -// NOTE: bulk-generated coverage skeleton. Parser/verifier/lowering failure is -// still a valid test conclusion in the current coverage-first phase. -// ----------------------------------------------------------------------------- +// Merged launch wrappers #ifndef __VEC_SCOPE__ #define __VEC_SCOPE__ #endif - #if defined(__CCE_AICORE__) && defined(__NPU_ARCH__) && (__NPU_ARCH__ == 2201) typedef struct { unsigned char v; } hifloat8_t; typedef struct { unsigned char v; } float8_e4m3_t; @@ -30,14 +21,8 @@ typedef struct { unsigned char v; } float4_e2m1x2_t; #if defined(__CCE_AICORE__) && defined(PTOAS_ENABLE_CCE_PRINT) #include #endif - #if !defined(__CCE_AICORE__) && !defined(TMRGSORT_HPP) -struct MrgSortExecutedNumList { - uint16_t mrgSortList0; - uint16_t mrgSortList1; - uint16_t mrgSortList2; - uint16_t mrgSortList3; -}; +struct MrgSortExecutedNumList { uint16_t mrgSortList0,mrgSortList1,mrgSortList2,mrgSortList3; }; #endif #ifndef __CPU_SIM #include "acl/acl.h" @@ -47,9 +32,26 @@ extern "C" __global__ [aicore] void vshr_i16_unsigned_kernel(__gm__ uint16_t *v1 __gm__ uint16_t *v2, __gm__ uint16_t *v3); -void LaunchVshr_i16_unsigned_kernel(uint16_t *v1, uint16_t *v2, uint16_t *v3, - void *stream) { - vshr_i16_unsigned_kernel<<<1, nullptr, stream>>>((__gm__ uint16_t *)v1, - (__gm__ uint16_t *)v2, - (__gm__ uint16_t *)v3); +extern "C" __global__ [aicore] void vshr_deep_merged_kernel( + __gm__ uint16_t * arg0, + __gm__ uint16_t * arg1, + __gm__ uint16_t * arg2, + __gm__ int16_t * arg3, + __gm__ int16_t * arg4, + __gm__ int16_t * arg5, + __gm__ uint16_t * arg6, + __gm__ uint16_t * arg7, + __gm__ uint16_t * arg8); + +void LaunchVshrDeepMerged(uint16_t * p0, uint16_t * p1, uint16_t * p2, int16_t * p3, int16_t * p4, int16_t * p5, uint16_t * p6, uint16_t * 
p7, uint16_t * p8, void *stream) { + vshr_deep_merged_kernel<<<1, nullptr, stream>>>( + (__gm__ uint16_t *)p0, + (__gm__ uint16_t *)p1, + (__gm__ uint16_t *)p2, + (__gm__ int16_t *)p3, + (__gm__ int16_t *)p4, + (__gm__ int16_t *)p5, + (__gm__ uint16_t *)p6, + (__gm__ uint16_t *)p7, + (__gm__ uint16_t *)p8); } diff --git a/test/vpto/cases/micro-op/binary-vector/vshr/main.cpp b/test/vpto/cases/micro-op/binary-vector/vshr/main.cpp index fcdf7cbc0..78e411367 100644 --- a/test/vpto/cases/micro-op/binary-vector/vshr/main.cpp +++ b/test/vpto/cases/micro-op/binary-vector/vshr/main.cpp @@ -5,99 +5,117 @@ // THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, // INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. // See LICENSE in the root of the software repository for the full text of the License. - -// ----------------------------------------------------------------------------- -// case: micro-op/binary-vector/vshr -// family: binary-vector -// target_ops: pto.vshr -// scenarios: core-i16-unsigned, full-mask -// NOTE: bulk-generated coverage skeleton. Parser/verifier/lowering failure is -// still a valid test conclusion in the current coverage-first phase. -// ----------------------------------------------------------------------------- -/** -Copyright (c) 2025 Huawei Technologies Co., Ltd. -*/ +// Merged vshr test case. 
#include "test_common.h" #include "acl/acl.h" #include #include - +#include using namespace PtoTestCommon; -#define ACL_CHECK(expr) \ - do { \ - const aclError _ret = (expr); \ - if (_ret != ACL_SUCCESS) { \ - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", #expr, \ - (int)_ret, __FILE__, __LINE__); \ - rc = 1; \ - goto cleanup; \ - } \ - } while (0) +#define ACL_CHECK(expr) do { const aclError _r=(expr); if(_r!=ACL_SUCCESS){std::fprintf(stderr,"[ERROR] %s:%d acle=%d\n",#expr,__LINE__,(int)_r);rc=1;goto cleanup;} }while(0) +#define FCK(expr,path) do{if(!(expr)){std::fprintf(stderr,"[ERROR] file:%s\n",path);rc=1;goto cleanup;}}while(0) + -void LaunchVshr_i16_unsigned_kernel(uint16_t *v1, uint16_t *v2, uint16_t *v3, - void *stream); +void LaunchVshrDeepMerged(uint16_t * p0, uint16_t * p1, uint16_t * p2, int16_t * p3, int16_t * p4, int16_t * p5, uint16_t * p6, uint16_t * p7, uint16_t * p8, void *stream); int main() { - size_t elemCount_v1 = 1024; - size_t fileSize_v1 = elemCount_v1 * sizeof(uint16_t); - size_t elemCount_v2 = 1024; - size_t fileSize_v2 = elemCount_v2 * sizeof(uint16_t); - size_t elemCount_v3 = 1024; - size_t fileSize_v3 = elemCount_v3 * sizeof(uint16_t); - uint16_t *v1Host = nullptr; - uint16_t *v1Device = nullptr; - uint16_t *v2Host = nullptr; - uint16_t *v2Device = nullptr; - uint16_t *v3Host = nullptr; - uint16_t *v3Device = nullptr; - int rc = 0; - bool aclInited = false; - bool deviceSet = false; - int deviceId = 0; - aclrtStream stream = nullptr; + constexpr size_t SZ_f32 = 2048; + constexpr size_t SZ_i16_signed = 2048; + constexpr size_t SZ_shift_boundary = 2048; - ACL_CHECK(aclInit(nullptr)); - aclInited = true; - if (const char *envDevice = std::getenv("ACL_DEVICE_ID")) - deviceId = std::atoi(envDevice); - ACL_CHECK(aclrtSetDevice(deviceId)); - deviceSet = true; + uint16_t *h_f32_v1=nullptr, *d_f32_v1=nullptr; + uint16_t *h_f32_v2=nullptr, *d_f32_v2=nullptr; + uint16_t *h_f32_v3=nullptr, *d_f32_v3=nullptr; + int16_t 
*h_i16_signed_v1=nullptr, *d_i16_signed_v1=nullptr; + int16_t *h_i16_signed_v2=nullptr, *d_i16_signed_v2=nullptr; + int16_t *h_i16_signed_v3=nullptr, *d_i16_signed_v3=nullptr; + uint16_t *h_shift_boundary_v1=nullptr, *d_shift_boundary_v1=nullptr; + uint16_t *h_shift_boundary_v2=nullptr, *d_shift_boundary_v2=nullptr; + uint16_t *h_shift_boundary_v3=nullptr, *d_shift_boundary_v3=nullptr; + int rc=0; bool aclInited=false,deviceSet=false; int deviceId=0; aclrtStream stream=nullptr; size_t fsz=0; + ACL_CHECK(aclInit(nullptr)); aclInited=true; + if(const char*e=std::getenv("ACL_DEVICE_ID")) deviceId=std::atoi(e); + ACL_CHECK(aclrtSetDevice(deviceId)); deviceSet=true; ACL_CHECK(aclrtCreateStream(&stream)); - ACL_CHECK(aclrtMallocHost((void **)(&v1Host), fileSize_v1)); - ACL_CHECK(aclrtMallocHost((void **)(&v2Host), fileSize_v2)); - ACL_CHECK(aclrtMallocHost((void **)(&v3Host), fileSize_v3)); - ACL_CHECK(aclrtMalloc((void **)&v1Device, fileSize_v1, ACL_MEM_MALLOC_HUGE_FIRST)); - ACL_CHECK(aclrtMalloc((void **)&v2Device, fileSize_v2, ACL_MEM_MALLOC_HUGE_FIRST)); - ACL_CHECK(aclrtMalloc((void **)&v3Device, fileSize_v3, ACL_MEM_MALLOC_HUGE_FIRST)); - ReadFile("./v1.bin", fileSize_v1, v1Host, fileSize_v1); - ReadFile("./v2.bin", fileSize_v2, v2Host, fileSize_v2); - ReadFile("./v3.bin", fileSize_v3, v3Host, fileSize_v3); - ACL_CHECK(aclrtMemcpy(v1Device, fileSize_v1, v1Host, fileSize_v1, - ACL_MEMCPY_HOST_TO_DEVICE)); - ACL_CHECK(aclrtMemcpy(v2Device, fileSize_v2, v2Host, fileSize_v2, - ACL_MEMCPY_HOST_TO_DEVICE)); - ACL_CHECK(aclrtMemcpy(v3Device, fileSize_v3, v3Host, fileSize_v3, - ACL_MEMCPY_HOST_TO_DEVICE)); - LaunchVshr_i16_unsigned_kernel(v1Device, v2Device, v3Device, stream); + ACL_CHECK(aclrtMallocHost((void**)&h_f32_v1,SZ_f32)); + ACL_CHECK(aclrtMallocHost((void**)&h_f32_v2,SZ_f32)); + ACL_CHECK(aclrtMallocHost((void**)&h_f32_v3,SZ_f32)); + ACL_CHECK(aclrtMallocHost((void**)&h_i16_signed_v1,SZ_i16_signed)); + 
ACL_CHECK(aclrtMallocHost((void**)&h_i16_signed_v2,SZ_i16_signed)); + ACL_CHECK(aclrtMallocHost((void**)&h_i16_signed_v3,SZ_i16_signed)); + ACL_CHECK(aclrtMallocHost((void**)&h_shift_boundary_v1,SZ_shift_boundary)); + ACL_CHECK(aclrtMallocHost((void**)&h_shift_boundary_v2,SZ_shift_boundary)); + ACL_CHECK(aclrtMallocHost((void**)&h_shift_boundary_v3,SZ_shift_boundary)); + ACL_CHECK(aclrtMalloc((void**)&d_f32_v1,SZ_f32,ACL_MEM_MALLOC_HUGE_FIRST)); + ACL_CHECK(aclrtMalloc((void**)&d_f32_v2,SZ_f32,ACL_MEM_MALLOC_HUGE_FIRST)); + ACL_CHECK(aclrtMalloc((void**)&d_f32_v3,SZ_f32,ACL_MEM_MALLOC_HUGE_FIRST)); + ACL_CHECK(aclrtMalloc((void**)&d_i16_signed_v1,SZ_i16_signed,ACL_MEM_MALLOC_HUGE_FIRST)); + ACL_CHECK(aclrtMalloc((void**)&d_i16_signed_v2,SZ_i16_signed,ACL_MEM_MALLOC_HUGE_FIRST)); + ACL_CHECK(aclrtMalloc((void**)&d_i16_signed_v3,SZ_i16_signed,ACL_MEM_MALLOC_HUGE_FIRST)); + ACL_CHECK(aclrtMalloc((void**)&d_shift_boundary_v1,SZ_shift_boundary,ACL_MEM_MALLOC_HUGE_FIRST)); + ACL_CHECK(aclrtMalloc((void**)&d_shift_boundary_v2,SZ_shift_boundary,ACL_MEM_MALLOC_HUGE_FIRST)); + ACL_CHECK(aclrtMalloc((void**)&d_shift_boundary_v3,SZ_shift_boundary,ACL_MEM_MALLOC_HUGE_FIRST)); + fsz=SZ_f32; FCK(ReadFile("v1.bin",fsz,h_f32_v1,SZ_f32)&&fsz==SZ_f32,"v1.bin"); + fsz=SZ_f32; FCK(ReadFile("v2.bin",fsz,h_f32_v2,SZ_f32)&&fsz==SZ_f32,"v2.bin"); + fsz=SZ_f32; FCK(ReadFile("v3.bin",fsz,h_f32_v3,SZ_f32)&&fsz==SZ_f32,"v3.bin"); + fsz=SZ_i16_signed; FCK(ReadFile("v1_i16_signed.bin",fsz,h_i16_signed_v1,SZ_i16_signed)&&fsz==SZ_i16_signed,"v1_i16_signed.bin"); + fsz=SZ_i16_signed; FCK(ReadFile("v2_i16_signed.bin",fsz,h_i16_signed_v2,SZ_i16_signed)&&fsz==SZ_i16_signed,"v2_i16_signed.bin"); + fsz=SZ_i16_signed; FCK(ReadFile("v3_i16_signed.bin",fsz,h_i16_signed_v3,SZ_i16_signed)&&fsz==SZ_i16_signed,"v3_i16_signed.bin"); + fsz=SZ_shift_boundary; FCK(ReadFile("v1_shift_boundary.bin",fsz,h_shift_boundary_v1,SZ_shift_boundary)&&fsz==SZ_shift_boundary,"v1_shift_boundary.bin"); + fsz=SZ_shift_boundary; 
FCK(ReadFile("v2_shift_boundary.bin",fsz,h_shift_boundary_v2,SZ_shift_boundary)&&fsz==SZ_shift_boundary,"v2_shift_boundary.bin"); + fsz=SZ_shift_boundary; FCK(ReadFile("v3_shift_boundary.bin",fsz,h_shift_boundary_v3,SZ_shift_boundary)&&fsz==SZ_shift_boundary,"v3_shift_boundary.bin"); + ACL_CHECK(aclrtMemcpy(d_f32_v1,SZ_f32,h_f32_v1,SZ_f32,ACL_MEMCPY_HOST_TO_DEVICE)); + ACL_CHECK(aclrtMemcpy(d_f32_v2,SZ_f32,h_f32_v2,SZ_f32,ACL_MEMCPY_HOST_TO_DEVICE)); + ACL_CHECK(aclrtMemcpy(d_f32_v3,SZ_f32,h_f32_v3,SZ_f32,ACL_MEMCPY_HOST_TO_DEVICE)); + ACL_CHECK(aclrtMemcpy(d_i16_signed_v1,SZ_i16_signed,h_i16_signed_v1,SZ_i16_signed,ACL_MEMCPY_HOST_TO_DEVICE)); + ACL_CHECK(aclrtMemcpy(d_i16_signed_v2,SZ_i16_signed,h_i16_signed_v2,SZ_i16_signed,ACL_MEMCPY_HOST_TO_DEVICE)); + ACL_CHECK(aclrtMemcpy(d_i16_signed_v3,SZ_i16_signed,h_i16_signed_v3,SZ_i16_signed,ACL_MEMCPY_HOST_TO_DEVICE)); + ACL_CHECK(aclrtMemcpy(d_shift_boundary_v1,SZ_shift_boundary,h_shift_boundary_v1,SZ_shift_boundary,ACL_MEMCPY_HOST_TO_DEVICE)); + ACL_CHECK(aclrtMemcpy(d_shift_boundary_v2,SZ_shift_boundary,h_shift_boundary_v2,SZ_shift_boundary,ACL_MEMCPY_HOST_TO_DEVICE)); + ACL_CHECK(aclrtMemcpy(d_shift_boundary_v3,SZ_shift_boundary,h_shift_boundary_v3,SZ_shift_boundary,ACL_MEMCPY_HOST_TO_DEVICE)); + LaunchVshrDeepMerged( + d_f32_v1, + d_f32_v2, + d_f32_v3, + d_i16_signed_v1, + d_i16_signed_v2, + d_i16_signed_v3, + d_shift_boundary_v1, + d_shift_boundary_v2, + d_shift_boundary_v3, + stream + ); ACL_CHECK(aclrtSynchronizeStream(stream)); - ACL_CHECK(aclrtMemcpy(v3Host, fileSize_v3, v3Device, fileSize_v3, - ACL_MEMCPY_DEVICE_TO_HOST)); - WriteFile("./v3.bin", v3Host, fileSize_v3); + ACL_CHECK(aclrtMemcpy(h_f32_v3,SZ_f32,d_f32_v3,SZ_f32,ACL_MEMCPY_DEVICE_TO_HOST)); + ACL_CHECK(aclrtMemcpy(h_i16_signed_v3,SZ_i16_signed,d_i16_signed_v3,SZ_i16_signed,ACL_MEMCPY_DEVICE_TO_HOST)); + ACL_CHECK(aclrtMemcpy(h_shift_boundary_v3,SZ_shift_boundary,d_shift_boundary_v3,SZ_shift_boundary,ACL_MEMCPY_DEVICE_TO_HOST)); + 
FCK(WriteFile("v3.bin",h_f32_v3,SZ_f32),"v3.bin"); + FCK(WriteFile("v3_i16_signed.bin",h_i16_signed_v3,SZ_i16_signed),"v3_i16_signed.bin"); + FCK(WriteFile("v3_shift_boundary.bin",h_shift_boundary_v3,SZ_shift_boundary),"v3_shift_boundary.bin"); cleanup: - aclrtFree(v1Device); - aclrtFree(v2Device); - aclrtFree(v3Device); - aclrtFreeHost(v1Host); - aclrtFreeHost(v2Host); - aclrtFreeHost(v3Host); - if (stream != nullptr) - aclrtDestroyStream(stream); - if (deviceSet) - aclrtResetDevice(deviceId); - if (aclInited) - aclFinalize(); + aclrtFree(d_f32_v1); + aclrtFree(d_f32_v2); + aclrtFree(d_f32_v3); + aclrtFree(d_i16_signed_v1); + aclrtFree(d_i16_signed_v2); + aclrtFree(d_i16_signed_v3); + aclrtFree(d_shift_boundary_v1); + aclrtFree(d_shift_boundary_v2); + aclrtFree(d_shift_boundary_v3); + aclrtFreeHost(h_f32_v1); + aclrtFreeHost(h_f32_v2); + aclrtFreeHost(h_f32_v3); + aclrtFreeHost(h_i16_signed_v1); + aclrtFreeHost(h_i16_signed_v2); + aclrtFreeHost(h_i16_signed_v3); + aclrtFreeHost(h_shift_boundary_v1); + aclrtFreeHost(h_shift_boundary_v2); + aclrtFreeHost(h_shift_boundary_v3); + if(stream) aclrtDestroyStream(stream); + if(deviceSet) aclrtResetDevice(deviceId); + if(aclInited) aclFinalize(); return rc; } diff --git a/test/vpto/cases/micro-op/binary-vector/vsub-tail/compare.py b/test/vpto/cases/micro-op/binary-vector/vsub-tail/compare.py deleted file mode 100644 index c95419953..000000000 --- a/test/vpto/cases/micro-op/binary-vector/vsub-tail/compare.py +++ /dev/null @@ -1,39 +0,0 @@ -#!/usr/bin/python3 -# Copyright (c) 2026 Huawei Technologies Co., Ltd. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. 
-# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. - - -import os -import sys -import numpy as np - - -def compare_bin_prefix(golden_path, output_path, dtype, eps, count): - if not os.path.exists(golden_path) or not os.path.exists(output_path): - return False - golden = np.fromfile(golden_path, dtype=dtype, count=count) - output = np.fromfile(output_path, dtype=dtype, count=count) - return golden.shape == output.shape and np.allclose( - golden, output, atol=eps, rtol=eps, equal_nan=True - ) - - -def main(): - strict = os.getenv("COMPARE_STRICT", "1") != "0" - ok = compare_bin_prefix("golden_v3.bin", "v3.bin", np.float32, 1e-4, 1000) - if not ok: - if strict: - print("[ERROR] compare failed") - sys.exit(2) - print("[WARN] compare failed (non-gating)") - return - print("[INFO] compare passed") - - -if __name__ == "__main__": - main() diff --git a/test/vpto/cases/micro-op/binary-vector/vsub-tail/golden.py b/test/vpto/cases/micro-op/binary-vector/vsub-tail/golden.py deleted file mode 100644 index 954e00c9b..000000000 --- a/test/vpto/cases/micro-op/binary-vector/vsub-tail/golden.py +++ /dev/null @@ -1,51 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) 2026 Huawei Technologies Co., Ltd. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. 
-# See LICENSE in the root of the software repository for the full text of the License. - -# coding=utf-8 - -import argparse -from pathlib import Path - -import numpy as np - - -ROWS = 32 -COLS = 32 -SEED = 19 -LOGICAL_ELEMS = 1000 -OUT_SENTINEL = np.float32(-123.25) - - -def generate(output_dir: Path, seed: int) -> None: - rng = np.random.default_rng(seed) - v1 = rng.random((ROWS, COLS), dtype=np.float32) - v2 = rng.random((ROWS, COLS), dtype=np.float32) - v3 = np.full((ROWS, COLS), OUT_SENTINEL, dtype=np.float32) - golden_v3 = np.full((ROWS, COLS), OUT_SENTINEL, dtype=np.float32) - golden_v3.reshape(-1)[:LOGICAL_ELEMS] = ( - v1.reshape(-1)[:LOGICAL_ELEMS] - v2.reshape(-1)[:LOGICAL_ELEMS] - ).astype(np.float32, copy=False) - - output_dir.mkdir(parents=True, exist_ok=True) - v1.reshape(-1).tofile(output_dir / "v1.bin") - v2.reshape(-1).tofile(output_dir / "v2.bin") - v3.reshape(-1).tofile(output_dir / "v3.bin") - golden_v3.reshape(-1).tofile(output_dir / "golden_v3.bin") - - -def main() -> None: - parser = argparse.ArgumentParser() - parser.add_argument("--output-dir", type=Path, default=Path(".")) - parser.add_argument("--seed", type=int, default=SEED) - args = parser.parse_args() - generate(args.output_dir, args.seed) - - -if __name__ == "__main__": - main() diff --git a/test/vpto/cases/micro-op/binary-vector/vsub-tail/kernel.pto b/test/vpto/cases/micro-op/binary-vector/vsub-tail/kernel.pto deleted file mode 100644 index 6d839ec58..000000000 --- a/test/vpto/cases/micro-op/binary-vector/vsub-tail/kernel.pto +++ /dev/null @@ -1,49 +0,0 @@ -module attributes {pto.target_arch = "a5", pto.kernel_kind = #pto.kernel_kind} { - func.func @vsub_tail_kernel_2d(%arg0: !pto.ptr, %arg1: !pto.ptr, - %arg2: !pto.ptr) attributes {pto.kernel} { - %c0 = arith.constant 0 : index - %c64 = arith.constant 64 : index - %c1024 = arith.constant 1024 : index - %c0_i64 = arith.constant 0 : i64 - %c1_i64 = arith.constant 1 : i64 - %c32_i64 = arith.constant 32 : i64 - %c128_i64 = 
arith.constant 128 : i64 - %c4096_i64 = arith.constant 4096 : i64 - %c8192_i64 = arith.constant 8192 : i64 - %c1000_i32 = arith.constant 1000 : i32 - - %ub_lhs = pto.castptr %c0_i64 : i64 -> !pto.ptr - %ub_rhs = pto.castptr %c4096_i64 : i64 -> !pto.ptr - %ub_out = pto.castptr %c8192_i64 : i64 -> !pto.ptr - - %false = arith.constant false - pto.mte_gm_ub %arg0, %ub_lhs, %c0_i64, %c128_i64 - nburst(%c32_i64, %c128_i64, %c128_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 - pto.mte_gm_ub %arg1, %ub_rhs, %c0_i64, %c128_i64 - nburst(%c32_i64, %c128_i64, %c128_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 - - pto.set_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] - pto.wait_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] - - pto.vecscope { - %_:1 = scf.for %offset = %c0 to %c1024 step %c64 iter_args(%remaining = %c1000_i32) -> (i32) { - %mask, %next_remaining = pto.plt_b32 %remaining : i32 -> !pto.mask, i32 - %lhs = pto.vlds %ub_lhs[%offset] : !pto.ptr -> !pto.vreg<64xf32> - %rhs = pto.vlds %ub_rhs[%offset] : !pto.ptr -> !pto.vreg<64xf32> - %diff = pto.vsub %lhs, %rhs, %mask : !pto.vreg<64xf32>, !pto.vreg<64xf32>, !pto.mask -> !pto.vreg<64xf32> - pto.vsts %diff, %ub_out[%offset], %mask : !pto.vreg<64xf32>, !pto.ptr, !pto.mask - scf.yield %next_remaining : i32 - } - } - - pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.mte_ub_gm %ub_out, %arg2, %c128_i64 - nburst(%c32_i64, %c128_i64, %c128_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64 - pto.barrier #pto.pipe - return - } -} diff --git a/test/vpto/cases/micro-op/binary-vector/vsub-tail/launch.cpp b/test/vpto/cases/micro-op/binary-vector/vsub-tail/launch.cpp deleted file mode 100644 index 01f113a97..000000000 --- a/test/vpto/cases/micro-op/binary-vector/vsub-tail/launch.cpp +++ /dev/null @@ -1,46 +0,0 @@ -// Copyright (c) 2026 Huawei Technologies Co., Ltd. 
-// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). -// Please refer to the License for details. You may not use this file except in compliance with the License. -// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -// See LICENSE in the root of the software repository for the full text of the License. - -#ifndef __VEC_SCOPE__ -#define __VEC_SCOPE__ -#endif - -#if defined(__CCE_AICORE__) && defined(__NPU_ARCH__) && (__NPU_ARCH__ == 2201) -typedef struct { unsigned char v; } hifloat8_t; -typedef struct { unsigned char v; } float8_e4m3_t; -typedef struct { unsigned char v; } float8_e5m2_t; -typedef struct { unsigned char v; } float8_e8m0_t; -typedef struct { unsigned char v; } float4_e1m2x2_t; -typedef struct { unsigned char v; } float4_e2m1x2_t; -#endif -#include -#if defined(__CCE_AICORE__) && defined(PTOAS_ENABLE_CCE_PRINT) -#include -#endif - -#if !defined(__CCE_AICORE__) && !defined(TMRGSORT_HPP) -struct MrgSortExecutedNumList { - uint16_t mrgSortList0; - uint16_t mrgSortList1; - uint16_t mrgSortList2; - uint16_t mrgSortList3; -}; -#endif -#ifndef __CPU_SIM -#include "acl/acl.h" -#endif - -extern "C" __global__ [aicore] void vsub_tail_kernel_2d(__gm__ float *v1, - __gm__ float *v2, - __gm__ float *v3); - -void LaunchVadd_tail_kernel_2d(float *v1, float *v2, float *v3, void *stream) { - vsub_tail_kernel_2d<<<1, nullptr, stream>>>((__gm__ float *)v1, - (__gm__ float *)v2, - (__gm__ float *)v3); -} diff --git a/test/vpto/cases/micro-op/binary-vector/vsub-tail/main.cpp b/test/vpto/cases/micro-op/binary-vector/vsub-tail/main.cpp deleted file mode 100644 index 40a9881d6..000000000 --- a/test/vpto/cases/micro-op/binary-vector/vsub-tail/main.cpp +++ /dev/null @@ -1,94 +0,0 @@ -// 
Copyright (c) 2026 Huawei Technologies Co., Ltd. -// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). -// Please refer to the License for details. You may not use this file except in compliance with the License. -// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -// See LICENSE in the root of the software repository for the full text of the License. - -/** -Copyright (c) 2025 Huawei Technologies Co., Ltd. -*/ - -#include "test_common.h" -#include "acl/acl.h" -#include -#include - -using namespace PtoTestCommon; - -#define ACL_CHECK(expr) \ - do { \ - const aclError _ret = (expr); \ - if (_ret != ACL_SUCCESS) { \ - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", #expr, \ - (int)_ret, __FILE__, __LINE__); \ - rc = 1; \ - goto cleanup; \ - } \ - } while (0) - -void LaunchVadd_tail_kernel_2d(float *v1, float *v2, float *v3, void *stream); - -int main() { - size_t elemCount_v1 = 1024; - size_t fileSize_v1 = elemCount_v1 * sizeof(float); - size_t elemCount_v2 = 1024; - size_t fileSize_v2 = elemCount_v2 * sizeof(float); - size_t elemCount_v3 = 1024; - size_t fileSize_v3 = elemCount_v3 * sizeof(float); - float *v1Host = nullptr; - float *v1Device = nullptr; - float *v2Host = nullptr; - float *v2Device = nullptr; - float *v3Host = nullptr; - float *v3Device = nullptr; - int rc = 0; - bool aclInited = false; - bool deviceSet = false; - int deviceId = 0; - aclrtStream stream = nullptr; - - ACL_CHECK(aclInit(nullptr)); - aclInited = true; - if (const char *envDevice = std::getenv("ACL_DEVICE_ID")) - deviceId = std::atoi(envDevice); - ACL_CHECK(aclrtSetDevice(deviceId)); - deviceSet = true; - ACL_CHECK(aclrtCreateStream(&stream)); - ACL_CHECK(aclrtMallocHost((void **)(&v1Host), 
fileSize_v1)); - ACL_CHECK(aclrtMallocHost((void **)(&v2Host), fileSize_v2)); - ACL_CHECK(aclrtMallocHost((void **)(&v3Host), fileSize_v3)); - ACL_CHECK(aclrtMalloc((void **)&v1Device, fileSize_v1, ACL_MEM_MALLOC_HUGE_FIRST)); - ACL_CHECK(aclrtMalloc((void **)&v2Device, fileSize_v2, ACL_MEM_MALLOC_HUGE_FIRST)); - ACL_CHECK(aclrtMalloc((void **)&v3Device, fileSize_v3, ACL_MEM_MALLOC_HUGE_FIRST)); - ReadFile("./v1.bin", fileSize_v1, v1Host, fileSize_v1); - ReadFile("./v2.bin", fileSize_v2, v2Host, fileSize_v2); - ReadFile("./v3.bin", fileSize_v3, v3Host, fileSize_v3); - ACL_CHECK(aclrtMemcpy(v1Device, fileSize_v1, v1Host, fileSize_v1, - ACL_MEMCPY_HOST_TO_DEVICE)); - ACL_CHECK(aclrtMemcpy(v2Device, fileSize_v2, v2Host, fileSize_v2, - ACL_MEMCPY_HOST_TO_DEVICE)); - ACL_CHECK(aclrtMemcpy(v3Device, fileSize_v3, v3Host, fileSize_v3, - ACL_MEMCPY_HOST_TO_DEVICE)); - LaunchVadd_tail_kernel_2d(v1Device, v2Device, v3Device, stream); - ACL_CHECK(aclrtSynchronizeStream(stream)); - ACL_CHECK(aclrtMemcpy(v3Host, fileSize_v3, v3Device, fileSize_v3, - ACL_MEMCPY_DEVICE_TO_HOST)); - WriteFile("./v3.bin", v3Host, fileSize_v3); - -cleanup: - aclrtFree(v1Device); - aclrtFree(v2Device); - aclrtFree(v3Device); - aclrtFreeHost(v1Host); - aclrtFreeHost(v2Host); - aclrtFreeHost(v3Host); - if (stream != nullptr) - aclrtDestroyStream(stream); - if (deviceSet) - aclrtResetDevice(deviceId); - if (aclInited) - aclFinalize(); - return rc; -} diff --git a/test/vpto/cases/micro-op/binary-vector/vsub/compare.py b/test/vpto/cases/micro-op/binary-vector/vsub/compare.py deleted file mode 100644 index a5f14dabc..000000000 --- a/test/vpto/cases/micro-op/binary-vector/vsub/compare.py +++ /dev/null @@ -1,37 +0,0 @@ -#!/usr/bin/python3 -# Copyright (c) 2026 Huawei Technologies Co., Ltd. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. 
You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. - - -import os -import sys -import numpy as np - - -def compare_bin(golden_path, output_path, dtype, eps): - if not os.path.exists(golden_path) or not os.path.exists(output_path): - return False - golden = np.fromfile(golden_path, dtype=dtype) - output = np.fromfile(output_path, dtype=dtype) - return golden.shape == output.shape and np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True) - - -def main(): - strict = os.getenv("COMPARE_STRICT", "1") != "0" - ok = compare_bin("golden_v3.bin", "v3.bin", np.float32, 1e-4) - if not ok: - if strict: - print("[ERROR] compare failed") - sys.exit(2) - print("[WARN] compare failed (non-gating)") - return - print("[INFO] compare passed") - - -if __name__ == "__main__": - main() diff --git a/test/vpto/cases/micro-op/binary-vector/vsub/golden.py b/test/vpto/cases/micro-op/binary-vector/vsub/golden.py deleted file mode 100644 index 2f3f82fe6..000000000 --- a/test/vpto/cases/micro-op/binary-vector/vsub/golden.py +++ /dev/null @@ -1,46 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) 2026 Huawei Technologies Co., Ltd. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. 
-# See LICENSE in the root of the software repository for the full text of the License. - -# coding=utf-8 - -import argparse -from pathlib import Path - -import numpy as np - - -ROWS = 32 -COLS = 32 -SEED = 19 - - -def generate(output_dir: Path, seed: int) -> None: - rng = np.random.default_rng(seed) - v1 = rng.uniform(-8.0, 8.0, size=(ROWS, COLS)).astype(np.float32) - v2 = rng.uniform(-8.0, 8.0, size=(ROWS, COLS)).astype(np.float32) - golden_v3 = (v1 - v2).astype(np.float32, copy=False) - v3 = np.zeros((ROWS, COLS), dtype=np.float32) - - output_dir.mkdir(parents=True, exist_ok=True) - v1.reshape(-1).tofile(output_dir / "v1.bin") - v2.reshape(-1).tofile(output_dir / "v2.bin") - v3.reshape(-1).tofile(output_dir / "v3.bin") - golden_v3.reshape(-1).tofile(output_dir / "golden_v3.bin") - - -def main() -> None: - parser = argparse.ArgumentParser() - parser.add_argument("--output-dir", type=Path, default=Path(".")) - parser.add_argument("--seed", type=int, default=SEED) - args = parser.parse_args() - generate(args.output_dir, args.seed) - - -if __name__ == "__main__": - main() diff --git a/test/vpto/cases/micro-op/binary-vector/vsub/kernel.pto b/test/vpto/cases/micro-op/binary-vector/vsub/kernel.pto deleted file mode 100644 index 2d3847fc0..000000000 --- a/test/vpto/cases/micro-op/binary-vector/vsub/kernel.pto +++ /dev/null @@ -1,50 +0,0 @@ -module attributes {pto.target_arch = "a5", pto.kernel_kind = #pto.kernel_kind} { - func.func @sub_kernel_2d(%arg0: !pto.ptr, %arg1: !pto.ptr, - %arg2: !pto.ptr) attributes {pto.kernel} { - %c0 = arith.constant 0 : index - %c1 = arith.constant 1 : index - %c64 = arith.constant 64 : index - %c1024 = arith.constant 1024 : index - %c0_i64 = arith.constant 0 : i64 - %c1_i64 = arith.constant 1 : i64 - %c32_i64 = arith.constant 32 : i64 - %c128_i64 = arith.constant 128 : i64 - %c4096_i64 = arith.constant 4096 : i64 - %c8192_i64 = arith.constant 8192 : i64 - %c1024_i32 = arith.constant 1024 : i32 - - %ub_lhs = pto.castptr %c0_i64 : i64 -> 
!pto.ptr - %ub_rhs = pto.castptr %c4096_i64 : i64 -> !pto.ptr - %ub_out = pto.castptr %c8192_i64 : i64 -> !pto.ptr - - %false = arith.constant false - pto.mte_gm_ub %arg0, %ub_lhs, %c0_i64, %c128_i64 - nburst(%c32_i64, %c128_i64, %c128_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 - pto.mte_gm_ub %arg1, %ub_rhs, %c0_i64, %c128_i64 - nburst(%c32_i64, %c128_i64, %c128_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 - - pto.set_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] - pto.wait_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] - - pto.vecscope { - %_:1 = scf.for %offset = %c0 to %c1024 step %c64 iter_args(%remaining = %c1024_i32) -> (i32) { - %mask, %next_remaining = pto.plt_b32 %remaining : i32 -> !pto.mask, i32 - %lhs = pto.vlds %ub_lhs[%offset] : !pto.ptr -> !pto.vreg<64xf32> - %rhs = pto.vlds %ub_rhs[%offset] : !pto.ptr -> !pto.vreg<64xf32> - %diff = pto.vsub %lhs, %rhs, %mask : !pto.vreg<64xf32>, !pto.vreg<64xf32>, !pto.mask -> !pto.vreg<64xf32> - pto.vsts %diff, %ub_out[%offset], %mask : !pto.vreg<64xf32>, !pto.ptr, !pto.mask - scf.yield %next_remaining : i32 - } - } - - pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.mte_ub_gm %ub_out, %arg2, %c128_i64 - nburst(%c32_i64, %c128_i64, %c128_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64 - pto.barrier #pto.pipe - return - } -} diff --git a/test/vpto/cases/micro-op/binary-vector/vsub/launch.cpp b/test/vpto/cases/micro-op/binary-vector/vsub/launch.cpp deleted file mode 100644 index daeaeb5de..000000000 --- a/test/vpto/cases/micro-op/binary-vector/vsub/launch.cpp +++ /dev/null @@ -1,46 +0,0 @@ -// Copyright (c) 2026 Huawei Technologies Co., Ltd. -// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). -// Please refer to the License for details. You may not use this file except in compliance with the License. 
-// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -// See LICENSE in the root of the software repository for the full text of the License. - -#ifndef __VEC_SCOPE__ -#define __VEC_SCOPE__ -#endif - -#if defined(__CCE_AICORE__) && defined(__NPU_ARCH__) && (__NPU_ARCH__ == 2201) -typedef struct { unsigned char v; } hifloat8_t; -typedef struct { unsigned char v; } float8_e4m3_t; -typedef struct { unsigned char v; } float8_e5m2_t; -typedef struct { unsigned char v; } float8_e8m0_t; -typedef struct { unsigned char v; } float4_e1m2x2_t; -typedef struct { unsigned char v; } float4_e2m1x2_t; -#endif -#include -#if defined(__CCE_AICORE__) && defined(PTOAS_ENABLE_CCE_PRINT) -#include -#endif - -#if !defined(__CCE_AICORE__) && !defined(TMRGSORT_HPP) -struct MrgSortExecutedNumList { - uint16_t mrgSortList0; - uint16_t mrgSortList1; - uint16_t mrgSortList2; - uint16_t mrgSortList3; -}; -#endif -#ifndef __CPU_SIM -#include "acl/acl.h" -#endif - -extern "C" __global__ [aicore] void sub_kernel_2d(__gm__ float *v1, - __gm__ float *v2, - __gm__ float *v3); - -void LaunchSub_kernel_2d(float *v1, float *v2, float *v3, void *stream) { - sub_kernel_2d<<<1, nullptr, stream>>>((__gm__ float *)v1, - (__gm__ float *)v2, - (__gm__ float *)v3); -} diff --git a/test/vpto/cases/micro-op/binary-vector/vsub/main.cpp b/test/vpto/cases/micro-op/binary-vector/vsub/main.cpp deleted file mode 100644 index 0c7c8359a..000000000 --- a/test/vpto/cases/micro-op/binary-vector/vsub/main.cpp +++ /dev/null @@ -1,94 +0,0 @@ -// Copyright (c) 2026 Huawei Technologies Co., Ltd. -// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). -// Please refer to the License for details. 
You may not use this file except in compliance with the License. -// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -// See LICENSE in the root of the software repository for the full text of the License. - -/** -Copyright (c) 2025 Huawei Technologies Co., Ltd. -*/ - -#include "test_common.h" -#include "acl/acl.h" -#include -#include - -using namespace PtoTestCommon; - -#define ACL_CHECK(expr) \ - do { \ - const aclError _ret = (expr); \ - if (_ret != ACL_SUCCESS) { \ - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", #expr, \ - (int)_ret, __FILE__, __LINE__); \ - rc = 1; \ - goto cleanup; \ - } \ - } while (0) - -void LaunchSub_kernel_2d(float *v1, float *v2, float *v3, void *stream); - -int main() { - size_t elemCount_v1 = 1024; - size_t fileSize_v1 = elemCount_v1 * sizeof(float); - size_t elemCount_v2 = 1024; - size_t fileSize_v2 = elemCount_v2 * sizeof(float); - size_t elemCount_v3 = 1024; - size_t fileSize_v3 = elemCount_v3 * sizeof(float); - float *v1Host = nullptr; - float *v1Device = nullptr; - float *v2Host = nullptr; - float *v2Device = nullptr; - float *v3Host = nullptr; - float *v3Device = nullptr; - int rc = 0; - bool aclInited = false; - bool deviceSet = false; - int deviceId = 0; - aclrtStream stream = nullptr; - - ACL_CHECK(aclInit(nullptr)); - aclInited = true; - if (const char *envDevice = std::getenv("ACL_DEVICE_ID")) - deviceId = std::atoi(envDevice); - ACL_CHECK(aclrtSetDevice(deviceId)); - deviceSet = true; - ACL_CHECK(aclrtCreateStream(&stream)); - ACL_CHECK(aclrtMallocHost((void **)(&v1Host), fileSize_v1)); - ACL_CHECK(aclrtMallocHost((void **)(&v2Host), fileSize_v2)); - ACL_CHECK(aclrtMallocHost((void **)(&v3Host), fileSize_v3)); - ACL_CHECK(aclrtMalloc((void **)&v1Device, fileSize_v1, ACL_MEM_MALLOC_HUGE_FIRST)); - ACL_CHECK(aclrtMalloc((void **)&v2Device, fileSize_v2, 
ACL_MEM_MALLOC_HUGE_FIRST)); - ACL_CHECK(aclrtMalloc((void **)&v3Device, fileSize_v3, ACL_MEM_MALLOC_HUGE_FIRST)); - ReadFile("./v1.bin", fileSize_v1, v1Host, fileSize_v1); - ReadFile("./v2.bin", fileSize_v2, v2Host, fileSize_v2); - ReadFile("./v3.bin", fileSize_v3, v3Host, fileSize_v3); - ACL_CHECK(aclrtMemcpy(v1Device, fileSize_v1, v1Host, fileSize_v1, - ACL_MEMCPY_HOST_TO_DEVICE)); - ACL_CHECK(aclrtMemcpy(v2Device, fileSize_v2, v2Host, fileSize_v2, - ACL_MEMCPY_HOST_TO_DEVICE)); - ACL_CHECK(aclrtMemcpy(v3Device, fileSize_v3, v3Host, fileSize_v3, - ACL_MEMCPY_HOST_TO_DEVICE)); - LaunchSub_kernel_2d(v1Device, v2Device, v3Device, stream); - ACL_CHECK(aclrtSynchronizeStream(stream)); - ACL_CHECK(aclrtMemcpy(v3Host, fileSize_v3, v3Device, fileSize_v3, - ACL_MEMCPY_DEVICE_TO_HOST)); - WriteFile("./v3.bin", v3Host, fileSize_v3); - -cleanup: - aclrtFree(v1Device); - aclrtFree(v2Device); - aclrtFree(v3Device); - aclrtFreeHost(v1Host); - aclrtFreeHost(v2Host); - aclrtFreeHost(v3Host); - if (stream != nullptr) - aclrtDestroyStream(stream); - if (deviceSet) - aclrtResetDevice(deviceId); - if (aclInited) - aclFinalize(); - return rc; -} diff --git a/test/vpto/cases/micro-op/binary-vector/vsubc-borrow-boundary/compare.py b/test/vpto/cases/micro-op/binary-vector/vsubc-borrow-boundary/compare.py deleted file mode 100755 index 67df8a750..000000000 --- a/test/vpto/cases/micro-op/binary-vector/vsubc-borrow-boundary/compare.py +++ /dev/null @@ -1,65 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) 2026 Huawei Technologies Co., Ltd. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. 
-# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. - -# case: micro-op/binary-vector/vsubc-borrow-boundary -# family: binary-vector -# target_ops: pto.vsubc -# scenarios: core-u32-unsigned, full-mask, carry-chain -# NOTE: bulk-generated coverage skeleton. - -import os -import sys -import numpy as np - - -REPEAT_BYTES = 256 -LOGICAL_ELEMS = 64 -SRC_ELEM_BYTES = 4 - - -def _ceil_div(x, y): - return (x + y - 1) // y - - -def _packed_pred_storage_bytes(logical_elems, src_elem_bytes): - repeat_elems = REPEAT_BYTES // src_elem_bytes - repeat_times = _ceil_div(logical_elems, repeat_elems) + 1 - loop_count = repeat_times // 2 - return loop_count * 16 - - -def compare_result(): - golden = np.fromfile("golden_v3.bin", dtype=np.uint32, count=64) - output = np.fromfile("v3.bin", dtype=np.uint32, count=64) - return golden.shape == output.shape and np.array_equal(golden, output) - - -def compare_borrow(): - prefix_bytes = _packed_pred_storage_bytes(LOGICAL_ELEMS, SRC_ELEM_BYTES) - golden = np.fromfile("golden_v4.bin", dtype=np.uint8) - output = np.fromfile("v4.bin", dtype=np.uint8) - if golden.size < prefix_bytes or output.size < prefix_bytes: - return False - return np.array_equal(golden[:prefix_bytes], output[:prefix_bytes]) - - -def main(): - strict = os.getenv("COMPARE_STRICT", "1") != "0" - ok = compare_result() and compare_borrow() - if not ok: - if strict: - print("[ERROR] compare failed") - sys.exit(2) - print("[WARN] compare failed (non-gating)") - return - print("[INFO] compare passed") - - -if __name__ == "__main__": - main() diff --git a/test/vpto/cases/micro-op/binary-vector/vsubc-borrow-boundary/golden.py b/test/vpto/cases/micro-op/binary-vector/vsubc-borrow-boundary/golden.py deleted file mode 100755 index 
cf6db0014..000000000 --- a/test/vpto/cases/micro-op/binary-vector/vsubc-borrow-boundary/golden.py +++ /dev/null @@ -1,66 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) 2026 Huawei Technologies Co., Ltd. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. - -# case: micro-op/binary-vector/vsubc-borrow-boundary -# family: binary-vector -# target_ops: pto.vsubc -# scenarios: core-u32-unsigned, full-mask, carry-chain -# NOTE: bulk-generated coverage skeleton. -# coding=utf-8 - -import argparse -from pathlib import Path - -import numpy as np - - -LANES = 64 -SEED = 19 - - -def generate(output_dir: Path, seed: int) -> None: - del seed - v1 = np.zeros(LANES, dtype=np.uint32) - v2 = np.zeros(LANES, dtype=np.uint32) - pattern_lhs = np.array([0x00000000, 0x00000001, 0x7FFFFFFF, 0x80000000], dtype=np.uint32) - pattern_rhs = np.array([0x00000001, 0x00000002, 0x80000000, 0xFFFFFFFF], dtype=np.uint32) - reps = LANES // pattern_lhs.size - v1[:] = np.tile(pattern_lhs, reps) - v2[:] = np.tile(pattern_rhs, reps) - no_borrow = v1 >= v2 - result = (v1 - v2).astype(np.uint32, copy=False) - packed = np.zeros(256, dtype=np.uint8) - for idx, bit in enumerate(no_borrow): - if not bit: - continue - byte = idx // 2 - if idx % 2 == 0: - packed[byte] |= np.uint8(0x1) - else: - packed[byte] |= np.uint8(0x10) - - output_dir.mkdir(parents=True, exist_ok=True) - v1.tofile(output_dir / "v1.bin") - v2.tofile(output_dir / "v2.bin") - np.zeros(LANES, 
dtype=np.uint32).tofile(output_dir / "v3.bin") - np.zeros(256, dtype=np.uint8).tofile(output_dir / "v4.bin") - result.tofile(output_dir / "golden_v3.bin") - packed.tofile(output_dir / "golden_v4.bin") - - -def main() -> None: - parser = argparse.ArgumentParser() - parser.add_argument("--output-dir", type=Path, default=Path(".")) - parser.add_argument("--seed", type=int, default=SEED) - args = parser.parse_args() - generate(args.output_dir, args.seed) - - -if __name__ == "__main__": - main() diff --git a/test/vpto/cases/micro-op/binary-vector/vsubc-borrow-boundary/kernel.pto b/test/vpto/cases/micro-op/binary-vector/vsubc-borrow-boundary/kernel.pto deleted file mode 100644 index 6c5586aa9..000000000 --- a/test/vpto/cases/micro-op/binary-vector/vsubc-borrow-boundary/kernel.pto +++ /dev/null @@ -1,57 +0,0 @@ -// ----------------------------------------------------------------------------- -// case: micro-op/binary-vector/vsubc-borrow-boundary -// family: binary-vector -// target_ops: pto.vsubc -// scenarios: core-u32-unsigned, full-mask, carry-chain -// NOTE: bulk-generated coverage skeleton. Parser/verifier/lowering failure is -// still a valid test conclusion in the current coverage-first phase. 
-// ----------------------------------------------------------------------------- -module attributes {pto.target_arch = "a5", pto.kernel_kind = #pto.kernel_kind} { - func.func @vsubc_borrow_boundary_kernel_2d(%arg0: !pto.ptr, %arg1: !pto.ptr, - %arg2: !pto.ptr, %arg3: !pto.ptr) attributes {pto.kernel} { - %c0 = arith.constant 0 : index - %c0_i64 = arith.constant 0 : i64 - %c1_i64 = arith.constant 1 : i64 - %c128_i64 = arith.constant 128 : i64 - %c256_i64 = arith.constant 256 : i64 - %c4096_i64 = arith.constant 4096 : i64 - %c8192_i64 = arith.constant 8192 : i64 - %c12288_i64 = arith.constant 12288 : i64 - - %ub_lhs = pto.castptr %c0_i64 : i64 -> !pto.ptr - %ub_rhs = pto.castptr %c4096_i64 : i64 -> !pto.ptr - %ub_out = pto.castptr %c8192_i64 : i64 -> !pto.ptr - %ub_borrow = pto.castptr %c12288_i64 : i64 -> !pto.ptr - - %false = arith.constant false - pto.mte_gm_ub %arg0, %ub_lhs, %c0_i64, %c256_i64 - nburst(%c1_i64, %c256_i64, %c256_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 - pto.mte_gm_ub %arg1, %ub_rhs, %c0_i64, %c256_i64 - nburst(%c1_i64, %c256_i64, %c256_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 - - pto.set_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] - pto.wait_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] - - pto.vecscope { - %mask = pto.pset_b32 "PAT_ALL" : !pto.mask - %lhs = pto.vlds %ub_lhs[%c0] : !pto.ptr -> !pto.vreg<64xui32> - %rhs = pto.vlds %ub_rhs[%c0] : !pto.ptr -> !pto.vreg<64xui32> - %diff, %borrow = pto.vsubc %lhs, %rhs, %mask : !pto.vreg<64xui32>, !pto.vreg<64xui32>, !pto.mask -> !pto.vreg<64xui32>, !pto.mask - pto.vsts %diff, %ub_out[%c0], %mask : !pto.vreg<64xui32>, !pto.ptr, !pto.mask - pto.psti %borrow, %ub_borrow[%c0], "NORM" : !pto.mask, !pto.ptr, index - } - - pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.mte_ub_gm %ub_out, %arg2, %c256_i64 - nburst(%c1_i64, %c256_i64, %c256_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64 - pto.mte_ub_gm %ub_borrow, %arg3, %c128_i64 
- nburst(%c1_i64, %c256_i64, %c256_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64 - pto.barrier #pto.pipe - return - } -} diff --git a/test/vpto/cases/micro-op/binary-vector/vsubc-borrow-boundary/launch.cpp b/test/vpto/cases/micro-op/binary-vector/vsubc-borrow-boundary/launch.cpp deleted file mode 100644 index 972a52f2d..000000000 --- a/test/vpto/cases/micro-op/binary-vector/vsubc-borrow-boundary/launch.cpp +++ /dev/null @@ -1,56 +0,0 @@ -// Copyright (c) 2026 Huawei Technologies Co., Ltd. -// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). -// Please refer to the License for details. You may not use this file except in compliance with the License. -// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -// See LICENSE in the root of the software repository for the full text of the License. - -// ----------------------------------------------------------------------------- -// case: micro-op/binary-vector/vsubc-borrow-boundary -// family: binary-vector -// target_ops: pto.vsubc -// scenarios: core-u32-unsigned, full-mask, carry-chain -// NOTE: bulk-generated coverage skeleton. Parser/verifier/lowering failure is -// still a valid test conclusion in the current coverage-first phase. 
-// ----------------------------------------------------------------------------- -#ifndef __VEC_SCOPE__ -#define __VEC_SCOPE__ -#endif - -#if defined(__CCE_AICORE__) && defined(__NPU_ARCH__) && (__NPU_ARCH__ == 2201) -typedef struct { unsigned char v; } hifloat8_t; -typedef struct { unsigned char v; } float8_e4m3_t; -typedef struct { unsigned char v; } float8_e5m2_t; -typedef struct { unsigned char v; } float8_e8m0_t; -typedef struct { unsigned char v; } float4_e1m2x2_t; -typedef struct { unsigned char v; } float4_e2m1x2_t; -#endif -#include -#if defined(__CCE_AICORE__) && defined(PTOAS_ENABLE_CCE_PRINT) -#include -#endif - -#if !defined(__CCE_AICORE__) && !defined(TMRGSORT_HPP) -struct MrgSortExecutedNumList { - uint16_t mrgSortList0; - uint16_t mrgSortList1; - uint16_t mrgSortList2; - uint16_t mrgSortList3; -}; -#endif -#ifndef __CPU_SIM -#include "acl/acl.h" -#endif - -extern "C" __global__ [aicore] void -vsubc_borrow_boundary_kernel_2d(__gm__ uint32_t *v1, __gm__ uint32_t *v2, - __gm__ uint32_t *v3, __gm__ uint8_t *v4); - -void LaunchVsubc_borrow_boundary_kernel_2d(uint32_t *v1, uint32_t *v2, - uint32_t *v3, uint8_t *v4, - void *stream) { - vsubc_borrow_boundary_kernel_2d<<<1, nullptr, stream>>>( - (__gm__ uint32_t *)v1, (__gm__ uint32_t *)v2, (__gm__ uint32_t *)v3, - (__gm__ uint8_t *)v4); -} diff --git a/test/vpto/cases/micro-op/binary-vector/vsubc-borrow-boundary/main.cpp b/test/vpto/cases/micro-op/binary-vector/vsubc-borrow-boundary/main.cpp deleted file mode 100644 index 43b2eb292..000000000 --- a/test/vpto/cases/micro-op/binary-vector/vsubc-borrow-boundary/main.cpp +++ /dev/null @@ -1,119 +0,0 @@ -// Copyright (c) 2026 Huawei Technologies Co., Ltd. -// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). -// Please refer to the License for details. You may not use this file except in compliance with the License. 
-// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -// See LICENSE in the root of the software repository for the full text of the License. - -// ----------------------------------------------------------------------------- -// case: micro-op/binary-vector/vsubc-borrow-boundary -// family: binary-vector -// target_ops: pto.vsubc -// scenarios: core-u32-unsigned, full-mask, carry-chain -// NOTE: bulk-generated coverage skeleton. Parser/verifier/lowering failure is -// still a valid test conclusion in the current coverage-first phase. -// ----------------------------------------------------------------------------- -/** -Copyright (c) 2025 Huawei Technologies Co., Ltd. -*/ - -#include "test_common.h" -#include "acl/acl.h" -#include -#include - -using namespace PtoTestCommon; - -#define ACL_CHECK(expr) \ - do { \ - const aclError _ret = (expr); \ - if (_ret != ACL_SUCCESS) { \ - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", #expr, \ - (int)_ret, __FILE__, __LINE__); \ - rc = 1; \ - goto cleanup; \ - } \ - } while (0) - -void LaunchVsubc_borrow_boundary_kernel_2d(uint32_t *v1, uint32_t *v2, - uint32_t *v3, uint8_t *v4, - void *stream); - -int main() { - size_t elemCount_v1 = 64; - size_t fileSize_v1 = elemCount_v1 * sizeof(uint32_t); - size_t elemCount_v2 = 64; - size_t fileSize_v2 = elemCount_v2 * sizeof(uint32_t); - size_t elemCount_v3 = 64; - size_t fileSize_v3 = elemCount_v3 * sizeof(uint32_t); - size_t elemCount_v4 = 256; - size_t fileSize_v4 = elemCount_v4 * sizeof(uint8_t); - uint32_t *v1Host = nullptr; - uint32_t *v1Device = nullptr; - uint32_t *v2Host = nullptr; - uint32_t *v2Device = nullptr; - uint32_t *v3Host = nullptr; - uint32_t *v3Device = nullptr; - uint8_t *v4Host = nullptr; - uint8_t *v4Device = nullptr; - int rc = 0; - bool aclInited = false; - bool deviceSet = false; - int 
deviceId = 0; - aclrtStream stream = nullptr; - - ACL_CHECK(aclInit(nullptr)); - aclInited = true; - if (const char *envDevice = std::getenv("ACL_DEVICE_ID")) - deviceId = std::atoi(envDevice); - ACL_CHECK(aclrtSetDevice(deviceId)); - deviceSet = true; - ACL_CHECK(aclrtCreateStream(&stream)); - ACL_CHECK(aclrtMallocHost((void **)(&v1Host), fileSize_v1)); - ACL_CHECK(aclrtMallocHost((void **)(&v2Host), fileSize_v2)); - ACL_CHECK(aclrtMallocHost((void **)(&v3Host), fileSize_v3)); - ACL_CHECK(aclrtMallocHost((void **)(&v4Host), fileSize_v4)); - ACL_CHECK(aclrtMalloc((void **)&v1Device, fileSize_v1, ACL_MEM_MALLOC_HUGE_FIRST)); - ACL_CHECK(aclrtMalloc((void **)&v2Device, fileSize_v2, ACL_MEM_MALLOC_HUGE_FIRST)); - ACL_CHECK(aclrtMalloc((void **)&v3Device, fileSize_v3, ACL_MEM_MALLOC_HUGE_FIRST)); - ACL_CHECK(aclrtMalloc((void **)&v4Device, fileSize_v4, ACL_MEM_MALLOC_HUGE_FIRST)); - ReadFile("./v1.bin", fileSize_v1, v1Host, fileSize_v1); - ReadFile("./v2.bin", fileSize_v2, v2Host, fileSize_v2); - ReadFile("./v3.bin", fileSize_v3, v3Host, fileSize_v3); - ReadFile("./v4.bin", fileSize_v4, v4Host, fileSize_v4); - ACL_CHECK(aclrtMemcpy(v1Device, fileSize_v1, v1Host, fileSize_v1, - ACL_MEMCPY_HOST_TO_DEVICE)); - ACL_CHECK(aclrtMemcpy(v2Device, fileSize_v2, v2Host, fileSize_v2, - ACL_MEMCPY_HOST_TO_DEVICE)); - ACL_CHECK(aclrtMemcpy(v3Device, fileSize_v3, v3Host, fileSize_v3, - ACL_MEMCPY_HOST_TO_DEVICE)); - ACL_CHECK(aclrtMemcpy(v4Device, fileSize_v4, v4Host, fileSize_v4, - ACL_MEMCPY_HOST_TO_DEVICE)); - LaunchVsubc_borrow_boundary_kernel_2d(v1Device, v2Device, v3Device, v4Device, - stream); - ACL_CHECK(aclrtSynchronizeStream(stream)); - ACL_CHECK(aclrtMemcpy(v3Host, fileSize_v3, v3Device, fileSize_v3, - ACL_MEMCPY_DEVICE_TO_HOST)); - ACL_CHECK(aclrtMemcpy(v4Host, fileSize_v4, v4Device, fileSize_v4, - ACL_MEMCPY_DEVICE_TO_HOST)); - WriteFile("./v3.bin", v3Host, fileSize_v3); - WriteFile("./v4.bin", v4Host, fileSize_v4); - -cleanup: - aclrtFree(v1Device); - 
aclrtFree(v2Device); - aclrtFree(v3Device); - aclrtFree(v4Device); - aclrtFreeHost(v1Host); - aclrtFreeHost(v2Host); - aclrtFreeHost(v3Host); - aclrtFreeHost(v4Host); - if (stream != nullptr) - aclrtDestroyStream(stream); - if (deviceSet) - aclrtResetDevice(deviceId); - if (aclInited) - aclFinalize(); - return rc; -} diff --git a/test/vpto/cases/micro-op/binary-vector/vsubc/compare.py b/test/vpto/cases/micro-op/binary-vector/vsubc/compare.py deleted file mode 100755 index f68c8267e..000000000 --- a/test/vpto/cases/micro-op/binary-vector/vsubc/compare.py +++ /dev/null @@ -1,64 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) 2026 Huawei Technologies Co., Ltd. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. 
- -# case: micro-op/binary-vector/vsubc -# family: binary-vector -# target_ops: pto.vsubc -# scenarios: core-u32-unsigned, full-mask, carry-chain - -import os -import sys -import numpy as np - - -REPEAT_BYTES = 256 -LOGICAL_ELEMS = 64 -SRC_ELEM_BYTES = 4 - - -def _ceil_div(x, y): - return (x + y - 1) // y - - -def _packed_pred_storage_bytes(logical_elems, src_elem_bytes): - repeat_elems = REPEAT_BYTES // src_elem_bytes - repeat_times = _ceil_div(logical_elems, repeat_elems) + 1 - loop_count = repeat_times // 2 - return loop_count * 16 - - -def compare_result(): - golden = np.fromfile("golden_v3.bin", dtype=np.uint32, count=64) - output = np.fromfile("v3.bin", dtype=np.uint32, count=64) - return golden.shape == output.shape and np.array_equal(golden, output) - - -def compare_borrow(): - prefix_bytes = _packed_pred_storage_bytes(LOGICAL_ELEMS, SRC_ELEM_BYTES) - golden = np.fromfile("golden_v4.bin", dtype=np.uint8) - output = np.fromfile("v4.bin", dtype=np.uint8) - if golden.size < prefix_bytes or output.size < prefix_bytes: - return False - return np.array_equal(golden[:prefix_bytes], output[:prefix_bytes]) - - -def main(): - strict = os.getenv("COMPARE_STRICT", "1") != "0" - ok = compare_result() and compare_borrow() - if not ok: - if strict: - print("[ERROR] compare failed") - sys.exit(2) - print("[WARN] compare failed (non-gating)") - return - print("[INFO] compare passed") - - -if __name__ == "__main__": - main() diff --git a/test/vpto/cases/micro-op/binary-vector/vsubc/golden.py b/test/vpto/cases/micro-op/binary-vector/vsubc/golden.py deleted file mode 100755 index 1b647ac6c..000000000 --- a/test/vpto/cases/micro-op/binary-vector/vsubc/golden.py +++ /dev/null @@ -1,63 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) 2026 Huawei Technologies Co., Ltd. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). 
-# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. - -# case: micro-op/binary-vector/vsubc -# family: binary-vector -# target_ops: pto.vsubc -# scenarios: core-u32-unsigned, full-mask, carry-chain - -import argparse -from pathlib import Path - -import numpy as np - - -LANES = 64 -SEED = 19 - - -def pack_mask_nibbles(bits): - out = np.zeros(256, dtype=np.uint8) - for idx, bit in enumerate(bits): - if not bit: - continue - byte = idx // 2 - if idx % 2 == 0: - out[byte] |= np.uint8(0x1) - else: - out[byte] |= np.uint8(0x10) - return out - - -def generate(output_dir: Path, seed: int) -> None: - rng = np.random.default_rng(seed) - v1 = rng.integers(0, 0xFFFFFFFF, size=LANES, dtype=np.uint32) - v2 = rng.integers(0, 0xFFFFFFFF, size=LANES, dtype=np.uint32) - diff = (v1 - v2).astype(np.uint32, copy=False) - no_borrow = v1 >= v2 - - output_dir.mkdir(parents=True, exist_ok=True) - v1.tofile(output_dir / "v1.bin") - v2.tofile(output_dir / "v2.bin") - np.zeros(LANES, dtype=np.uint32).tofile(output_dir / "v3.bin") - np.zeros(256, dtype=np.uint8).tofile(output_dir / "v4.bin") - diff.tofile(output_dir / "golden_v3.bin") - pack_mask_nibbles(no_borrow).tofile(output_dir / "golden_v4.bin") - - -def main() -> None: - parser = argparse.ArgumentParser() - parser.add_argument("--output-dir", type=Path, default=Path(".")) - parser.add_argument("--seed", type=int, default=SEED) - args = parser.parse_args() - generate(args.output_dir, args.seed) - - -if __name__ == "__main__": - main() diff --git a/test/vpto/cases/micro-op/binary-vector/vsubc/kernel.pto b/test/vpto/cases/micro-op/binary-vector/vsubc/kernel.pto deleted file mode 100644 
index d9eedc19b..000000000 --- a/test/vpto/cases/micro-op/binary-vector/vsubc/kernel.pto +++ /dev/null @@ -1,57 +0,0 @@ -// ----------------------------------------------------------------------------- -// case: micro-op/binary-vector/vsubc -// family: binary-vector -// target_ops: pto.vsubc -// scenarios: core-i16-unsigned, full-mask, carry-chain -// NOTE: bulk-generated coverage skeleton. Parser/verifier/lowering failure is -// still a valid test conclusion in the current coverage-first phase. -// ----------------------------------------------------------------------------- -module attributes {pto.target_arch = "a5", pto.kernel_kind = #pto.kernel_kind} { - func.func @vsubc_kernel_2d(%arg0: !pto.ptr, %arg1: !pto.ptr, - %arg2: !pto.ptr, %arg3: !pto.ptr) attributes {pto.kernel} { - %c0 = arith.constant 0 : index - %c0_i64 = arith.constant 0 : i64 - %c1_i64 = arith.constant 1 : i64 - %c128_i64 = arith.constant 128 : i64 - %c256_i64 = arith.constant 256 : i64 - %c4096_i64 = arith.constant 4096 : i64 - %c8192_i64 = arith.constant 8192 : i64 - %c12288_i64 = arith.constant 12288 : i64 - - %ub_lhs = pto.castptr %c0_i64 : i64 -> !pto.ptr - %ub_rhs = pto.castptr %c4096_i64 : i64 -> !pto.ptr - %ub_out = pto.castptr %c8192_i64 : i64 -> !pto.ptr - %ub_borrow = pto.castptr %c12288_i64 : i64 -> !pto.ptr - - %false = arith.constant false - pto.mte_gm_ub %arg0, %ub_lhs, %c0_i64, %c256_i64 - nburst(%c1_i64, %c256_i64, %c256_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 - pto.mte_gm_ub %arg1, %ub_rhs, %c0_i64, %c256_i64 - nburst(%c1_i64, %c256_i64, %c256_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 - - pto.set_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] - pto.wait_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] - - pto.vecscope { - %mask = pto.pset_b32 "PAT_ALL" : !pto.mask - %lhs = pto.vlds %ub_lhs[%c0] : !pto.ptr -> !pto.vreg<64xui32> - %rhs = pto.vlds %ub_rhs[%c0] : !pto.ptr -> !pto.vreg<64xui32> - %diff, %borrow = pto.vsubc %lhs, %rhs, %mask : !pto.vreg<64xui32>, 
!pto.vreg<64xui32>, !pto.mask -> !pto.vreg<64xui32>, !pto.mask - pto.vsts %diff, %ub_out[%c0], %mask : !pto.vreg<64xui32>, !pto.ptr, !pto.mask - pto.psti %borrow, %ub_borrow[%c0], "NORM" : !pto.mask, !pto.ptr, index - } - - pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.mte_ub_gm %ub_out, %arg2, %c256_i64 - nburst(%c1_i64, %c256_i64, %c256_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64 - pto.mte_ub_gm %ub_borrow, %arg3, %c128_i64 - nburst(%c1_i64, %c256_i64, %c256_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64 - pto.barrier #pto.pipe - return - } -} diff --git a/test/vpto/cases/micro-op/binary-vector/vsubc/launch.cpp b/test/vpto/cases/micro-op/binary-vector/vsubc/launch.cpp deleted file mode 100644 index 4f47cec25..000000000 --- a/test/vpto/cases/micro-op/binary-vector/vsubc/launch.cpp +++ /dev/null @@ -1,57 +0,0 @@ -// Copyright (c) 2026 Huawei Technologies Co., Ltd. -// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). -// Please refer to the License for details. You may not use this file except in compliance with the License. -// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -// See LICENSE in the root of the software repository for the full text of the License. - -// ----------------------------------------------------------------------------- -// case: micro-op/binary-vector/vsubc -// family: binary-vector -// target_ops: pto.vsubc -// scenarios: core-u32-unsigned, full-mask, carry-chain -// NOTE: bulk-generated coverage skeleton. Parser/verifier/lowering failure is -// still a valid test conclusion in the current coverage-first phase. 
-// ----------------------------------------------------------------------------- -#ifndef __VEC_SCOPE__ -#define __VEC_SCOPE__ -#endif - -#if defined(__CCE_AICORE__) && defined(__NPU_ARCH__) && (__NPU_ARCH__ == 2201) -typedef struct { unsigned char v; } hifloat8_t; -typedef struct { unsigned char v; } float8_e4m3_t; -typedef struct { unsigned char v; } float8_e5m2_t; -typedef struct { unsigned char v; } float8_e8m0_t; -typedef struct { unsigned char v; } float4_e1m2x2_t; -typedef struct { unsigned char v; } float4_e2m1x2_t; -#endif -#include -#if defined(__CCE_AICORE__) && defined(PTOAS_ENABLE_CCE_PRINT) -#include -#endif - -#if !defined(__CCE_AICORE__) && !defined(TMRGSORT_HPP) -struct MrgSortExecutedNumList { - uint16_t mrgSortList0; - uint16_t mrgSortList1; - uint16_t mrgSortList2; - uint16_t mrgSortList3; -}; -#endif -#ifndef __CPU_SIM -#include "acl/acl.h" -#endif - -extern "C" __global__ [aicore] void vsubc_kernel_2d(__gm__ uint32_t *v1, - __gm__ uint32_t *v2, - __gm__ uint32_t *v3, - __gm__ uint8_t *v4); - -void LaunchVsubc_kernel_2d(uint32_t *v1, uint32_t *v2, uint32_t *v3, - uint8_t *v4, void *stream) { - vsubc_kernel_2d<<<1, nullptr, stream>>>((__gm__ uint32_t *)v1, - (__gm__ uint32_t *)v2, - (__gm__ uint32_t *)v3, - (__gm__ uint8_t *)v4); -} diff --git a/test/vpto/cases/micro-op/binary-vector/vsubc/main.cpp b/test/vpto/cases/micro-op/binary-vector/vsubc/main.cpp deleted file mode 100644 index a553603b2..000000000 --- a/test/vpto/cases/micro-op/binary-vector/vsubc/main.cpp +++ /dev/null @@ -1,117 +0,0 @@ -// Copyright (c) 2026 Huawei Technologies Co., Ltd. -// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). -// Please refer to the License for details. You may not use this file except in compliance with the License. 
-// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -// See LICENSE in the root of the software repository for the full text of the License. - -// ----------------------------------------------------------------------------- -// case: micro-op/binary-vector/vsubc -// family: binary-vector -// target_ops: pto.vsubc -// scenarios: core-u32-unsigned, full-mask, carry-chain -// NOTE: bulk-generated coverage skeleton. Parser/verifier/lowering failure is -// still a valid test conclusion in the current coverage-first phase. -// ----------------------------------------------------------------------------- -/** -Copyright (c) 2025 Huawei Technologies Co., Ltd. -*/ - -#include "test_common.h" -#include "acl/acl.h" -#include -#include - -using namespace PtoTestCommon; - -#define ACL_CHECK(expr) \ - do { \ - const aclError _ret = (expr); \ - if (_ret != ACL_SUCCESS) { \ - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", #expr, \ - (int)_ret, __FILE__, __LINE__); \ - rc = 1; \ - goto cleanup; \ - } \ - } while (0) - -void LaunchVsubc_kernel_2d(uint32_t *v1, uint32_t *v2, uint32_t *v3, - uint8_t *v4, void *stream); - -int main() { - size_t elemCount_v1 = 64; - size_t fileSize_v1 = elemCount_v1 * sizeof(uint32_t); - size_t elemCount_v2 = 64; - size_t fileSize_v2 = elemCount_v2 * sizeof(uint32_t); - size_t elemCount_v3 = 64; - size_t fileSize_v3 = elemCount_v3 * sizeof(uint32_t); - size_t elemCount_v4 = 256; - size_t fileSize_v4 = elemCount_v4 * sizeof(uint8_t); - uint32_t *v1Host = nullptr; - uint32_t *v1Device = nullptr; - uint32_t *v2Host = nullptr; - uint32_t *v2Device = nullptr; - uint32_t *v3Host = nullptr; - uint32_t *v3Device = nullptr; - uint8_t *v4Host = nullptr; - uint8_t *v4Device = nullptr; - int rc = 0; - bool aclInited = false; - bool deviceSet = false; - int deviceId = 0; - aclrtStream stream = 
nullptr; - - ACL_CHECK(aclInit(nullptr)); - aclInited = true; - if (const char *envDevice = std::getenv("ACL_DEVICE_ID")) - deviceId = std::atoi(envDevice); - ACL_CHECK(aclrtSetDevice(deviceId)); - deviceSet = true; - ACL_CHECK(aclrtCreateStream(&stream)); - ACL_CHECK(aclrtMallocHost((void **)(&v1Host), fileSize_v1)); - ACL_CHECK(aclrtMallocHost((void **)(&v2Host), fileSize_v2)); - ACL_CHECK(aclrtMallocHost((void **)(&v3Host), fileSize_v3)); - ACL_CHECK(aclrtMallocHost((void **)(&v4Host), fileSize_v4)); - ACL_CHECK(aclrtMalloc((void **)&v1Device, fileSize_v1, ACL_MEM_MALLOC_HUGE_FIRST)); - ACL_CHECK(aclrtMalloc((void **)&v2Device, fileSize_v2, ACL_MEM_MALLOC_HUGE_FIRST)); - ACL_CHECK(aclrtMalloc((void **)&v3Device, fileSize_v3, ACL_MEM_MALLOC_HUGE_FIRST)); - ACL_CHECK(aclrtMalloc((void **)&v4Device, fileSize_v4, ACL_MEM_MALLOC_HUGE_FIRST)); - ReadFile("./v1.bin", fileSize_v1, v1Host, fileSize_v1); - ReadFile("./v2.bin", fileSize_v2, v2Host, fileSize_v2); - ReadFile("./v3.bin", fileSize_v3, v3Host, fileSize_v3); - ReadFile("./v4.bin", fileSize_v4, v4Host, fileSize_v4); - ACL_CHECK(aclrtMemcpy(v1Device, fileSize_v1, v1Host, fileSize_v1, - ACL_MEMCPY_HOST_TO_DEVICE)); - ACL_CHECK(aclrtMemcpy(v2Device, fileSize_v2, v2Host, fileSize_v2, - ACL_MEMCPY_HOST_TO_DEVICE)); - ACL_CHECK(aclrtMemcpy(v3Device, fileSize_v3, v3Host, fileSize_v3, - ACL_MEMCPY_HOST_TO_DEVICE)); - ACL_CHECK(aclrtMemcpy(v4Device, fileSize_v4, v4Host, fileSize_v4, - ACL_MEMCPY_HOST_TO_DEVICE)); - LaunchVsubc_kernel_2d(v1Device, v2Device, v3Device, v4Device, stream); - ACL_CHECK(aclrtSynchronizeStream(stream)); - ACL_CHECK(aclrtMemcpy(v3Host, fileSize_v3, v3Device, fileSize_v3, - ACL_MEMCPY_DEVICE_TO_HOST)); - ACL_CHECK(aclrtMemcpy(v4Host, fileSize_v4, v4Device, fileSize_v4, - ACL_MEMCPY_DEVICE_TO_HOST)); - WriteFile("./v3.bin", v3Host, fileSize_v3); - WriteFile("./v4.bin", v4Host, fileSize_v4); - -cleanup: - aclrtFree(v1Device); - aclrtFree(v2Device); - aclrtFree(v3Device); - aclrtFree(v4Device); - 
aclrtFreeHost(v1Host); - aclrtFreeHost(v2Host); - aclrtFreeHost(v3Host); - aclrtFreeHost(v4Host); - if (stream != nullptr) - aclrtDestroyStream(stream); - if (deviceSet) - aclrtResetDevice(deviceId); - if (aclInited) - aclFinalize(); - return rc; -} diff --git a/test/vpto/cases/micro-op/binary-vector/vxor-mask-edge/compare.py b/test/vpto/cases/micro-op/binary-vector/vxor-mask-edge/compare.py deleted file mode 100755 index e8a187bb2..000000000 --- a/test/vpto/cases/micro-op/binary-vector/vxor-mask-edge/compare.py +++ /dev/null @@ -1,44 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) 2026 Huawei Technologies Co., Ltd. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. - -# case: micro-op/binary-vector/vxor-mask-edge -# family: binary-vector -# target_ops: pto.vxor -# scenarios: core-i16-unsigned, full-mask -# NOTE: bulk-generated coverage skeleton. 
- -import os -import sys -import numpy as np - - -def compare_bin_prefix(golden_path, output_path, dtype, eps, count): - if not os.path.exists(golden_path) or not os.path.exists(output_path): - return False - golden = np.fromfile(golden_path, dtype=dtype, count=count) - output = np.fromfile(output_path, dtype=dtype, count=count) - return golden.shape == output.shape and np.allclose( - golden, output, atol=eps, rtol=eps, equal_nan=True - ) - - -def main(): - strict = os.getenv("COMPARE_STRICT", "1") != "0" - ok = compare_bin_prefix("golden_v3.bin", "v3.bin", np.uint16, 0, 1024) - if not ok: - if strict: - print("[ERROR] compare failed") - sys.exit(2) - print("[WARN] compare failed (non-gating)") - return - print("[INFO] compare passed") - - -if __name__ == "__main__": - main() diff --git a/test/vpto/cases/micro-op/binary-vector/vxor-mask-edge/golden.py b/test/vpto/cases/micro-op/binary-vector/vxor-mask-edge/golden.py deleted file mode 100755 index 0da3cc44d..000000000 --- a/test/vpto/cases/micro-op/binary-vector/vxor-mask-edge/golden.py +++ /dev/null @@ -1,50 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) 2026 Huawei Technologies Co., Ltd. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. - -# case: micro-op/binary-vector/vxor-mask-edge -# family: binary-vector -# target_ops: pto.vxor -# scenarios: core-i16-unsigned, full-mask -# NOTE: bulk-generated coverage skeleton. 
-# coding=utf-8 - -import argparse -from pathlib import Path - -import numpy as np - - -ELEMS = 1024 -SEED = 19 - - -def generate(output_dir: Path, seed: int) -> None: - idx = np.arange(ELEMS, dtype=np.uint16) - v1 = np.where((idx & 1) == 0, np.uint16(0xAAAA), np.uint16(0x0F0F)).astype(np.uint16, copy=False) - v2 = np.where((idx & 2) == 0, np.uint16(0x5555), np.uint16(0x3333)).astype(np.uint16, copy=False) - v3 = np.zeros(ELEMS, dtype=np.uint16) - golden_v3 = np.bitwise_xor(v1, v2).astype(np.uint16, copy=False) - - output_dir.mkdir(parents=True, exist_ok=True) - v1.tofile(output_dir / "v1.bin") - v2.tofile(output_dir / "v2.bin") - v3.tofile(output_dir / "v3.bin") - golden_v3.tofile(output_dir / "golden_v3.bin") - - -def main() -> None: - parser = argparse.ArgumentParser() - parser.add_argument("--output-dir", type=Path, default=Path(".")) - parser.add_argument("--seed", type=int, default=SEED) - args = parser.parse_args() - generate(args.output_dir, args.seed) - - -if __name__ == "__main__": - main() diff --git a/test/vpto/cases/micro-op/binary-vector/vxor-mask-edge/kernel.pto b/test/vpto/cases/micro-op/binary-vector/vxor-mask-edge/kernel.pto deleted file mode 100644 index 4fbed53a0..000000000 --- a/test/vpto/cases/micro-op/binary-vector/vxor-mask-edge/kernel.pto +++ /dev/null @@ -1,55 +0,0 @@ -// ----------------------------------------------------------------------------- -// case: micro-op/binary-vector/vxor-mask-edge -// family: binary-vector -// target_ops: pto.vxor -// scenarios: core-i16-unsigned, full-mask -// NOTE: bulk-generated coverage skeleton. Parser/verifier/lowering failure is -// still a valid test conclusion in the current coverage-first phase. 
-// ----------------------------------------------------------------------------- -module attributes {pto.target_arch = "a5", pto.kernel_kind = #pto.kernel_kind} { - func.func @vxor_mask_edge_kernel(%arg0: !pto.ptr, %arg1: !pto.ptr, - %arg2: !pto.ptr) attributes {pto.kernel} { - %c0 = arith.constant 0 : index - %c0_i64 = arith.constant 0 : i64 - %c1_i64 = arith.constant 1 : i64 - %c32_i64 = arith.constant 32 : i64 - %c64_i64 = arith.constant 64 : i64 - %c1024 = arith.constant 1024 : index - %c128 = arith.constant 128 : index - %c2048_i64 = arith.constant 2048 : i64 - %c4096_i64 = arith.constant 4096 : i64 - - %ub_lhs = pto.castptr %c0_i64 : i64 -> !pto.ptr - %ub_rhs = pto.castptr %c2048_i64 : i64 -> !pto.ptr - %ub_out = pto.castptr %c4096_i64 : i64 -> !pto.ptr - - %false = arith.constant false - pto.mte_gm_ub %arg0, %ub_lhs, %c0_i64, %c64_i64 - nburst(%c32_i64, %c64_i64, %c64_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 - pto.mte_gm_ub %arg1, %ub_rhs, %c0_i64, %c64_i64 - nburst(%c32_i64, %c64_i64, %c64_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 - - pto.set_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] - pto.wait_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] - - pto.vecscope { - %mask = pto.pset_b16 "PAT_ALL" : !pto.mask - scf.for %offset = %c0 to %c1024 step %c128 { - %lhs = pto.vlds %ub_lhs[%offset] : !pto.ptr -> !pto.vreg<128xui16> - %rhs = pto.vlds %ub_rhs[%offset] : !pto.ptr -> !pto.vreg<128xui16> - %out = pto.vxor %lhs, %rhs, %mask : !pto.vreg<128xui16>, !pto.vreg<128xui16>, !pto.mask -> !pto.vreg<128xui16> - pto.vsts %out, %ub_out[%offset], %mask : !pto.vreg<128xui16>, !pto.ptr, !pto.mask - } - } - - pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.mte_ub_gm %ub_out, %arg2, %c64_i64 - nburst(%c32_i64, %c64_i64, %c64_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64 - pto.barrier #pto.pipe - return - } -} diff --git a/test/vpto/cases/micro-op/binary-vector/vxor-mask-edge/launch.cpp 
b/test/vpto/cases/micro-op/binary-vector/vxor-mask-edge/launch.cpp deleted file mode 100644 index 309646298..000000000 --- a/test/vpto/cases/micro-op/binary-vector/vxor-mask-edge/launch.cpp +++ /dev/null @@ -1,55 +0,0 @@ -// Copyright (c) 2026 Huawei Technologies Co., Ltd. -// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). -// Please refer to the License for details. You may not use this file except in compliance with the License. -// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -// See LICENSE in the root of the software repository for the full text of the License. - -// ----------------------------------------------------------------------------- -// case: micro-op/binary-vector/vxor-mask-edge -// family: binary-vector -// target_ops: pto.vxor -// scenarios: core-i16-unsigned, full-mask -// NOTE: bulk-generated coverage skeleton. Parser/verifier/lowering failure is -// still a valid test conclusion in the current coverage-first phase. 
-// ----------------------------------------------------------------------------- -#ifndef __VEC_SCOPE__ -#define __VEC_SCOPE__ -#endif - -#if defined(__CCE_AICORE__) && defined(__NPU_ARCH__) && (__NPU_ARCH__ == 2201) -typedef struct { unsigned char v; } hifloat8_t; -typedef struct { unsigned char v; } float8_e4m3_t; -typedef struct { unsigned char v; } float8_e5m2_t; -typedef struct { unsigned char v; } float8_e8m0_t; -typedef struct { unsigned char v; } float4_e1m2x2_t; -typedef struct { unsigned char v; } float4_e2m1x2_t; -#endif -#include -#if defined(__CCE_AICORE__) && defined(PTOAS_ENABLE_CCE_PRINT) -#include -#endif - -#if !defined(__CCE_AICORE__) && !defined(TMRGSORT_HPP) -struct MrgSortExecutedNumList { - uint16_t mrgSortList0; - uint16_t mrgSortList1; - uint16_t mrgSortList2; - uint16_t mrgSortList3; -}; -#endif -#ifndef __CPU_SIM -#include "acl/acl.h" -#endif - -extern "C" __global__ [aicore] void vxor_mask_edge_kernel(__gm__ uint16_t *v1, - __gm__ uint16_t *v2, - __gm__ uint16_t *v3); - -void LaunchVxor_mask_edge_kernel(uint16_t *v1, uint16_t *v2, uint16_t *v3, - void *stream) { - vxor_mask_edge_kernel<<<1, nullptr, stream>>>((__gm__ uint16_t *)v1, - (__gm__ uint16_t *)v2, - (__gm__ uint16_t *)v3); -} diff --git a/test/vpto/cases/micro-op/binary-vector/vxor-mask-edge/main.cpp b/test/vpto/cases/micro-op/binary-vector/vxor-mask-edge/main.cpp deleted file mode 100644 index 95d478f5f..000000000 --- a/test/vpto/cases/micro-op/binary-vector/vxor-mask-edge/main.cpp +++ /dev/null @@ -1,103 +0,0 @@ -// Copyright (c) 2026 Huawei Technologies Co., Ltd. -// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). -// Please refer to the License for details. You may not use this file except in compliance with the License. 
-// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -// See LICENSE in the root of the software repository for the full text of the License. - -// ----------------------------------------------------------------------------- -// case: micro-op/binary-vector/vxor-mask-edge -// family: binary-vector -// target_ops: pto.vxor -// scenarios: core-i16-unsigned, full-mask -// NOTE: bulk-generated coverage skeleton. Parser/verifier/lowering failure is -// still a valid test conclusion in the current coverage-first phase. -// ----------------------------------------------------------------------------- -/** -Copyright (c) 2025 Huawei Technologies Co., Ltd. -*/ - -#include "test_common.h" -#include "acl/acl.h" -#include -#include - -using namespace PtoTestCommon; - -#define ACL_CHECK(expr) \ - do { \ - const aclError _ret = (expr); \ - if (_ret != ACL_SUCCESS) { \ - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", #expr, \ - (int)_ret, __FILE__, __LINE__); \ - rc = 1; \ - goto cleanup; \ - } \ - } while (0) - -void LaunchVxor_mask_edge_kernel(uint16_t *v1, uint16_t *v2, uint16_t *v3, - void *stream); - -int main() { - size_t elemCount_v1 = 1024; - size_t fileSize_v1 = elemCount_v1 * sizeof(uint16_t); - size_t elemCount_v2 = 1024; - size_t fileSize_v2 = elemCount_v2 * sizeof(uint16_t); - size_t elemCount_v3 = 1024; - size_t fileSize_v3 = elemCount_v3 * sizeof(uint16_t); - uint16_t *v1Host = nullptr; - uint16_t *v1Device = nullptr; - uint16_t *v2Host = nullptr; - uint16_t *v2Device = nullptr; - uint16_t *v3Host = nullptr; - uint16_t *v3Device = nullptr; - int rc = 0; - bool aclInited = false; - bool deviceSet = false; - int deviceId = 0; - aclrtStream stream = nullptr; - - ACL_CHECK(aclInit(nullptr)); - aclInited = true; - if (const char *envDevice = std::getenv("ACL_DEVICE_ID")) - deviceId = 
std::atoi(envDevice); - ACL_CHECK(aclrtSetDevice(deviceId)); - deviceSet = true; - ACL_CHECK(aclrtCreateStream(&stream)); - ACL_CHECK(aclrtMallocHost((void **)(&v1Host), fileSize_v1)); - ACL_CHECK(aclrtMallocHost((void **)(&v2Host), fileSize_v2)); - ACL_CHECK(aclrtMallocHost((void **)(&v3Host), fileSize_v3)); - ACL_CHECK(aclrtMalloc((void **)&v1Device, fileSize_v1, ACL_MEM_MALLOC_HUGE_FIRST)); - ACL_CHECK(aclrtMalloc((void **)&v2Device, fileSize_v2, ACL_MEM_MALLOC_HUGE_FIRST)); - ACL_CHECK(aclrtMalloc((void **)&v3Device, fileSize_v3, ACL_MEM_MALLOC_HUGE_FIRST)); - ReadFile("./v1.bin", fileSize_v1, v1Host, fileSize_v1); - ReadFile("./v2.bin", fileSize_v2, v2Host, fileSize_v2); - ReadFile("./v3.bin", fileSize_v3, v3Host, fileSize_v3); - ACL_CHECK(aclrtMemcpy(v1Device, fileSize_v1, v1Host, fileSize_v1, - ACL_MEMCPY_HOST_TO_DEVICE)); - ACL_CHECK(aclrtMemcpy(v2Device, fileSize_v2, v2Host, fileSize_v2, - ACL_MEMCPY_HOST_TO_DEVICE)); - ACL_CHECK(aclrtMemcpy(v3Device, fileSize_v3, v3Host, fileSize_v3, - ACL_MEMCPY_HOST_TO_DEVICE)); - LaunchVxor_mask_edge_kernel(v1Device, v2Device, v3Device, stream); - ACL_CHECK(aclrtSynchronizeStream(stream)); - ACL_CHECK(aclrtMemcpy(v3Host, fileSize_v3, v3Device, fileSize_v3, - ACL_MEMCPY_DEVICE_TO_HOST)); - WriteFile("./v3.bin", v3Host, fileSize_v3); - -cleanup: - aclrtFree(v1Device); - aclrtFree(v2Device); - aclrtFree(v3Device); - aclrtFreeHost(v1Host); - aclrtFreeHost(v2Host); - aclrtFreeHost(v3Host); - if (stream != nullptr) - aclrtDestroyStream(stream); - if (deviceSet) - aclrtResetDevice(deviceId); - if (aclInited) - aclFinalize(); - return rc; -} diff --git a/test/vpto/cases/micro-op/binary-vector/vxor/compare.py b/test/vpto/cases/micro-op/binary-vector/vxor/compare.py deleted file mode 100755 index cd8e1ce3e..000000000 --- a/test/vpto/cases/micro-op/binary-vector/vxor/compare.py +++ /dev/null @@ -1,44 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) 2026 Huawei Technologies Co., Ltd. 
-# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. - -# case: micro-op/binary-vector/vxor -# family: binary-vector -# target_ops: pto.vxor -# scenarios: core-i16-unsigned, full-mask -# NOTE: bulk-generated coverage skeleton. - -import os -import sys -import numpy as np - - -def compare_bin_prefix(golden_path, output_path, dtype, eps, count): - if not os.path.exists(golden_path) or not os.path.exists(output_path): - return False - golden = np.fromfile(golden_path, dtype=dtype, count=count) - output = np.fromfile(output_path, dtype=dtype, count=count) - return golden.shape == output.shape and np.allclose( - golden, output, atol=eps, rtol=eps, equal_nan=True - ) - - -def main(): - strict = os.getenv("COMPARE_STRICT", "1") != "0" - ok = compare_bin_prefix("golden_v3.bin", "v3.bin", np.uint16, 0, 1024) - if not ok: - if strict: - print("[ERROR] compare failed") - sys.exit(2) - print("[WARN] compare failed (non-gating)") - return - print("[INFO] compare passed") - - -if __name__ == "__main__": - main() diff --git a/test/vpto/cases/micro-op/binary-vector/vxor/golden.py b/test/vpto/cases/micro-op/binary-vector/vxor/golden.py deleted file mode 100755 index f5e328e08..000000000 --- a/test/vpto/cases/micro-op/binary-vector/vxor/golden.py +++ /dev/null @@ -1,50 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) 2026 Huawei Technologies Co., Ltd. 
-# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. - -# case: micro-op/binary-vector/vxor -# family: binary-vector -# target_ops: pto.vxor -# scenarios: core-i16-unsigned, full-mask -# NOTE: bulk-generated coverage skeleton. -# coding=utf-8 - -import argparse -from pathlib import Path - -import numpy as np - - -ELEMS = 1024 -SEED = 19 - - -def generate(output_dir: Path, seed: int) -> None: - rng = np.random.default_rng(seed) - v1 = rng.integers(0, 0x10000, size=ELEMS, dtype=np.uint16) - v2 = rng.integers(0, 0x10000, size=ELEMS, dtype=np.uint16) - v3 = np.zeros(ELEMS, dtype=np.uint16) - golden_v3 = np.bitwise_xor(v1, v2).astype(np.uint16, copy=False) - - output_dir.mkdir(parents=True, exist_ok=True) - v1.tofile(output_dir / "v1.bin") - v2.tofile(output_dir / "v2.bin") - v3.tofile(output_dir / "v3.bin") - golden_v3.tofile(output_dir / "golden_v3.bin") - - -def main() -> None: - parser = argparse.ArgumentParser() - parser.add_argument("--output-dir", type=Path, default=Path(".")) - parser.add_argument("--seed", type=int, default=SEED) - args = parser.parse_args() - generate(args.output_dir, args.seed) - - -if __name__ == "__main__": - main() diff --git a/test/vpto/cases/micro-op/binary-vector/vxor/kernel.pto b/test/vpto/cases/micro-op/binary-vector/vxor/kernel.pto deleted file mode 100644 index 6cd29199d..000000000 --- a/test/vpto/cases/micro-op/binary-vector/vxor/kernel.pto +++ /dev/null @@ -1,55 +0,0 @@ -// 
----------------------------------------------------------------------------- -// case: micro-op/binary-vector/vxor -// family: binary-vector -// target_ops: pto.vxor -// scenarios: core-i16-unsigned, full-mask -// NOTE: bulk-generated coverage skeleton. Parser/verifier/lowering failure is -// still a valid test conclusion in the current coverage-first phase. -// ----------------------------------------------------------------------------- -module attributes {pto.target_arch = "a5", pto.kernel_kind = #pto.kernel_kind} { - func.func @vxor_i16_unsigned_kernel(%arg0: !pto.ptr, %arg1: !pto.ptr, - %arg2: !pto.ptr) attributes {pto.kernel} { - %c0 = arith.constant 0 : index - %c1024 = arith.constant 1024 : index - %c128 = arith.constant 128 : index - %c0_i64 = arith.constant 0 : i64 - %c1_i64 = arith.constant 1 : i64 - %c32_i64 = arith.constant 32 : i64 - %c64_i64 = arith.constant 64 : i64 - %c2048_i64 = arith.constant 2048 : i64 - %c4096_i64 = arith.constant 4096 : i64 - - %ub_lhs = pto.castptr %c0_i64 : i64 -> !pto.ptr - %ub_rhs = pto.castptr %c2048_i64 : i64 -> !pto.ptr - %ub_out = pto.castptr %c4096_i64 : i64 -> !pto.ptr - - %false = arith.constant false - pto.mte_gm_ub %arg0, %ub_lhs, %c0_i64, %c64_i64 - nburst(%c32_i64, %c64_i64, %c64_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 - pto.mte_gm_ub %arg1, %ub_rhs, %c0_i64, %c64_i64 - nburst(%c32_i64, %c64_i64, %c64_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 - - pto.set_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] - pto.wait_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] - - pto.vecscope { - %mask = pto.pset_b16 "PAT_ALL" : !pto.mask - scf.for %offset = %c0 to %c1024 step %c128 { - %lhs = pto.vlds %ub_lhs[%offset] : !pto.ptr -> !pto.vreg<128xui16> - %rhs = pto.vlds %ub_rhs[%offset] : !pto.ptr -> !pto.vreg<128xui16> - %out = pto.vxor %lhs, %rhs, %mask : !pto.vreg<128xui16>, !pto.vreg<128xui16>, !pto.mask -> !pto.vreg<128xui16> - pto.vsts %out, %ub_out[%offset], %mask : !pto.vreg<128xui16>, !pto.ptr, !pto.mask - } 
- } - - pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.mte_ub_gm %ub_out, %arg2, %c64_i64 - nburst(%c32_i64, %c64_i64, %c64_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64 - pto.barrier #pto.pipe - return - } -} diff --git a/test/vpto/cases/micro-op/binary-vector/vxor/launch.cpp b/test/vpto/cases/micro-op/binary-vector/vxor/launch.cpp deleted file mode 100644 index 59d1c049b..000000000 --- a/test/vpto/cases/micro-op/binary-vector/vxor/launch.cpp +++ /dev/null @@ -1,55 +0,0 @@ -// Copyright (c) 2026 Huawei Technologies Co., Ltd. -// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). -// Please refer to the License for details. You may not use this file except in compliance with the License. -// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -// See LICENSE in the root of the software repository for the full text of the License. - -// ----------------------------------------------------------------------------- -// case: micro-op/binary-vector/vxor -// family: binary-vector -// target_ops: pto.vxor -// scenarios: core-i16-unsigned, full-mask -// NOTE: bulk-generated coverage skeleton. Parser/verifier/lowering failure is -// still a valid test conclusion in the current coverage-first phase. 
-// ----------------------------------------------------------------------------- -#ifndef __VEC_SCOPE__ -#define __VEC_SCOPE__ -#endif - -#if defined(__CCE_AICORE__) && defined(__NPU_ARCH__) && (__NPU_ARCH__ == 2201) -typedef struct { unsigned char v; } hifloat8_t; -typedef struct { unsigned char v; } float8_e4m3_t; -typedef struct { unsigned char v; } float8_e5m2_t; -typedef struct { unsigned char v; } float8_e8m0_t; -typedef struct { unsigned char v; } float4_e1m2x2_t; -typedef struct { unsigned char v; } float4_e2m1x2_t; -#endif -#include -#if defined(__CCE_AICORE__) && defined(PTOAS_ENABLE_CCE_PRINT) -#include -#endif - -#if !defined(__CCE_AICORE__) && !defined(TMRGSORT_HPP) -struct MrgSortExecutedNumList { - uint16_t mrgSortList0; - uint16_t mrgSortList1; - uint16_t mrgSortList2; - uint16_t mrgSortList3; -}; -#endif -#ifndef __CPU_SIM -#include "acl/acl.h" -#endif - -extern "C" __global__ [aicore] void vxor_i16_unsigned_kernel(__gm__ uint16_t *v1, - __gm__ uint16_t *v2, - __gm__ uint16_t *v3); - -void LaunchVxor_i16_unsigned_kernel(uint16_t *v1, uint16_t *v2, uint16_t *v3, - void *stream) { - vxor_i16_unsigned_kernel<<<1, nullptr, stream>>>((__gm__ uint16_t *)v1, - (__gm__ uint16_t *)v2, - (__gm__ uint16_t *)v3); -} diff --git a/test/vpto/cases/micro-op/binary-vector/vxor/main.cpp b/test/vpto/cases/micro-op/binary-vector/vxor/main.cpp deleted file mode 100644 index 99f4a9d98..000000000 --- a/test/vpto/cases/micro-op/binary-vector/vxor/main.cpp +++ /dev/null @@ -1,103 +0,0 @@ -// Copyright (c) 2026 Huawei Technologies Co., Ltd. -// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). -// Please refer to the License for details. You may not use this file except in compliance with the License. 
-// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -// See LICENSE in the root of the software repository for the full text of the License. - -// ----------------------------------------------------------------------------- -// case: micro-op/binary-vector/vxor -// family: binary-vector -// target_ops: pto.vxor -// scenarios: core-i16-unsigned, full-mask -// NOTE: bulk-generated coverage skeleton. Parser/verifier/lowering failure is -// still a valid test conclusion in the current coverage-first phase. -// ----------------------------------------------------------------------------- -/** -Copyright (c) 2025 Huawei Technologies Co., Ltd. -*/ - -#include "test_common.h" -#include "acl/acl.h" -#include -#include - -using namespace PtoTestCommon; - -#define ACL_CHECK(expr) \ - do { \ - const aclError _ret = (expr); \ - if (_ret != ACL_SUCCESS) { \ - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", #expr, \ - (int)_ret, __FILE__, __LINE__); \ - rc = 1; \ - goto cleanup; \ - } \ - } while (0) - -void LaunchVxor_i16_unsigned_kernel(uint16_t *v1, uint16_t *v2, uint16_t *v3, - void *stream); - -int main() { - size_t elemCount_v1 = 1024; - size_t fileSize_v1 = elemCount_v1 * sizeof(uint16_t); - size_t elemCount_v2 = 1024; - size_t fileSize_v2 = elemCount_v2 * sizeof(uint16_t); - size_t elemCount_v3 = 1024; - size_t fileSize_v3 = elemCount_v3 * sizeof(uint16_t); - uint16_t *v1Host = nullptr; - uint16_t *v1Device = nullptr; - uint16_t *v2Host = nullptr; - uint16_t *v2Device = nullptr; - uint16_t *v3Host = nullptr; - uint16_t *v3Device = nullptr; - int rc = 0; - bool aclInited = false; - bool deviceSet = false; - int deviceId = 0; - aclrtStream stream = nullptr; - - ACL_CHECK(aclInit(nullptr)); - aclInited = true; - if (const char *envDevice = std::getenv("ACL_DEVICE_ID")) - deviceId = std::atoi(envDevice); - 
ACL_CHECK(aclrtSetDevice(deviceId)); - deviceSet = true; - ACL_CHECK(aclrtCreateStream(&stream)); - ACL_CHECK(aclrtMallocHost((void **)(&v1Host), fileSize_v1)); - ACL_CHECK(aclrtMallocHost((void **)(&v2Host), fileSize_v2)); - ACL_CHECK(aclrtMallocHost((void **)(&v3Host), fileSize_v3)); - ACL_CHECK(aclrtMalloc((void **)&v1Device, fileSize_v1, ACL_MEM_MALLOC_HUGE_FIRST)); - ACL_CHECK(aclrtMalloc((void **)&v2Device, fileSize_v2, ACL_MEM_MALLOC_HUGE_FIRST)); - ACL_CHECK(aclrtMalloc((void **)&v3Device, fileSize_v3, ACL_MEM_MALLOC_HUGE_FIRST)); - ReadFile("./v1.bin", fileSize_v1, v1Host, fileSize_v1); - ReadFile("./v2.bin", fileSize_v2, v2Host, fileSize_v2); - ReadFile("./v3.bin", fileSize_v3, v3Host, fileSize_v3); - ACL_CHECK(aclrtMemcpy(v1Device, fileSize_v1, v1Host, fileSize_v1, - ACL_MEMCPY_HOST_TO_DEVICE)); - ACL_CHECK(aclrtMemcpy(v2Device, fileSize_v2, v2Host, fileSize_v2, - ACL_MEMCPY_HOST_TO_DEVICE)); - ACL_CHECK(aclrtMemcpy(v3Device, fileSize_v3, v3Host, fileSize_v3, - ACL_MEMCPY_HOST_TO_DEVICE)); - LaunchVxor_i16_unsigned_kernel(v1Device, v2Device, v3Device, stream); - ACL_CHECK(aclrtSynchronizeStream(stream)); - ACL_CHECK(aclrtMemcpy(v3Host, fileSize_v3, v3Device, fileSize_v3, - ACL_MEMCPY_DEVICE_TO_HOST)); - WriteFile("./v3.bin", v3Host, fileSize_v3); - -cleanup: - aclrtFree(v1Device); - aclrtFree(v2Device); - aclrtFree(v3Device); - aclrtFreeHost(v1Host); - aclrtFreeHost(v2Host); - aclrtFreeHost(v3Host); - if (stream != nullptr) - aclrtDestroyStream(stream); - if (deviceSet) - aclrtResetDevice(deviceId); - if (aclInited) - aclFinalize(); - return rc; -} diff --git a/test/vpto/cases/micro-op/compare-select/vcmp-eq/kernel.pto b/test/vpto/cases/micro-op/compare-select/vcmp-eq/kernel.pto index b92a55f8d..9a4b18899 100644 --- a/test/vpto/cases/micro-op/compare-select/vcmp-eq/kernel.pto +++ b/test/vpto/cases/micro-op/compare-select/vcmp-eq/kernel.pto @@ -46,6 +46,157 @@ module attributes {pto.target_arch = "a5", pto.kernel_kind = #pto.kernel_kind, !pto.ptr, i64, 
i64, i64, i64 pto.barrier #pto.pipe + %__case_merge_guard = arith.constant false + + // inactive cross-case merge from micro-op/compare-select/vcmp-f32-exceptional + scf.if %__case_merge_guard { + + %c0_cmg3_1 = arith.constant 0 : index + %c1_cmg3_1 = arith.constant 1 : index + %c0_i64_cmg3_1 = arith.constant 0 : i64 + %c1_i64_cmg3_1 = arith.constant 1 : i64 + %c32_i64_cmg3_1 = arith.constant 32 : i64 + %c64_i32_cmg3_1 = arith.constant 64 : i32 + %c64_i64_cmg3_1 = arith.constant 64 : i64 + %c256_i64_cmg3_1 = arith.constant 256 : i64 + %c512_i64_cmg3_1 = arith.constant 512 : i64 + %false_cmg3_1 = arith.constant false + + %ub_lhs_cmg3_1 = pto.castptr %c0_i64_cmg3_1 : i64 -> !pto.ptr + %ub_rhs_cmg3_1 = pto.castptr %c256_i64_cmg3_1 : i64 -> !pto.ptr + %ub_out_cmg3_1 = pto.castptr %c512_i64_cmg3_1 : i64 -> !pto.ptr + pto.set_loop1_stride_outtoub %c256_i64_cmg3_1, %c256_i64_cmg3_1 : i64, i64 + pto.set_loop2_stride_outtoub %c256_i64_cmg3_1, %c256_i64_cmg3_1 : i64, i64 + pto.mte_gm_ub %arg0, %ub_lhs_cmg3_1, %c0_i64_cmg3_1, %c256_i64_cmg3_1 + nburst(%c64_i64_cmg3_1, %c256_i64_cmg3_1, %c256_i64_cmg3_1) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 + pto.mte_gm_ub %arg1, %ub_rhs_cmg3_1, %c0_i64_cmg3_1, %c256_i64_cmg3_1 + nburst(%c64_i64_cmg3_1, %c256_i64_cmg3_1, %c256_i64_cmg3_1) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 + + pto.set_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] + pto.wait_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] + + pto.vecscope { + %__cmg3_1:1 = scf.for %iv_cmg3_1 = %c0_cmg3_1 to %c1_cmg3_1 step %c1_cmg3_1 iter_args(%remaining_cmg3_1 = %c64_i32_cmg3_1) -> (i32) { + %active_cmg3_1, %next_cmg3_1 = pto.plt_b32 %remaining_cmg3_1 : i32 -> !pto.mask, i32 + %lhs_cmg3_1 = pto.vlds %ub_lhs_cmg3_1[%c0_cmg3_1] : !pto.ptr -> !pto.vreg<64xf32> + %rhs_cmg3_1 = pto.vlds %ub_rhs_cmg3_1[%c0_cmg3_1] : !pto.ptr -> !pto.vreg<64xf32> + %pred_cmg3_1 = pto.vcmp %lhs_cmg3_1, %rhs_cmg3_1, %active_cmg3_1, "lt" : !pto.vreg<64xf32>, !pto.vreg<64xf32>, !pto.mask -> !pto.mask + 
pto.psts %pred_cmg3_1, %ub_out_cmg3_1[%c0_cmg3_1], "NORM" : !pto.mask, !pto.ptr, index + scf.yield %next_cmg3_1 : i32 + } + } + + pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] + pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] + pto.set_loop1_stride_ubtoout %c32_i64_cmg3_1, %c32_i64_cmg3_1 : i64, i64 + pto.set_loop2_stride_ubtoout %c32_i64_cmg3_1, %c32_i64_cmg3_1 : i64, i64 + pto.mte_ub_gm %ub_out_cmg3_1, %arg2, %c32_i64_cmg3_1 + nburst(%c32_i64_cmg3_1, %c32_i64_cmg3_1, %c32_i64_cmg3_1) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64 + pto.barrier #pto.pipe + } + + // inactive cross-case merge from micro-op/compare-select/vcmp-lt + scf.if %__case_merge_guard { + + %c0_cmg3_2 = arith.constant 0 : index + %c1_cmg3_2 = arith.constant 1 : index + %c0_i64_cmg3_2 = arith.constant 0 : i64 + %c1_i64_cmg3_2 = arith.constant 1 : i64 + %c32_i64_cmg3_2 = arith.constant 32 : i64 + %c64_i32_cmg3_2 = arith.constant 64 : i32 + %c64_i64_cmg3_2 = arith.constant 64 : i64 + %c256_i64_cmg3_2 = arith.constant 256 : i64 + %c512_i64_cmg3_2 = arith.constant 512 : i64 + %false_cmg3_2 = arith.constant false + + %ub_lhs_cmg3_2 = pto.castptr %c0_i64_cmg3_2 : i64 -> !pto.ptr + %ub_rhs_cmg3_2 = pto.castptr %c256_i64_cmg3_2 : i64 -> !pto.ptr + %ub_out_cmg3_2 = pto.castptr %c512_i64_cmg3_2 : i64 -> !pto.ptr + pto.set_loop1_stride_outtoub %c256_i64_cmg3_2, %c256_i64_cmg3_2 : i64, i64 + pto.set_loop2_stride_outtoub %c256_i64_cmg3_2, %c256_i64_cmg3_2 : i64, i64 + pto.mte_gm_ub %arg0, %ub_lhs_cmg3_2, %c0_i64_cmg3_2, %c256_i64_cmg3_2 + nburst(%c64_i64_cmg3_2, %c256_i64_cmg3_2, %c256_i64_cmg3_2) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 + pto.mte_gm_ub %arg1, %ub_rhs_cmg3_2, %c0_i64_cmg3_2, %c256_i64_cmg3_2 + nburst(%c64_i64_cmg3_2, %c256_i64_cmg3_2, %c256_i64_cmg3_2) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 + + pto.set_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] + pto.wait_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] + + pto.vecscope { + %__cmg3_2:1 = scf.for %iv_cmg3_2 = %c0_cmg3_2 to %c1_cmg3_2 
step %c1_cmg3_2 iter_args(%remaining_cmg3_2 = %c64_i32_cmg3_2) -> (i32) { + %active_cmg3_2, %next_cmg3_2 = pto.plt_b32 %remaining_cmg3_2 : i32 -> !pto.mask, i32 + %lhs_cmg3_2 = pto.vlds %ub_lhs_cmg3_2[%c0_cmg3_2] : !pto.ptr -> !pto.vreg<64xf32> + %rhs_cmg3_2 = pto.vlds %ub_rhs_cmg3_2[%c0_cmg3_2] : !pto.ptr -> !pto.vreg<64xf32> + %pred_cmg3_2 = pto.vcmp %lhs_cmg3_2, %rhs_cmg3_2, %active_cmg3_2, "lt" : !pto.vreg<64xf32>, !pto.vreg<64xf32>, !pto.mask -> !pto.mask + pto.psts %pred_cmg3_2, %ub_out_cmg3_2[%c0_cmg3_2], "NORM" : !pto.mask, !pto.ptr, index + scf.yield %next_cmg3_2 : i32 + } + } + + pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] + pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] + pto.set_loop1_stride_ubtoout %c32_i64_cmg3_2, %c32_i64_cmg3_2 : i64, i64 + pto.set_loop2_stride_ubtoout %c32_i64_cmg3_2, %c32_i64_cmg3_2 : i64, i64 + pto.mte_ub_gm %ub_out_cmg3_2, %arg2, %c32_i64_cmg3_2 + nburst(%c32_i64_cmg3_2, %c32_i64_cmg3_2, %c32_i64_cmg3_2) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64 + pto.barrier #pto.pipe + } + + // inactive cross-case merge from micro-op/compare-select/vcmp-tail + scf.if %__case_merge_guard { + + %c0_cmg3_3 = arith.constant 0 : index + %c1_cmg3_3 = arith.constant 1 : index + %c0_i64_cmg3_3 = arith.constant 0 : i64 + %c1_i64_cmg3_3 = arith.constant 1 : i64 + %c32_i64_cmg3_3 = arith.constant 32 : i64 + %c53_i32_cmg3_3 = arith.constant 53 : i32 + %c64_i64_cmg3_3 = arith.constant 64 : i64 + %c256_i64_cmg3_3 = arith.constant 256 : i64 + %c512_i64_cmg3_3 = arith.constant 512 : i64 + %false_cmg3_3 = arith.constant false + + %ub_lhs_cmg3_3 = pto.castptr %c0_i64_cmg3_3 : i64 -> !pto.ptr + %ub_rhs_cmg3_3 = pto.castptr %c256_i64_cmg3_3 : i64 -> !pto.ptr + %ub_out_cmg3_3 = pto.castptr %c512_i64_cmg3_3 : i64 -> !pto.ptr + pto.set_loop1_stride_outtoub %c256_i64_cmg3_3, %c256_i64_cmg3_3 : i64, i64 + pto.set_loop2_stride_outtoub %c256_i64_cmg3_3, %c256_i64_cmg3_3 : i64, i64 + pto.mte_gm_ub %arg0, %ub_lhs_cmg3_3, %c0_i64_cmg3_3, %c256_i64_cmg3_3 + 
nburst(%c64_i64_cmg3_3, %c256_i64_cmg3_3, %c256_i64_cmg3_3) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 + pto.mte_gm_ub %arg1, %ub_rhs_cmg3_3, %c0_i64_cmg3_3, %c256_i64_cmg3_3 + nburst(%c64_i64_cmg3_3, %c256_i64_cmg3_3, %c256_i64_cmg3_3) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 + + pto.set_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] + pto.wait_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] + + pto.vecscope { + %__cmg3_3:1 = scf.for %iv_cmg3_3 = %c0_cmg3_3 to %c1_cmg3_3 step %c1_cmg3_3 iter_args(%remaining_cmg3_3 = %c53_i32_cmg3_3) -> (i32) { + %active_cmg3_3, %next_cmg3_3 = pto.plt_b32 %remaining_cmg3_3 : i32 -> !pto.mask, i32 + %lhs_cmg3_3 = pto.vlds %ub_lhs_cmg3_3[%c0_cmg3_3] : !pto.ptr -> !pto.vreg<64xf32> + %rhs_cmg3_3 = pto.vlds %ub_rhs_cmg3_3[%c0_cmg3_3] : !pto.ptr -> !pto.vreg<64xf32> + %pred_cmg3_3 = pto.vcmp %lhs_cmg3_3, %rhs_cmg3_3, %active_cmg3_3, "lt" : !pto.vreg<64xf32>, !pto.vreg<64xf32>, !pto.mask -> !pto.mask + pto.psts %pred_cmg3_3, %ub_out_cmg3_3[%c0_cmg3_3], "NORM" : !pto.mask, !pto.ptr, index + scf.yield %next_cmg3_3 : i32 + } + } + + pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] + pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] + pto.set_loop1_stride_ubtoout %c32_i64_cmg3_3, %c32_i64_cmg3_3 : i64, i64 + pto.set_loop2_stride_ubtoout %c32_i64_cmg3_3, %c32_i64_cmg3_3 : i64, i64 + pto.mte_ub_gm %ub_out_cmg3_3, %arg2, %c32_i64_cmg3_3 + nburst(%c32_i64_cmg3_3, %c32_i64_cmg3_3, %c32_i64_cmg3_3) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64 + pto.barrier #pto.pipe + } return } } diff --git a/test/vpto/cases/micro-op/compare-select/vcmp-f32-exceptional/compare.py b/test/vpto/cases/micro-op/compare-select/vcmp-f32-exceptional/compare.py deleted file mode 100644 index a872552e3..000000000 --- a/test/vpto/cases/micro-op/compare-select/vcmp-f32-exceptional/compare.py +++ /dev/null @@ -1,35 +0,0 @@ -#!/usr/bin/python3 -# Copyright (c) 2026 Huawei Technologies Co., Ltd. 
-# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. - - -import os -import sys -import numpy as np - - -def main(): - strict = os.getenv("COMPARE_STRICT", "1") != "0" - golden = np.fromfile("golden_v3.bin", dtype=np.uint8) - output = np.fromfile("v3.bin", dtype=np.uint8) - ok = golden.size >= 32 and output.size >= 32 and np.array_equal(golden[:32], output[:32]) - if not ok: - if golden.size and output.size: - diff = np.nonzero(golden[:32] != output[:32])[0] - idx = int(diff[0]) if diff.size else 0 - print(f"[ERROR] Mismatch: idx={idx} golden={int(golden[idx])} out={int(output[idx])}") - if strict: - print("[ERROR] compare failed") - sys.exit(2) - print("[WARN] compare failed (non-gating)") - return - print("[INFO] compare passed") - - -if __name__ == "__main__": - main() diff --git a/test/vpto/cases/micro-op/compare-select/vcmp-f32-exceptional/golden.py b/test/vpto/cases/micro-op/compare-select/vcmp-f32-exceptional/golden.py deleted file mode 100644 index 3d3dbb4d8..000000000 --- a/test/vpto/cases/micro-op/compare-select/vcmp-f32-exceptional/golden.py +++ /dev/null @@ -1,62 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) 2026 Huawei Technologies Co., Ltd. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. 
-# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. - - -import argparse -from pathlib import Path - -import numpy as np - - -LANES = 64 -SEED = 19 -OUTPUT_BYTES = 32 - - -def encode_b32_mask(mask: np.ndarray) -> np.ndarray: - out = np.zeros((OUTPUT_BYTES,), dtype=np.uint8) - for i, bit in enumerate(mask.astype(np.uint8, copy=False)): - if bit: - byte_index = i // 2 - nibble_shift = 4 * (i % 2) - out[byte_index] |= np.uint8(1 << nibble_shift) - return out - - -def generate(output_dir: Path, seed: int) -> None: - del seed - lhs = np.array( - [-np.inf, -3.0, -0.0, 0.0, 0.5, np.inf, np.nan, 1.0], - dtype=np.float32, - ) - rhs = np.array( - [np.inf, -2.0, 0.0, -0.0, 0.5, np.nan, 1.0, -np.inf], - dtype=np.float32, - ) - v1 = np.resize(lhs, LANES).astype(np.float32) - v2 = np.resize(rhs, LANES).astype(np.float32) - mask = np.less(v1, v2) - - output_dir.mkdir(parents=True, exist_ok=True) - v1.tofile(output_dir / "v1.bin") - v2.tofile(output_dir / "v2.bin") - np.zeros((OUTPUT_BYTES,), dtype=np.uint8).tofile(output_dir / "v3.bin") - encode_b32_mask(mask).tofile(output_dir / "golden_v3.bin") - - -def main() -> None: - parser = argparse.ArgumentParser(description="Generate inputs/golden for VPTO vcmp-f32-exceptional.") - parser.add_argument("--output-dir", type=Path, default=Path(".")) - parser.add_argument("--seed", type=int, default=SEED) - args = parser.parse_args() - generate(args.output_dir, args.seed) - - -if __name__ == "__main__": - main() diff --git a/test/vpto/cases/micro-op/compare-select/vcmp-f32-exceptional/kernel.pto b/test/vpto/cases/micro-op/compare-select/vcmp-f32-exceptional/kernel.pto deleted file mode 100644 index 6f31f7cb8..000000000 --- a/test/vpto/cases/micro-op/compare-select/vcmp-f32-exceptional/kernel.pto 
+++ /dev/null @@ -1,51 +0,0 @@ -module attributes {pto.target_arch = "a5", pto.kernel_kind = #pto.kernel_kind} { - func.func @vcmp_f32_exceptional_kernel_2d(%arg0: !pto.ptr, %arg1: !pto.ptr, - %arg2: !pto.ptr) attributes {pto.kernel} { - %c0 = arith.constant 0 : index - %c1 = arith.constant 1 : index - %c0_i64 = arith.constant 0 : i64 - %c1_i64 = arith.constant 1 : i64 - %c32_i64 = arith.constant 32 : i64 - %c64_i32 = arith.constant 64 : i32 - %c64_i64 = arith.constant 64 : i64 - %c256_i64 = arith.constant 256 : i64 - %c512_i64 = arith.constant 512 : i64 - %false = arith.constant false - - %ub_lhs = pto.castptr %c0_i64 : i64 -> !pto.ptr - %ub_rhs = pto.castptr %c256_i64 : i64 -> !pto.ptr - %ub_out = pto.castptr %c512_i64 : i64 -> !pto.ptr - pto.set_loop1_stride_outtoub %c256_i64, %c256_i64 : i64, i64 - pto.set_loop2_stride_outtoub %c256_i64, %c256_i64 : i64, i64 - pto.mte_gm_ub %arg0, %ub_lhs, %c0_i64, %c256_i64 - nburst(%c64_i64, %c256_i64, %c256_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 - pto.mte_gm_ub %arg1, %ub_rhs, %c0_i64, %c256_i64 - nburst(%c64_i64, %c256_i64, %c256_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 - - pto.set_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] - pto.wait_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] - - pto.vecscope { - %_:1 = scf.for %iv = %c0 to %c1 step %c1 iter_args(%remaining = %c64_i32) -> (i32) { - %active, %next = pto.plt_b32 %remaining : i32 -> !pto.mask, i32 - %lhs = pto.vlds %ub_lhs[%c0] : !pto.ptr -> !pto.vreg<64xf32> - %rhs = pto.vlds %ub_rhs[%c0] : !pto.ptr -> !pto.vreg<64xf32> - %pred = pto.vcmp %lhs, %rhs, %active, "lt" : !pto.vreg<64xf32>, !pto.vreg<64xf32>, !pto.mask -> !pto.mask - pto.psts %pred, %ub_out[%c0], "NORM" : !pto.mask, !pto.ptr, index - scf.yield %next : i32 - } - } - - pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.set_loop1_stride_ubtoout %c32_i64, %c32_i64 : i64, i64 - pto.set_loop2_stride_ubtoout %c32_i64, %c32_i64 : i64, i64 - 
pto.mte_ub_gm %ub_out, %arg2, %c32_i64 - nburst(%c32_i64, %c32_i64, %c32_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64 - pto.barrier #pto.pipe - return - } -} diff --git a/test/vpto/cases/micro-op/compare-select/vcmp-f32-exceptional/launch.cpp b/test/vpto/cases/micro-op/compare-select/vcmp-f32-exceptional/launch.cpp deleted file mode 100644 index 97d79d2fd..000000000 --- a/test/vpto/cases/micro-op/compare-select/vcmp-f32-exceptional/launch.cpp +++ /dev/null @@ -1,50 +0,0 @@ -// Copyright (c) 2026 Huawei Technologies Co., Ltd. -// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). -// Please refer to the License for details. You may not use this file except in compliance with the License. -// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -// See LICENSE in the root of the software repository for the full text of the License. 
- -// --------------------------------------------------------------------------- -// PTOAS compatibility layer -// --------------------------------------------------------------------------- -#ifndef __VEC_SCOPE__ -#define __VEC_SCOPE__ -#endif - -#if defined(__CCE_AICORE__) && defined(__NPU_ARCH__) && (__NPU_ARCH__ == 2201) -typedef struct { unsigned char v; } hifloat8_t; -typedef struct { unsigned char v; } float8_e4m3_t; -typedef struct { unsigned char v; } float8_e5m2_t; -typedef struct { unsigned char v; } float8_e8m0_t; -typedef struct { unsigned char v; } float4_e1m2x2_t; -typedef struct { unsigned char v; } float4_e2m1x2_t; -#endif -#include - -#if defined(__CCE_AICORE__) && defined(PTOAS_ENABLE_CCE_PRINT) -#include -#endif - -#if !defined(__CCE_AICORE__) && !defined(TMRGSORT_HPP) -struct MrgSortExecutedNumList { - uint16_t mrgSortList0; - uint16_t mrgSortList1; - uint16_t mrgSortList2; - uint16_t mrgSortList3; -}; -#endif -#ifndef __CPU_SIM -#include "acl/acl.h" -#endif - -extern "C" __global__ [aicore] void vcmp_f32_exceptional_kernel_2d(__gm__ float *v1, - __gm__ float *v2, - __gm__ unsigned char *v3); - -void LaunchVcmp_f32_exceptional_kernel_2d(float *v1, float *v2, - unsigned char *v3, void *stream) { - vcmp_f32_exceptional_kernel_2d<<<1, nullptr, stream>>>( - (__gm__ float *)v1, (__gm__ float *)v2, (__gm__ unsigned char *)v3); -} diff --git a/test/vpto/cases/micro-op/compare-select/vcmp-f32-exceptional/main.cpp b/test/vpto/cases/micro-op/compare-select/vcmp-f32-exceptional/main.cpp deleted file mode 100644 index b8b92375f..000000000 --- a/test/vpto/cases/micro-op/compare-select/vcmp-f32-exceptional/main.cpp +++ /dev/null @@ -1,103 +0,0 @@ -// Copyright (c) 2026 Huawei Technologies Co., Ltd. -// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). -// Please refer to the License for details. 
You may not use this file except in compliance with the License. -// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -// See LICENSE in the root of the software repository for the full text of the License. - -/** -Copyright (c) 2025 Huawei Technologies Co., Ltd. -*/ - -#include "test_common.h" -#include "acl/acl.h" -#include -#include - -using namespace PtoTestCommon; - -#define ACL_CHECK(expr) \ - do { \ - const aclError _ret = (expr); \ - if (_ret != ACL_SUCCESS) { \ - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", #expr, \ - (int)_ret, __FILE__, __LINE__); \ - const char *_recent = aclGetRecentErrMsg(); \ - if (_recent != nullptr && _recent[0] != '\0') \ - std::fprintf(stderr, "[ERROR] RecentErrMsg: %s\n", _recent); \ - rc = 1; \ - goto cleanup; \ - } \ - } while (0) - -void LaunchVcmp_f32_exceptional_kernel_2d(float *v1, float *v2, - unsigned char *v3, void *stream); - -int main() { - size_t elemCount_v1 = 64; - size_t fileSize_v1 = elemCount_v1 * sizeof(float); - size_t elemCount_v2 = 64; - size_t fileSize_v2 = elemCount_v2 * sizeof(float); - size_t elemCount_v3 = 32; - size_t fileSize_v3 = elemCount_v3 * sizeof(unsigned char); - float *v1Host = nullptr; - float *v1Device = nullptr; - float *v2Host = nullptr; - float *v2Device = nullptr; - unsigned char *v3Host = nullptr; - unsigned char *v3Device = nullptr; - - int rc = 0; - bool aclInited = false; - bool deviceSet = false; - int deviceId = 0; - aclrtStream stream = nullptr; - - ACL_CHECK(aclInit(nullptr)); - aclInited = true; - if (const char *envDevice = std::getenv("ACL_DEVICE_ID")) - deviceId = std::atoi(envDevice); - ACL_CHECK(aclrtSetDevice(deviceId)); - deviceSet = true; - ACL_CHECK(aclrtCreateStream(&stream)); - - ACL_CHECK(aclrtMallocHost((void **)(&v1Host), fileSize_v1)); - ACL_CHECK(aclrtMallocHost((void **)(&v2Host), 
fileSize_v2)); - ACL_CHECK(aclrtMallocHost((void **)(&v3Host), fileSize_v3)); - ACL_CHECK(aclrtMalloc((void **)&v1Device, fileSize_v1, ACL_MEM_MALLOC_HUGE_FIRST)); - ACL_CHECK(aclrtMalloc((void **)&v2Device, fileSize_v2, ACL_MEM_MALLOC_HUGE_FIRST)); - ACL_CHECK(aclrtMalloc((void **)&v3Device, fileSize_v3, ACL_MEM_MALLOC_HUGE_FIRST)); - - ReadFile("./v1.bin", fileSize_v1, v1Host, fileSize_v1); - ReadFile("./v2.bin", fileSize_v2, v2Host, fileSize_v2); - ReadFile("./v3.bin", fileSize_v3, v3Host, fileSize_v3); - ACL_CHECK(aclrtMemcpy(v1Device, fileSize_v1, v1Host, fileSize_v1, - ACL_MEMCPY_HOST_TO_DEVICE)); - ACL_CHECK(aclrtMemcpy(v2Device, fileSize_v2, v2Host, fileSize_v2, - ACL_MEMCPY_HOST_TO_DEVICE)); - ACL_CHECK(aclrtMemcpy(v3Device, fileSize_v3, v3Host, fileSize_v3, - ACL_MEMCPY_HOST_TO_DEVICE)); - - LaunchVcmp_f32_exceptional_kernel_2d(v1Device, v2Device, v3Device, stream); - - ACL_CHECK(aclrtSynchronizeStream(stream)); - ACL_CHECK(aclrtMemcpy(v3Host, fileSize_v3, v3Device, fileSize_v3, - ACL_MEMCPY_DEVICE_TO_HOST)); - WriteFile("./v3.bin", v3Host, fileSize_v3); - -cleanup: - aclrtFree(v1Device); - aclrtFree(v2Device); - aclrtFree(v3Device); - aclrtFreeHost(v1Host); - aclrtFreeHost(v2Host); - aclrtFreeHost(v3Host); - if (stream != nullptr) - aclrtDestroyStream(stream); - if (deviceSet) - aclrtResetDevice(deviceId); - if (aclInited) - aclFinalize(); - return rc; -} diff --git a/test/vpto/cases/micro-op/compare-select/vcmp-lt/compare.py b/test/vpto/cases/micro-op/compare-select/vcmp-lt/compare.py deleted file mode 100644 index a872552e3..000000000 --- a/test/vpto/cases/micro-op/compare-select/vcmp-lt/compare.py +++ /dev/null @@ -1,35 +0,0 @@ -#!/usr/bin/python3 -# Copyright (c) 2026 Huawei Technologies Co., Ltd. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. 
You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. - - -import os -import sys -import numpy as np - - -def main(): - strict = os.getenv("COMPARE_STRICT", "1") != "0" - golden = np.fromfile("golden_v3.bin", dtype=np.uint8) - output = np.fromfile("v3.bin", dtype=np.uint8) - ok = golden.size >= 32 and output.size >= 32 and np.array_equal(golden[:32], output[:32]) - if not ok: - if golden.size and output.size: - diff = np.nonzero(golden[:32] != output[:32])[0] - idx = int(diff[0]) if diff.size else 0 - print(f"[ERROR] Mismatch: idx={idx} golden={int(golden[idx])} out={int(output[idx])}") - if strict: - print("[ERROR] compare failed") - sys.exit(2) - print("[WARN] compare failed (non-gating)") - return - print("[INFO] compare passed") - - -if __name__ == "__main__": - main() diff --git a/test/vpto/cases/micro-op/compare-select/vcmp-lt/golden.py b/test/vpto/cases/micro-op/compare-select/vcmp-lt/golden.py deleted file mode 100644 index 6feb1da41..000000000 --- a/test/vpto/cases/micro-op/compare-select/vcmp-lt/golden.py +++ /dev/null @@ -1,56 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) 2026 Huawei Technologies Co., Ltd. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. 
-# See LICENSE in the root of the software repository for the full text of the License. - - -import argparse -from pathlib import Path - -import numpy as np - - -LANES = 64 -SEED = 19 -OUTPUT_BYTES = 32 - - -def encode_b32_mask(mask: np.ndarray) -> np.ndarray: - out = np.zeros((OUTPUT_BYTES,), dtype=np.uint8) - for i, bit in enumerate(mask.astype(np.uint8, copy=False)): - if bit: - byte_index = i // 2 - nibble_shift = 4 * (i % 2) - out[byte_index] |= np.uint8(1 << nibble_shift) - return out - - -def generate(output_dir: Path, seed: int) -> None: - rng = np.random.default_rng(seed) - v1 = rng.uniform(-3.0, 3.0, size=(LANES,)).astype(np.float32) - delta = rng.uniform(0.25, 1.25, size=(LANES,)).astype(np.float32) - choose_less = (np.arange(LANES, dtype=np.int32) % 2) == 0 - v2 = np.where(choose_less, v1 + delta, v1 - delta).astype(np.float32) - mask = np.less(v1, v2) - - output_dir.mkdir(parents=True, exist_ok=True) - v1.tofile(output_dir / "v1.bin") - v2.tofile(output_dir / "v2.bin") - np.zeros((OUTPUT_BYTES,), dtype=np.uint8).tofile(output_dir / "v3.bin") - encode_b32_mask(mask).tofile(output_dir / "golden_v3.bin") - - -def main() -> None: - parser = argparse.ArgumentParser(description="Generate inputs/golden for VPTO vcmp-lt.") - parser.add_argument("--output-dir", type=Path, default=Path(".")) - parser.add_argument("--seed", type=int, default=SEED) - args = parser.parse_args() - generate(args.output_dir, args.seed) - - -if __name__ == "__main__": - main() diff --git a/test/vpto/cases/micro-op/compare-select/vcmp-lt/kernel.pto b/test/vpto/cases/micro-op/compare-select/vcmp-lt/kernel.pto deleted file mode 100644 index 266c88ce4..000000000 --- a/test/vpto/cases/micro-op/compare-select/vcmp-lt/kernel.pto +++ /dev/null @@ -1,51 +0,0 @@ -module attributes {pto.target_arch = "a5", pto.kernel_kind = #pto.kernel_kind} { - func.func @vcmp_lt_kernel_2d(%arg0: !pto.ptr, %arg1: !pto.ptr, - %arg2: !pto.ptr) attributes {pto.kernel} { - %c0 = arith.constant 0 : index - %c1 = 
arith.constant 1 : index - %c0_i64 = arith.constant 0 : i64 - %c1_i64 = arith.constant 1 : i64 - %c32_i64 = arith.constant 32 : i64 - %c64_i32 = arith.constant 64 : i32 - %c64_i64 = arith.constant 64 : i64 - %c256_i64 = arith.constant 256 : i64 - %c512_i64 = arith.constant 512 : i64 - %false = arith.constant false - - %ub_lhs = pto.castptr %c0_i64 : i64 -> !pto.ptr - %ub_rhs = pto.castptr %c256_i64 : i64 -> !pto.ptr - %ub_out = pto.castptr %c512_i64 : i64 -> !pto.ptr - pto.set_loop1_stride_outtoub %c256_i64, %c256_i64 : i64, i64 - pto.set_loop2_stride_outtoub %c256_i64, %c256_i64 : i64, i64 - pto.mte_gm_ub %arg0, %ub_lhs, %c0_i64, %c256_i64 - nburst(%c64_i64, %c256_i64, %c256_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 - pto.mte_gm_ub %arg1, %ub_rhs, %c0_i64, %c256_i64 - nburst(%c64_i64, %c256_i64, %c256_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 - - pto.set_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] - pto.wait_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] - - pto.vecscope { - %_:1 = scf.for %iv = %c0 to %c1 step %c1 iter_args(%remaining = %c64_i32) -> (i32) { - %active, %next = pto.plt_b32 %remaining : i32 -> !pto.mask, i32 - %lhs = pto.vlds %ub_lhs[%c0] : !pto.ptr -> !pto.vreg<64xf32> - %rhs = pto.vlds %ub_rhs[%c0] : !pto.ptr -> !pto.vreg<64xf32> - %pred = pto.vcmp %lhs, %rhs, %active, "lt" : !pto.vreg<64xf32>, !pto.vreg<64xf32>, !pto.mask -> !pto.mask - pto.psts %pred, %ub_out[%c0], "NORM" : !pto.mask, !pto.ptr, index - scf.yield %next : i32 - } - } - - pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.set_loop1_stride_ubtoout %c32_i64, %c32_i64 : i64, i64 - pto.set_loop2_stride_ubtoout %c32_i64, %c32_i64 : i64, i64 - pto.mte_ub_gm %ub_out, %arg2, %c32_i64 - nburst(%c32_i64, %c32_i64, %c32_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64 - pto.barrier #pto.pipe - return - } -} diff --git a/test/vpto/cases/micro-op/compare-select/vcmp-lt/launch.cpp 
b/test/vpto/cases/micro-op/compare-select/vcmp-lt/launch.cpp deleted file mode 100644 index 2762499e2..000000000 --- a/test/vpto/cases/micro-op/compare-select/vcmp-lt/launch.cpp +++ /dev/null @@ -1,51 +0,0 @@ -// Copyright (c) 2026 Huawei Technologies Co., Ltd. -// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). -// Please refer to the License for details. You may not use this file except in compliance with the License. -// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -// See LICENSE in the root of the software repository for the full text of the License. - -// --------------------------------------------------------------------------- -// PTOAS compatibility layer -// --------------------------------------------------------------------------- -#ifndef __VEC_SCOPE__ -#define __VEC_SCOPE__ -#endif - -#if defined(__CCE_AICORE__) && defined(__NPU_ARCH__) && (__NPU_ARCH__ == 2201) -typedef struct { unsigned char v; } hifloat8_t; -typedef struct { unsigned char v; } float8_e4m3_t; -typedef struct { unsigned char v; } float8_e5m2_t; -typedef struct { unsigned char v; } float8_e8m0_t; -typedef struct { unsigned char v; } float4_e1m2x2_t; -typedef struct { unsigned char v; } float4_e2m1x2_t; -#endif -#include - -#if defined(__CCE_AICORE__) && defined(PTOAS_ENABLE_CCE_PRINT) -#include -#endif - -#if !defined(__CCE_AICORE__) && !defined(TMRGSORT_HPP) -struct MrgSortExecutedNumList { - uint16_t mrgSortList0; - uint16_t mrgSortList1; - uint16_t mrgSortList2; - uint16_t mrgSortList3; -}; -#endif -#ifndef __CPU_SIM -#include "acl/acl.h" -#endif - -extern "C" __global__ [aicore] void vcmp_lt_kernel_2d(__gm__ float *v1, - __gm__ float *v2, - __gm__ unsigned char *v3); - -void 
LaunchVcmp_lt_kernel_2d(float *v1, float *v2, unsigned char *v3, - void *stream) { - vcmp_lt_kernel_2d<<<1, nullptr, stream>>>((__gm__ float *)v1, - (__gm__ float *)v2, - (__gm__ unsigned char *)v3); -} diff --git a/test/vpto/cases/micro-op/compare-select/vcmp-lt/main.cpp b/test/vpto/cases/micro-op/compare-select/vcmp-lt/main.cpp deleted file mode 100644 index fa06a715f..000000000 --- a/test/vpto/cases/micro-op/compare-select/vcmp-lt/main.cpp +++ /dev/null @@ -1,103 +0,0 @@ -// Copyright (c) 2026 Huawei Technologies Co., Ltd. -// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). -// Please refer to the License for details. You may not use this file except in compliance with the License. -// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -// See LICENSE in the root of the software repository for the full text of the License. - -/** -Copyright (c) 2025 Huawei Technologies Co., Ltd. 
-*/ - -#include "test_common.h" -#include "acl/acl.h" -#include -#include - -using namespace PtoTestCommon; - -#define ACL_CHECK(expr) \ - do { \ - const aclError _ret = (expr); \ - if (_ret != ACL_SUCCESS) { \ - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", #expr, \ - (int)_ret, __FILE__, __LINE__); \ - const char *_recent = aclGetRecentErrMsg(); \ - if (_recent != nullptr && _recent[0] != '\0') \ - std::fprintf(stderr, "[ERROR] RecentErrMsg: %s\n", _recent); \ - rc = 1; \ - goto cleanup; \ - } \ - } while (0) - -void LaunchVcmp_lt_kernel_2d(float *v1, float *v2, unsigned char *v3, - void *stream); - -int main() { - size_t elemCount_v1 = 64; - size_t fileSize_v1 = elemCount_v1 * sizeof(float); - size_t elemCount_v2 = 64; - size_t fileSize_v2 = elemCount_v2 * sizeof(float); - size_t elemCount_v3 = 32; - size_t fileSize_v3 = elemCount_v3 * sizeof(unsigned char); - float *v1Host = nullptr; - float *v1Device = nullptr; - float *v2Host = nullptr; - float *v2Device = nullptr; - unsigned char *v3Host = nullptr; - unsigned char *v3Device = nullptr; - - int rc = 0; - bool aclInited = false; - bool deviceSet = false; - int deviceId = 0; - aclrtStream stream = nullptr; - - ACL_CHECK(aclInit(nullptr)); - aclInited = true; - if (const char *envDevice = std::getenv("ACL_DEVICE_ID")) - deviceId = std::atoi(envDevice); - ACL_CHECK(aclrtSetDevice(deviceId)); - deviceSet = true; - ACL_CHECK(aclrtCreateStream(&stream)); - - ACL_CHECK(aclrtMallocHost((void **)(&v1Host), fileSize_v1)); - ACL_CHECK(aclrtMallocHost((void **)(&v2Host), fileSize_v2)); - ACL_CHECK(aclrtMallocHost((void **)(&v3Host), fileSize_v3)); - ACL_CHECK(aclrtMalloc((void **)&v1Device, fileSize_v1, ACL_MEM_MALLOC_HUGE_FIRST)); - ACL_CHECK(aclrtMalloc((void **)&v2Device, fileSize_v2, ACL_MEM_MALLOC_HUGE_FIRST)); - ACL_CHECK(aclrtMalloc((void **)&v3Device, fileSize_v3, ACL_MEM_MALLOC_HUGE_FIRST)); - - ReadFile("./v1.bin", fileSize_v1, v1Host, fileSize_v1); - ReadFile("./v2.bin", fileSize_v2, v2Host, 
fileSize_v2); - ReadFile("./v3.bin", fileSize_v3, v3Host, fileSize_v3); - ACL_CHECK(aclrtMemcpy(v1Device, fileSize_v1, v1Host, fileSize_v1, - ACL_MEMCPY_HOST_TO_DEVICE)); - ACL_CHECK(aclrtMemcpy(v2Device, fileSize_v2, v2Host, fileSize_v2, - ACL_MEMCPY_HOST_TO_DEVICE)); - ACL_CHECK(aclrtMemcpy(v3Device, fileSize_v3, v3Host, fileSize_v3, - ACL_MEMCPY_HOST_TO_DEVICE)); - - LaunchVcmp_lt_kernel_2d(v1Device, v2Device, v3Device, stream); - - ACL_CHECK(aclrtSynchronizeStream(stream)); - ACL_CHECK(aclrtMemcpy(v3Host, fileSize_v3, v3Device, fileSize_v3, - ACL_MEMCPY_DEVICE_TO_HOST)); - WriteFile("./v3.bin", v3Host, fileSize_v3); - -cleanup: - aclrtFree(v1Device); - aclrtFree(v2Device); - aclrtFree(v3Device); - aclrtFreeHost(v1Host); - aclrtFreeHost(v2Host); - aclrtFreeHost(v3Host); - if (stream != nullptr) - aclrtDestroyStream(stream); - if (deviceSet) - aclrtResetDevice(deviceId); - if (aclInited) - aclFinalize(); - return rc; -} diff --git a/test/vpto/cases/micro-op/compare-select/vcmp-tail/compare.py b/test/vpto/cases/micro-op/compare-select/vcmp-tail/compare.py deleted file mode 100644 index a872552e3..000000000 --- a/test/vpto/cases/micro-op/compare-select/vcmp-tail/compare.py +++ /dev/null @@ -1,35 +0,0 @@ -#!/usr/bin/python3 -# Copyright (c) 2026 Huawei Technologies Co., Ltd. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. 
- - -import os -import sys -import numpy as np - - -def main(): - strict = os.getenv("COMPARE_STRICT", "1") != "0" - golden = np.fromfile("golden_v3.bin", dtype=np.uint8) - output = np.fromfile("v3.bin", dtype=np.uint8) - ok = golden.size >= 32 and output.size >= 32 and np.array_equal(golden[:32], output[:32]) - if not ok: - if golden.size and output.size: - diff = np.nonzero(golden[:32] != output[:32])[0] - idx = int(diff[0]) if diff.size else 0 - print(f"[ERROR] Mismatch: idx={idx} golden={int(golden[idx])} out={int(output[idx])}") - if strict: - print("[ERROR] compare failed") - sys.exit(2) - print("[WARN] compare failed (non-gating)") - return - print("[INFO] compare passed") - - -if __name__ == "__main__": - main() diff --git a/test/vpto/cases/micro-op/compare-select/vcmp-tail/golden.py b/test/vpto/cases/micro-op/compare-select/vcmp-tail/golden.py deleted file mode 100644 index f59aded57..000000000 --- a/test/vpto/cases/micro-op/compare-select/vcmp-tail/golden.py +++ /dev/null @@ -1,79 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) 2026 Huawei Technologies Co., Ltd. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. 
- - -import argparse -from pathlib import Path - -import numpy as np - - -LANES = 64 -LOGICAL_ELEMS = 53 -SEED = 19 -OUTPUT_BYTES = 32 - - -def encode_b32_mask(mask: np.ndarray) -> np.ndarray: - out = np.zeros((OUTPUT_BYTES,), dtype=np.uint8) - for i, bit in enumerate(mask.astype(np.uint8, copy=False)): - if bit: - byte_index = i // 2 - nibble_shift = 4 * (i % 2) - out[byte_index] |= np.uint8(1 << nibble_shift) - return out - - -def generate(output_dir: Path, seed: int) -> None: - rng = np.random.default_rng(seed) - v1 = rng.uniform(-6.0, 6.0, size=(LANES,)).astype(np.float32) - delta = rng.uniform(0.1, 2.0, size=(LANES,)).astype(np.float32) - mode = np.arange(LANES, dtype=np.int32) % 5 - - v2 = np.empty((LANES,), dtype=np.float32) - v2[mode == 0] = v1[mode == 0] + delta[mode == 0] - v2[mode == 1] = v1[mode == 1] - delta[mode == 1] - v2[mode == 2] = v1[mode == 2] - v2[mode == 3] = np.nextafter(v1[mode == 3], np.float32(np.inf)) - v2[mode == 4] = np.nextafter(v1[mode == 4], np.float32(-np.inf)) - - v1[:10] = np.array([-3.0, -1.0, -0.0, 0.0, 0.25, 1.0, 2.0, 4.0, -4.0, 6.0], dtype=np.float32) - v2[:10] = np.array([ - -2.0, - -2.0, - 0.0, - np.nextafter(np.float32(0.0), np.float32(np.inf)), - 0.25, - np.nextafter(np.float32(1.0), np.float32(-np.inf)), - 3.0, - 3.0, - np.nextafter(np.float32(-4.0), np.float32(np.inf)), - 6.0, - ], dtype=np.float32) - - mask = np.less(v1, v2) - mask[LOGICAL_ELEMS:] = False - - output_dir.mkdir(parents=True, exist_ok=True) - v1.tofile(output_dir / "v1.bin") - v2.tofile(output_dir / "v2.bin") - np.zeros((OUTPUT_BYTES,), dtype=np.uint8).tofile(output_dir / "v3.bin") - encode_b32_mask(mask).tofile(output_dir / "golden_v3.bin") - - -def main() -> None: - parser = argparse.ArgumentParser(description="Generate inputs/golden for VPTO vcmp-tail.") - parser.add_argument("--output-dir", type=Path, default=Path(".")) - parser.add_argument("--seed", type=int, default=SEED) - args = parser.parse_args() - generate(args.output_dir, args.seed) - - -if 
__name__ == "__main__": - main() diff --git a/test/vpto/cases/micro-op/compare-select/vcmp-tail/kernel.pto b/test/vpto/cases/micro-op/compare-select/vcmp-tail/kernel.pto deleted file mode 100644 index fe2d00b22..000000000 --- a/test/vpto/cases/micro-op/compare-select/vcmp-tail/kernel.pto +++ /dev/null @@ -1,51 +0,0 @@ -module attributes {pto.target_arch = "a5", pto.kernel_kind = #pto.kernel_kind} { - func.func @vcmp_tail_kernel_2d(%arg0: !pto.ptr, %arg1: !pto.ptr, - %arg2: !pto.ptr) attributes {pto.kernel} { - %c0 = arith.constant 0 : index - %c1 = arith.constant 1 : index - %c0_i64 = arith.constant 0 : i64 - %c1_i64 = arith.constant 1 : i64 - %c32_i64 = arith.constant 32 : i64 - %c53_i32 = arith.constant 53 : i32 - %c64_i64 = arith.constant 64 : i64 - %c256_i64 = arith.constant 256 : i64 - %c512_i64 = arith.constant 512 : i64 - %false = arith.constant false - - %ub_lhs = pto.castptr %c0_i64 : i64 -> !pto.ptr - %ub_rhs = pto.castptr %c256_i64 : i64 -> !pto.ptr - %ub_out = pto.castptr %c512_i64 : i64 -> !pto.ptr - pto.set_loop1_stride_outtoub %c256_i64, %c256_i64 : i64, i64 - pto.set_loop2_stride_outtoub %c256_i64, %c256_i64 : i64, i64 - pto.mte_gm_ub %arg0, %ub_lhs, %c0_i64, %c256_i64 - nburst(%c64_i64, %c256_i64, %c256_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 - pto.mte_gm_ub %arg1, %ub_rhs, %c0_i64, %c256_i64 - nburst(%c64_i64, %c256_i64, %c256_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 - - pto.set_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] - pto.wait_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] - - pto.vecscope { - %_:1 = scf.for %iv = %c0 to %c1 step %c1 iter_args(%remaining = %c53_i32) -> (i32) { - %active, %next = pto.plt_b32 %remaining : i32 -> !pto.mask, i32 - %lhs = pto.vlds %ub_lhs[%c0] : !pto.ptr -> !pto.vreg<64xf32> - %rhs = pto.vlds %ub_rhs[%c0] : !pto.ptr -> !pto.vreg<64xf32> - %pred = pto.vcmp %lhs, %rhs, %active, "lt" : !pto.vreg<64xf32>, !pto.vreg<64xf32>, !pto.mask -> !pto.mask - pto.psts %pred, %ub_out[%c0], "NORM" : !pto.mask, 
!pto.ptr, index - scf.yield %next : i32 - } - } - - pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.set_loop1_stride_ubtoout %c32_i64, %c32_i64 : i64, i64 - pto.set_loop2_stride_ubtoout %c32_i64, %c32_i64 : i64, i64 - pto.mte_ub_gm %ub_out, %arg2, %c32_i64 - nburst(%c32_i64, %c32_i64, %c32_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64 - pto.barrier #pto.pipe - return - } -} diff --git a/test/vpto/cases/micro-op/compare-select/vcmp-tail/launch.cpp b/test/vpto/cases/micro-op/compare-select/vcmp-tail/launch.cpp deleted file mode 100644 index c57830b58..000000000 --- a/test/vpto/cases/micro-op/compare-select/vcmp-tail/launch.cpp +++ /dev/null @@ -1,51 +0,0 @@ -// Copyright (c) 2026 Huawei Technologies Co., Ltd. -// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). -// Please refer to the License for details. You may not use this file except in compliance with the License. -// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -// See LICENSE in the root of the software repository for the full text of the License. 
- -// --------------------------------------------------------------------------- -// PTOAS compatibility layer -// --------------------------------------------------------------------------- -#ifndef __VEC_SCOPE__ -#define __VEC_SCOPE__ -#endif - -#if defined(__CCE_AICORE__) && defined(__NPU_ARCH__) && (__NPU_ARCH__ == 2201) -typedef struct { unsigned char v; } hifloat8_t; -typedef struct { unsigned char v; } float8_e4m3_t; -typedef struct { unsigned char v; } float8_e5m2_t; -typedef struct { unsigned char v; } float8_e8m0_t; -typedef struct { unsigned char v; } float4_e1m2x2_t; -typedef struct { unsigned char v; } float4_e2m1x2_t; -#endif -#include - -#if defined(__CCE_AICORE__) && defined(PTOAS_ENABLE_CCE_PRINT) -#include -#endif - -#if !defined(__CCE_AICORE__) && !defined(TMRGSORT_HPP) -struct MrgSortExecutedNumList { - uint16_t mrgSortList0; - uint16_t mrgSortList1; - uint16_t mrgSortList2; - uint16_t mrgSortList3; -}; -#endif -#ifndef __CPU_SIM -#include "acl/acl.h" -#endif - -extern "C" __global__ [aicore] void vcmp_tail_kernel_2d(__gm__ float *v1, - __gm__ float *v2, - __gm__ unsigned char *v3); - -void LaunchVcmp_tail_kernel_2d(float *v1, float *v2, unsigned char *v3, - void *stream) { - vcmp_tail_kernel_2d<<<1, nullptr, stream>>>((__gm__ float *)v1, - (__gm__ float *)v2, - (__gm__ unsigned char *)v3); -} diff --git a/test/vpto/cases/micro-op/compare-select/vcmp-tail/main.cpp b/test/vpto/cases/micro-op/compare-select/vcmp-tail/main.cpp deleted file mode 100644 index ee8661a62..000000000 --- a/test/vpto/cases/micro-op/compare-select/vcmp-tail/main.cpp +++ /dev/null @@ -1,103 +0,0 @@ -// Copyright (c) 2026 Huawei Technologies Co., Ltd. -// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). -// Please refer to the License for details. You may not use this file except in compliance with the License. 
-// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -// See LICENSE in the root of the software repository for the full text of the License. - -/** -Copyright (c) 2025 Huawei Technologies Co., Ltd. -*/ - -#include "test_common.h" -#include "acl/acl.h" -#include -#include - -using namespace PtoTestCommon; - -#define ACL_CHECK(expr) \ - do { \ - const aclError _ret = (expr); \ - if (_ret != ACL_SUCCESS) { \ - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", #expr, \ - (int)_ret, __FILE__, __LINE__); \ - const char *_recent = aclGetRecentErrMsg(); \ - if (_recent != nullptr && _recent[0] != '\0') \ - std::fprintf(stderr, "[ERROR] RecentErrMsg: %s\n", _recent); \ - rc = 1; \ - goto cleanup; \ - } \ - } while (0) - -void LaunchVcmp_tail_kernel_2d(float *v1, float *v2, unsigned char *v3, - void *stream); - -int main() { - size_t elemCount_v1 = 64; - size_t fileSize_v1 = elemCount_v1 * sizeof(float); - size_t elemCount_v2 = 64; - size_t fileSize_v2 = elemCount_v2 * sizeof(float); - size_t elemCount_v3 = 32; - size_t fileSize_v3 = elemCount_v3 * sizeof(unsigned char); - float *v1Host = nullptr; - float *v1Device = nullptr; - float *v2Host = nullptr; - float *v2Device = nullptr; - unsigned char *v3Host = nullptr; - unsigned char *v3Device = nullptr; - - int rc = 0; - bool aclInited = false; - bool deviceSet = false; - int deviceId = 0; - aclrtStream stream = nullptr; - - ACL_CHECK(aclInit(nullptr)); - aclInited = true; - if (const char *envDevice = std::getenv("ACL_DEVICE_ID")) - deviceId = std::atoi(envDevice); - ACL_CHECK(aclrtSetDevice(deviceId)); - deviceSet = true; - ACL_CHECK(aclrtCreateStream(&stream)); - - ACL_CHECK(aclrtMallocHost((void **)(&v1Host), fileSize_v1)); - ACL_CHECK(aclrtMallocHost((void **)(&v2Host), fileSize_v2)); - ACL_CHECK(aclrtMallocHost((void **)(&v3Host), fileSize_v3)); - 
ACL_CHECK(aclrtMalloc((void **)&v1Device, fileSize_v1, ACL_MEM_MALLOC_HUGE_FIRST)); - ACL_CHECK(aclrtMalloc((void **)&v2Device, fileSize_v2, ACL_MEM_MALLOC_HUGE_FIRST)); - ACL_CHECK(aclrtMalloc((void **)&v3Device, fileSize_v3, ACL_MEM_MALLOC_HUGE_FIRST)); - - ReadFile("./v1.bin", fileSize_v1, v1Host, fileSize_v1); - ReadFile("./v2.bin", fileSize_v2, v2Host, fileSize_v2); - ReadFile("./v3.bin", fileSize_v3, v3Host, fileSize_v3); - ACL_CHECK(aclrtMemcpy(v1Device, fileSize_v1, v1Host, fileSize_v1, - ACL_MEMCPY_HOST_TO_DEVICE)); - ACL_CHECK(aclrtMemcpy(v2Device, fileSize_v2, v2Host, fileSize_v2, - ACL_MEMCPY_HOST_TO_DEVICE)); - ACL_CHECK(aclrtMemcpy(v3Device, fileSize_v3, v3Host, fileSize_v3, - ACL_MEMCPY_HOST_TO_DEVICE)); - - LaunchVcmp_tail_kernel_2d(v1Device, v2Device, v3Device, stream); - - ACL_CHECK(aclrtSynchronizeStream(stream)); - ACL_CHECK(aclrtMemcpy(v3Host, fileSize_v3, v3Device, fileSize_v3, - ACL_MEMCPY_DEVICE_TO_HOST)); - WriteFile("./v3.bin", v3Host, fileSize_v3); - -cleanup: - aclrtFree(v1Device); - aclrtFree(v2Device); - aclrtFree(v3Device); - aclrtFreeHost(v1Host); - aclrtFreeHost(v2Host); - aclrtFreeHost(v3Host); - if (stream != nullptr) - aclrtDestroyStream(stream); - if (deviceSet) - aclrtResetDevice(deviceId); - if (aclInited) - aclFinalize(); - return rc; -} diff --git a/test/vpto/cases/micro-op/compare-select/vcmps-f32-exceptional/compare.py b/test/vpto/cases/micro-op/compare-select/vcmps-f32-exceptional/compare.py deleted file mode 100644 index bc2a4827f..000000000 --- a/test/vpto/cases/micro-op/compare-select/vcmps-f32-exceptional/compare.py +++ /dev/null @@ -1,45 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) 2026 Huawei Technologies Co., Ltd. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. 
-# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. - - -import os -import sys -import numpy as np - - -def compare_mask(golden_path, output_path): - if not os.path.exists(golden_path) or not os.path.exists(output_path): - return False - golden = np.fromfile(golden_path, dtype=np.uint8) - output = np.fromfile(output_path, dtype=np.uint8) - if golden.shape != output.shape: - print(f"[ERROR] Shape mismatch: {golden.shape} vs {output.shape}") - return False - if not np.array_equal(golden, output): - diff = np.nonzero(golden != output)[0] - idx = int(diff[0]) if diff.size else 0 - print(f"[ERROR] Mismatch (packed mask): idx={idx} golden={int(golden[idx])} out={int(output[idx])}") - return False - return True - - -def main(): - strict = os.getenv("COMPARE_STRICT", "1") != "0" - ok = compare_mask("golden_v2.bin", "v2.bin") - if not ok: - if strict: - print("[ERROR] compare failed") - sys.exit(2) - print("[WARN] compare failed (non-gating)") - return - print("[INFO] compare passed") - - -if __name__ == "__main__": - main() diff --git a/test/vpto/cases/micro-op/compare-select/vcmps-f32-exceptional/golden.py b/test/vpto/cases/micro-op/compare-select/vcmps-f32-exceptional/golden.py deleted file mode 100644 index d2ca06dc2..000000000 --- a/test/vpto/cases/micro-op/compare-select/vcmps-f32-exceptional/golden.py +++ /dev/null @@ -1,57 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) 2026 Huawei Technologies Co., Ltd. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. 
-# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. - - -import argparse -from pathlib import Path - -import numpy as np - - -LANES = 64 -SEED = 19 -THRESHOLD = np.float32(0.5) -OUTPUT_BYTES = 32 - - -def encode_b32_mask(mask: np.ndarray) -> np.ndarray: - out = np.zeros((OUTPUT_BYTES,), dtype=np.uint8) - for i, bit in enumerate(mask.astype(np.uint8, copy=False)): - if bit: - byte_index = i // 2 - nibble_shift = 4 * (i % 2) - out[byte_index] |= np.uint8(1 << nibble_shift) - return out - - -def generate(output_dir: Path, seed: int) -> None: - del seed - specials = np.array( - [-np.inf, -1.0, -0.0, 0.0, 0.5, 0.75, np.inf, np.nan], - dtype=np.float32, - ) - v1 = np.resize(specials, LANES).astype(np.float32) - mask = np.greater(v1, THRESHOLD) - - output_dir.mkdir(parents=True, exist_ok=True) - v1.tofile(output_dir / "v1.bin") - np.zeros((OUTPUT_BYTES,), dtype=np.uint8).tofile(output_dir / "v2.bin") - encode_b32_mask(mask).tofile(output_dir / "golden_v2.bin") - - -def main() -> None: - parser = argparse.ArgumentParser(description="Generate inputs/golden for VPTO vcmps-f32-exceptional.") - parser.add_argument("--output-dir", type=Path, default=Path(".")) - parser.add_argument("--seed", type=int, default=SEED) - args = parser.parse_args() - generate(args.output_dir, args.seed) - - -if __name__ == "__main__": - main() diff --git a/test/vpto/cases/micro-op/compare-select/vcmps-f32-exceptional/kernel.pto b/test/vpto/cases/micro-op/compare-select/vcmps-f32-exceptional/kernel.pto deleted file mode 100644 index f61d05eb9..000000000 --- a/test/vpto/cases/micro-op/compare-select/vcmps-f32-exceptional/kernel.pto +++ /dev/null @@ -1,46 +0,0 @@ -module attributes {pto.target_arch = "a5", pto.kernel_kind = #pto.kernel_kind} { - func.func 
@vcmps_f32_exceptional_kernel_2d(%arg0: !pto.ptr, %arg1: !pto.ptr) attributes {pto.kernel} { - %c0 = arith.constant 0 : index - %c1 = arith.constant 1 : index - %c0_i64 = arith.constant 0 : i64 - %c1_i64 = arith.constant 1 : i64 - %c32_i64 = arith.constant 32 : i64 - %c64_i32 = arith.constant 64 : i32 - %c64_i64 = arith.constant 64 : i64 - %c128_i64 = arith.constant 128 : i64 - %c256_i64 = arith.constant 256 : i64 - %threshold = arith.constant 5.000000e-01 : f32 - %false = arith.constant false - - %ub_src = pto.castptr %c0_i64 : i64 -> !pto.ptr - %ub_out = pto.castptr %c256_i64 : i64 -> !pto.ptr - pto.set_loop1_stride_outtoub %c256_i64, %c256_i64 : i64, i64 - pto.set_loop2_stride_outtoub %c256_i64, %c256_i64 : i64, i64 - pto.mte_gm_ub %arg0, %ub_src, %c0_i64, %c256_i64 - nburst(%c64_i64, %c256_i64, %c256_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 - - pto.set_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] - pto.wait_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] - - pto.vecscope { - %_:1 = scf.for %iv = %c0 to %c1 step %c1 iter_args(%remaining = %c64_i32) -> (i32) { - %active, %next = pto.plt_b32 %remaining : i32 -> !pto.mask, i32 - %src = pto.vlds %ub_src[%c0] : !pto.ptr -> !pto.vreg<64xf32> - %pred = pto.vcmps %src, %threshold, %active, "gt" : !pto.vreg<64xf32>, f32, !pto.mask -> !pto.mask - pto.psts %pred, %ub_out[%c0], "NORM" : !pto.mask, !pto.ptr, index - scf.yield %next : i32 - } - } - - pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.set_loop1_stride_ubtoout %c32_i64, %c32_i64 : i64, i64 - pto.set_loop2_stride_ubtoout %c32_i64, %c32_i64 : i64, i64 - pto.mte_ub_gm %ub_out, %arg1, %c32_i64 - nburst(%c32_i64, %c32_i64, %c32_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64 - pto.barrier #pto.pipe - return - } -} diff --git a/test/vpto/cases/micro-op/compare-select/vcmps-f32-exceptional/launch.cpp b/test/vpto/cases/micro-op/compare-select/vcmps-f32-exceptional/launch.cpp deleted file mode 100644 index 
e96d87fec..000000000 --- a/test/vpto/cases/micro-op/compare-select/vcmps-f32-exceptional/launch.cpp +++ /dev/null @@ -1,49 +0,0 @@ -// Copyright (c) 2026 Huawei Technologies Co., Ltd. -// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). -// Please refer to the License for details. You may not use this file except in compliance with the License. -// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -// See LICENSE in the root of the software repository for the full text of the License. - -// --------------------------------------------------------------------------- -// PTOAS compatibility layer -// --------------------------------------------------------------------------- -#ifndef __VEC_SCOPE__ -#define __VEC_SCOPE__ -#endif - -#if defined(__CCE_AICORE__) && defined(__NPU_ARCH__) && (__NPU_ARCH__ == 2201) -typedef struct { unsigned char v; } hifloat8_t; -typedef struct { unsigned char v; } float8_e4m3_t; -typedef struct { unsigned char v; } float8_e5m2_t; -typedef struct { unsigned char v; } float8_e8m0_t; -typedef struct { unsigned char v; } float4_e1m2x2_t; -typedef struct { unsigned char v; } float4_e2m1x2_t; -#endif -#include - -#if defined(__CCE_AICORE__) && defined(PTOAS_ENABLE_CCE_PRINT) -#include -#endif - -#if !defined(__CCE_AICORE__) && !defined(TMRGSORT_HPP) -struct MrgSortExecutedNumList { - uint16_t mrgSortList0; - uint16_t mrgSortList1; - uint16_t mrgSortList2; - uint16_t mrgSortList3; -}; -#endif -#ifndef __CPU_SIM -#include "acl/acl.h" -#endif - -extern "C" __global__ [aicore] void vcmps_f32_exceptional_kernel_2d(__gm__ float *v1, - __gm__ unsigned char *v2); - -void LaunchVcmps_f32_exceptional_kernel_2d(float *v1, unsigned char *v2, - void *stream) { - 
vcmps_f32_exceptional_kernel_2d<<<1, nullptr, stream>>>((__gm__ float *)v1, - (__gm__ unsigned char *)v2); -} diff --git a/test/vpto/cases/micro-op/compare-select/vcmps-f32-exceptional/main.cpp b/test/vpto/cases/micro-op/compare-select/vcmps-f32-exceptional/main.cpp deleted file mode 100644 index d8d7a33b6..000000000 --- a/test/vpto/cases/micro-op/compare-select/vcmps-f32-exceptional/main.cpp +++ /dev/null @@ -1,92 +0,0 @@ -// Copyright (c) 2026 Huawei Technologies Co., Ltd. -// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). -// Please refer to the License for details. You may not use this file except in compliance with the License. -// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -// See LICENSE in the root of the software repository for the full text of the License. - -/** -Copyright (c) 2025 Huawei Technologies Co., Ltd. 
-*/ - -#include "test_common.h" -#include "acl/acl.h" -#include -#include - -using namespace PtoTestCommon; - -#define ACL_CHECK(expr) \ - do { \ - const aclError _ret = (expr); \ - if (_ret != ACL_SUCCESS) { \ - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", #expr, \ - (int)_ret, __FILE__, __LINE__); \ - const char *_recent = aclGetRecentErrMsg(); \ - if (_recent != nullptr && _recent[0] != '\0') \ - std::fprintf(stderr, "[ERROR] RecentErrMsg: %s\n", _recent); \ - rc = 1; \ - goto cleanup; \ - } \ - } while (0) - -void LaunchVcmps_f32_exceptional_kernel_2d(float *v1, unsigned char *v2, - void *stream); - -int main() { - size_t elemCount_v1 = 64; - size_t fileSize_v1 = elemCount_v1 * sizeof(float); - size_t elemCount_v2 = 32; - size_t fileSize_v2 = elemCount_v2 * sizeof(unsigned char); - float *v1Host = nullptr; - float *v1Device = nullptr; - unsigned char *v2Host = nullptr; - unsigned char *v2Device = nullptr; - - int rc = 0; - bool aclInited = false; - bool deviceSet = false; - int deviceId = 0; - aclrtStream stream = nullptr; - - ACL_CHECK(aclInit(nullptr)); - aclInited = true; - if (const char *envDevice = std::getenv("ACL_DEVICE_ID")) - deviceId = std::atoi(envDevice); - ACL_CHECK(aclrtSetDevice(deviceId)); - deviceSet = true; - ACL_CHECK(aclrtCreateStream(&stream)); - - ACL_CHECK(aclrtMallocHost((void **)(&v1Host), fileSize_v1)); - ACL_CHECK(aclrtMallocHost((void **)(&v2Host), fileSize_v2)); - ACL_CHECK(aclrtMalloc((void **)&v1Device, fileSize_v1, ACL_MEM_MALLOC_HUGE_FIRST)); - ACL_CHECK(aclrtMalloc((void **)&v2Device, fileSize_v2, ACL_MEM_MALLOC_HUGE_FIRST)); - - ReadFile("./v1.bin", fileSize_v1, v1Host, fileSize_v1); - ReadFile("./v2.bin", fileSize_v2, v2Host, fileSize_v2); - ACL_CHECK(aclrtMemcpy(v1Device, fileSize_v1, v1Host, fileSize_v1, - ACL_MEMCPY_HOST_TO_DEVICE)); - ACL_CHECK(aclrtMemcpy(v2Device, fileSize_v2, v2Host, fileSize_v2, - ACL_MEMCPY_HOST_TO_DEVICE)); - - LaunchVcmps_f32_exceptional_kernel_2d(v1Device, v2Device, stream); - - 
ACL_CHECK(aclrtSynchronizeStream(stream)); - ACL_CHECK(aclrtMemcpy(v2Host, fileSize_v2, v2Device, fileSize_v2, - ACL_MEMCPY_DEVICE_TO_HOST)); - WriteFile("./v2.bin", v2Host, fileSize_v2); - -cleanup: - aclrtFree(v1Device); - aclrtFree(v2Device); - aclrtFreeHost(v1Host); - aclrtFreeHost(v2Host); - if (stream != nullptr) - aclrtDestroyStream(stream); - if (deviceSet) - aclrtResetDevice(deviceId); - if (aclInited) - aclFinalize(); - return rc; -} diff --git a/test/vpto/cases/micro-op/compare-select/vcmps-f32/kernel.pto b/test/vpto/cases/micro-op/compare-select/vcmps-f32/kernel.pto index 448ad347e..8a3ef8f1c 100644 --- a/test/vpto/cases/micro-op/compare-select/vcmps-f32/kernel.pto +++ b/test/vpto/cases/micro-op/compare-select/vcmps-f32/kernel.pto @@ -41,6 +41,98 @@ module attributes {pto.target_arch = "a5", pto.kernel_kind = #pto.kernel_kind, !pto.ptr, i64, i64, i64, i64 pto.barrier #pto.pipe + %__case_merge_guard = arith.constant false + + // inactive cross-case merge from micro-op/compare-select/vcmps-f32-exceptional + scf.if %__case_merge_guard { + + %c0_cmg4_1 = arith.constant 0 : index + %c1_cmg4_1 = arith.constant 1 : index + %c0_i64_cmg4_1 = arith.constant 0 : i64 + %c1_i64_cmg4_1 = arith.constant 1 : i64 + %c32_i64_cmg4_1 = arith.constant 32 : i64 + %c64_i32_cmg4_1 = arith.constant 64 : i32 + %c64_i64_cmg4_1 = arith.constant 64 : i64 + %c128_i64_cmg4_1 = arith.constant 128 : i64 + %c256_i64_cmg4_1 = arith.constant 256 : i64 + %threshold_cmg4_1 = arith.constant 5.000000e-01 : f32 + %false_cmg4_1 = arith.constant false + + %ub_src_cmg4_1 = pto.castptr %c0_i64_cmg4_1 : i64 -> !pto.ptr + %ub_out_cmg4_1 = pto.castptr %c256_i64_cmg4_1 : i64 -> !pto.ptr + pto.set_loop1_stride_outtoub %c256_i64_cmg4_1, %c256_i64_cmg4_1 : i64, i64 + pto.set_loop2_stride_outtoub %c256_i64_cmg4_1, %c256_i64_cmg4_1 : i64, i64 + pto.mte_gm_ub %arg0, %ub_src_cmg4_1, %c0_i64_cmg4_1, %c256_i64_cmg4_1 + nburst(%c64_i64_cmg4_1, %c256_i64_cmg4_1, %c256_i64_cmg4_1) + : !pto.ptr, !pto.ptr, i64, 
i64, i64, i64, i64 + + pto.set_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] + pto.wait_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] + + pto.vecscope { + %__cmg4_1:1 = scf.for %iv_cmg4_1 = %c0_cmg4_1 to %c1_cmg4_1 step %c1_cmg4_1 iter_args(%remaining_cmg4_1 = %c64_i32_cmg4_1) -> (i32) { + %active_cmg4_1, %next_cmg4_1 = pto.plt_b32 %remaining_cmg4_1 : i32 -> !pto.mask, i32 + %src_cmg4_1 = pto.vlds %ub_src_cmg4_1[%c0_cmg4_1] : !pto.ptr -> !pto.vreg<64xf32> + %pred_cmg4_1 = pto.vcmps %src_cmg4_1, %threshold_cmg4_1, %active_cmg4_1, "gt" : !pto.vreg<64xf32>, f32, !pto.mask -> !pto.mask + pto.psts %pred_cmg4_1, %ub_out_cmg4_1[%c0_cmg4_1], "NORM" : !pto.mask, !pto.ptr, index + scf.yield %next_cmg4_1 : i32 + } + } + + pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] + pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] + pto.set_loop1_stride_ubtoout %c32_i64_cmg4_1, %c32_i64_cmg4_1 : i64, i64 + pto.set_loop2_stride_ubtoout %c32_i64_cmg4_1, %c32_i64_cmg4_1 : i64, i64 + pto.mte_ub_gm %ub_out_cmg4_1, %arg1, %c32_i64_cmg4_1 + nburst(%c32_i64_cmg4_1, %c32_i64_cmg4_1, %c32_i64_cmg4_1) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64 + pto.barrier #pto.pipe + } + + // inactive cross-case merge from micro-op/compare-select/vcmps-tail + scf.if %__case_merge_guard { + + %c0_cmg4_2 = arith.constant 0 : index + %c1_cmg4_2 = arith.constant 1 : index + %c0_i64_cmg4_2 = arith.constant 0 : i64 + %c1_i64_cmg4_2 = arith.constant 1 : i64 + %c32_i64_cmg4_2 = arith.constant 32 : i64 + %c40_i32_cmg4_2 = arith.constant 40 : i32 + %c64_i64_cmg4_2 = arith.constant 64 : i64 + %c256_i64_cmg4_2 = arith.constant 256 : i64 + %threshold_cmg4_2 = arith.constant 5.000000e-01 : f32 + %false_cmg4_2 = arith.constant false + + %ub_src_cmg4_2 = pto.castptr %c0_i64_cmg4_2 : i64 -> !pto.ptr + %ub_out_cmg4_2 = pto.castptr %c256_i64_cmg4_2 : i64 -> !pto.ptr + pto.set_loop1_stride_outtoub %c256_i64_cmg4_2, %c256_i64_cmg4_2 : i64, i64 + pto.set_loop2_stride_outtoub %c256_i64_cmg4_2, %c256_i64_cmg4_2 : i64, i64 + pto.mte_gm_ub %arg0, 
%ub_src_cmg4_2, %c0_i64_cmg4_2, %c256_i64_cmg4_2 + nburst(%c64_i64_cmg4_2, %c256_i64_cmg4_2, %c256_i64_cmg4_2) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 + + pto.set_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] + pto.wait_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] + + pto.vecscope { + %__cmg4_2:1 = scf.for %iv_cmg4_2 = %c0_cmg4_2 to %c1_cmg4_2 step %c1_cmg4_2 iter_args(%remaining_cmg4_2 = %c40_i32_cmg4_2) -> (i32) { + %active_cmg4_2, %next_cmg4_2 = pto.plt_b32 %remaining_cmg4_2 : i32 -> !pto.mask, i32 + %src_cmg4_2 = pto.vlds %ub_src_cmg4_2[%c0_cmg4_2] : !pto.ptr -> !pto.vreg<64xf32> + %pred_cmg4_2 = pto.vcmps %src_cmg4_2, %threshold_cmg4_2, %active_cmg4_2, "gt" : !pto.vreg<64xf32>, f32, !pto.mask -> !pto.mask + pto.psts %pred_cmg4_2, %ub_out_cmg4_2[%c0_cmg4_2], "NORM" : !pto.mask, !pto.ptr, index + scf.yield %next_cmg4_2 : i32 + } + } + + pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] + pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] + pto.set_loop1_stride_ubtoout %c32_i64_cmg4_2, %c32_i64_cmg4_2 : i64, i64 + pto.set_loop2_stride_ubtoout %c32_i64_cmg4_2, %c32_i64_cmg4_2 : i64, i64 + pto.mte_ub_gm %ub_out_cmg4_2, %arg1, %c32_i64_cmg4_2 + nburst(%c32_i64_cmg4_2, %c32_i64_cmg4_2, %c32_i64_cmg4_2) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64 + pto.barrier #pto.pipe + } return } } diff --git a/test/vpto/cases/micro-op/compare-select/vcmps-tail/compare.py b/test/vpto/cases/micro-op/compare-select/vcmps-tail/compare.py deleted file mode 100644 index bc2a4827f..000000000 --- a/test/vpto/cases/micro-op/compare-select/vcmps-tail/compare.py +++ /dev/null @@ -1,45 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) 2026 Huawei Technologies Co., Ltd. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. 
-# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. - - -import os -import sys -import numpy as np - - -def compare_mask(golden_path, output_path): - if not os.path.exists(golden_path) or not os.path.exists(output_path): - return False - golden = np.fromfile(golden_path, dtype=np.uint8) - output = np.fromfile(output_path, dtype=np.uint8) - if golden.shape != output.shape: - print(f"[ERROR] Shape mismatch: {golden.shape} vs {output.shape}") - return False - if not np.array_equal(golden, output): - diff = np.nonzero(golden != output)[0] - idx = int(diff[0]) if diff.size else 0 - print(f"[ERROR] Mismatch (packed mask): idx={idx} golden={int(golden[idx])} out={int(output[idx])}") - return False - return True - - -def main(): - strict = os.getenv("COMPARE_STRICT", "1") != "0" - ok = compare_mask("golden_v2.bin", "v2.bin") - if not ok: - if strict: - print("[ERROR] compare failed") - sys.exit(2) - print("[WARN] compare failed (non-gating)") - return - print("[INFO] compare passed") - - -if __name__ == "__main__": - main() diff --git a/test/vpto/cases/micro-op/compare-select/vcmps-tail/golden.py b/test/vpto/cases/micro-op/compare-select/vcmps-tail/golden.py deleted file mode 100644 index e36631b9a..000000000 --- a/test/vpto/cases/micro-op/compare-select/vcmps-tail/golden.py +++ /dev/null @@ -1,71 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) 2026 Huawei Technologies Co., Ltd. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. 
-# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. - - -import argparse -from pathlib import Path - -import numpy as np - - -LANES = 64 -LOGICAL_ELEMS = 40 -SEED = 19 -THRESHOLD = np.float32(0.5) -OUTPUT_BYTES = 32 - - -def encode_b32_mask(mask: np.ndarray) -> np.ndarray: - out = np.zeros((OUTPUT_BYTES,), dtype=np.uint8) - for i, bit in enumerate(mask.astype(np.uint8, copy=False)): - if bit: - byte_index = i // 2 - nibble_shift = 4 * (i % 2) - out[byte_index] |= np.uint8(1 << nibble_shift) - return out - - -def generate(output_dir: Path, seed: int) -> None: - rng = np.random.default_rng(seed) - v1 = rng.uniform(-2.0, 2.0, size=(LANES,)).astype(np.float32) - - v1[:12] = np.array([ - THRESHOLD, - np.nextafter(THRESHOLD, np.float32(np.inf)), - np.nextafter(THRESHOLD, np.float32(-np.inf)), - 0.0, - -0.0, - -1.0, - 1.0, - 2.0, - -2.0, - THRESHOLD + np.float32(0.25), - THRESHOLD - np.float32(0.25), - THRESHOLD, - ], dtype=np.float32) - - mask = np.greater(v1, THRESHOLD) - mask[LOGICAL_ELEMS:] = False - - output_dir.mkdir(parents=True, exist_ok=True) - v1.tofile(output_dir / "v1.bin") - np.zeros((OUTPUT_BYTES,), dtype=np.uint8).tofile(output_dir / "v2.bin") - encode_b32_mask(mask).tofile(output_dir / "golden_v2.bin") - - -def main() -> None: - parser = argparse.ArgumentParser(description="Generate inputs/golden for VPTO vcmps-tail.") - parser.add_argument("--output-dir", type=Path, default=Path(".")) - parser.add_argument("--seed", type=int, default=SEED) - args = parser.parse_args() - generate(args.output_dir, args.seed) - - -if __name__ == "__main__": - main() diff --git a/test/vpto/cases/micro-op/compare-select/vcmps-tail/kernel.pto b/test/vpto/cases/micro-op/compare-select/vcmps-tail/kernel.pto deleted file mode 100644 
index 7ed9ffeed..000000000 --- a/test/vpto/cases/micro-op/compare-select/vcmps-tail/kernel.pto +++ /dev/null @@ -1,45 +0,0 @@ -module attributes {pto.target_arch = "a5", pto.kernel_kind = #pto.kernel_kind} { - func.func @vcmps_tail_kernel_2d(%arg0: !pto.ptr, %arg1: !pto.ptr) attributes {pto.kernel} { - %c0 = arith.constant 0 : index - %c1 = arith.constant 1 : index - %c0_i64 = arith.constant 0 : i64 - %c1_i64 = arith.constant 1 : i64 - %c32_i64 = arith.constant 32 : i64 - %c40_i32 = arith.constant 40 : i32 - %c64_i64 = arith.constant 64 : i64 - %c256_i64 = arith.constant 256 : i64 - %threshold = arith.constant 5.000000e-01 : f32 - %false = arith.constant false - - %ub_src = pto.castptr %c0_i64 : i64 -> !pto.ptr - %ub_out = pto.castptr %c256_i64 : i64 -> !pto.ptr - pto.set_loop1_stride_outtoub %c256_i64, %c256_i64 : i64, i64 - pto.set_loop2_stride_outtoub %c256_i64, %c256_i64 : i64, i64 - pto.mte_gm_ub %arg0, %ub_src, %c0_i64, %c256_i64 - nburst(%c64_i64, %c256_i64, %c256_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 - - pto.set_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] - pto.wait_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] - - pto.vecscope { - %_:1 = scf.for %iv = %c0 to %c1 step %c1 iter_args(%remaining = %c40_i32) -> (i32) { - %active, %next = pto.plt_b32 %remaining : i32 -> !pto.mask, i32 - %src = pto.vlds %ub_src[%c0] : !pto.ptr -> !pto.vreg<64xf32> - %pred = pto.vcmps %src, %threshold, %active, "gt" : !pto.vreg<64xf32>, f32, !pto.mask -> !pto.mask - pto.psts %pred, %ub_out[%c0], "NORM" : !pto.mask, !pto.ptr, index - scf.yield %next : i32 - } - } - - pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.set_loop1_stride_ubtoout %c32_i64, %c32_i64 : i64, i64 - pto.set_loop2_stride_ubtoout %c32_i64, %c32_i64 : i64, i64 - pto.mte_ub_gm %ub_out, %arg1, %c32_i64 - nburst(%c32_i64, %c32_i64, %c32_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64 - pto.barrier #pto.pipe - return - } -} diff --git 
a/test/vpto/cases/micro-op/compare-select/vcmps-tail/launch.cpp b/test/vpto/cases/micro-op/compare-select/vcmps-tail/launch.cpp deleted file mode 100644 index a210fc2fa..000000000 --- a/test/vpto/cases/micro-op/compare-select/vcmps-tail/launch.cpp +++ /dev/null @@ -1,48 +0,0 @@ -// Copyright (c) 2026 Huawei Technologies Co., Ltd. -// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). -// Please refer to the License for details. You may not use this file except in compliance with the License. -// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -// See LICENSE in the root of the software repository for the full text of the License. - -// --------------------------------------------------------------------------- -// PTOAS compatibility layer -// --------------------------------------------------------------------------- -#ifndef __VEC_SCOPE__ -#define __VEC_SCOPE__ -#endif - -#if defined(__CCE_AICORE__) && defined(__NPU_ARCH__) && (__NPU_ARCH__ == 2201) -typedef struct { unsigned char v; } hifloat8_t; -typedef struct { unsigned char v; } float8_e4m3_t; -typedef struct { unsigned char v; } float8_e5m2_t; -typedef struct { unsigned char v; } float8_e8m0_t; -typedef struct { unsigned char v; } float4_e1m2x2_t; -typedef struct { unsigned char v; } float4_e2m1x2_t; -#endif -#include - -#if defined(__CCE_AICORE__) && defined(PTOAS_ENABLE_CCE_PRINT) -#include -#endif - -#if !defined(__CCE_AICORE__) && !defined(TMRGSORT_HPP) -struct MrgSortExecutedNumList { - uint16_t mrgSortList0; - uint16_t mrgSortList1; - uint16_t mrgSortList2; - uint16_t mrgSortList3; -}; -#endif -#ifndef __CPU_SIM -#include "acl/acl.h" -#endif - -extern "C" __global__ [aicore] void vcmps_tail_kernel_2d(__gm__ float *v1, 
- __gm__ unsigned char *v2); - -void LaunchVcmps_tail_kernel_2d(float *v1, unsigned char *v2, void *stream) { - vcmps_tail_kernel_2d<<<1, nullptr, stream>>>((__gm__ float *)v1, - (__gm__ unsigned char *)v2); -} diff --git a/test/vpto/cases/micro-op/compare-select/vcmps-tail/main.cpp b/test/vpto/cases/micro-op/compare-select/vcmps-tail/main.cpp deleted file mode 100644 index 941741c4b..000000000 --- a/test/vpto/cases/micro-op/compare-select/vcmps-tail/main.cpp +++ /dev/null @@ -1,91 +0,0 @@ -// Copyright (c) 2026 Huawei Technologies Co., Ltd. -// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). -// Please refer to the License for details. You may not use this file except in compliance with the License. -// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -// See LICENSE in the root of the software repository for the full text of the License. - -/** -Copyright (c) 2025 Huawei Technologies Co., Ltd. 
-*/ - -#include "test_common.h" -#include "acl/acl.h" -#include -#include - -using namespace PtoTestCommon; - -#define ACL_CHECK(expr) \ - do { \ - const aclError _ret = (expr); \ - if (_ret != ACL_SUCCESS) { \ - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", #expr, \ - (int)_ret, __FILE__, __LINE__); \ - const char *_recent = aclGetRecentErrMsg(); \ - if (_recent != nullptr && _recent[0] != '\0') \ - std::fprintf(stderr, "[ERROR] RecentErrMsg: %s\n", _recent); \ - rc = 1; \ - goto cleanup; \ - } \ - } while (0) - -void LaunchVcmps_tail_kernel_2d(float *v1, unsigned char *v2, void *stream); - -int main() { - size_t elemCount_v1 = 64; - size_t fileSize_v1 = elemCount_v1 * sizeof(float); - size_t elemCount_v2 = 32; - size_t fileSize_v2 = elemCount_v2 * sizeof(unsigned char); - float *v1Host = nullptr; - float *v1Device = nullptr; - unsigned char *v2Host = nullptr; - unsigned char *v2Device = nullptr; - - int rc = 0; - bool aclInited = false; - bool deviceSet = false; - int deviceId = 0; - aclrtStream stream = nullptr; - - ACL_CHECK(aclInit(nullptr)); - aclInited = true; - if (const char *envDevice = std::getenv("ACL_DEVICE_ID")) - deviceId = std::atoi(envDevice); - ACL_CHECK(aclrtSetDevice(deviceId)); - deviceSet = true; - ACL_CHECK(aclrtCreateStream(&stream)); - - ACL_CHECK(aclrtMallocHost((void **)(&v1Host), fileSize_v1)); - ACL_CHECK(aclrtMallocHost((void **)(&v2Host), fileSize_v2)); - ACL_CHECK(aclrtMalloc((void **)&v1Device, fileSize_v1, ACL_MEM_MALLOC_HUGE_FIRST)); - ACL_CHECK(aclrtMalloc((void **)&v2Device, fileSize_v2, ACL_MEM_MALLOC_HUGE_FIRST)); - - ReadFile("./v1.bin", fileSize_v1, v1Host, fileSize_v1); - ReadFile("./v2.bin", fileSize_v2, v2Host, fileSize_v2); - ACL_CHECK(aclrtMemcpy(v1Device, fileSize_v1, v1Host, fileSize_v1, - ACL_MEMCPY_HOST_TO_DEVICE)); - ACL_CHECK(aclrtMemcpy(v2Device, fileSize_v2, v2Host, fileSize_v2, - ACL_MEMCPY_HOST_TO_DEVICE)); - - LaunchVcmps_tail_kernel_2d(v1Device, v2Device, stream); - - 
ACL_CHECK(aclrtSynchronizeStream(stream)); - ACL_CHECK(aclrtMemcpy(v2Host, fileSize_v2, v2Device, fileSize_v2, - ACL_MEMCPY_DEVICE_TO_HOST)); - WriteFile("./v2.bin", v2Host, fileSize_v2); - -cleanup: - aclrtFree(v1Device); - aclrtFree(v2Device); - aclrtFreeHost(v1Host); - aclrtFreeHost(v2Host); - if (stream != nullptr) - aclrtDestroyStream(stream); - if (deviceSet) - aclrtResetDevice(deviceId); - if (aclInited) - aclFinalize(); - return rc; -} diff --git a/test/vpto/cases/micro-op/compare-select/vsel-tail/compare.py b/test/vpto/cases/micro-op/compare-select/vsel-tail/compare.py deleted file mode 100755 index 9965cdb63..000000000 --- a/test/vpto/cases/micro-op/compare-select/vsel-tail/compare.py +++ /dev/null @@ -1,54 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) 2026 Huawei Technologies Co., Ltd. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. - -# case: micro-op/compare-select/vsel-tail -# family: compare-select -# target_ops: pto.vsel -# scenarios: core-f32, tail-mask -# NOTE: bulk-generated coverage skeleton. 
- -import os -import sys -import numpy as np - -LOGICAL_ELEMS = 40 - - -def compare_bin(golden_path, output_path, dtype, eps): - if not os.path.exists(golden_path) or not os.path.exists(output_path): - return False - golden = np.fromfile(golden_path, dtype=dtype) - output = np.fromfile(output_path, dtype=dtype) - if golden.shape != output.shape: - print(f"[ERROR] Shape mismatch: {golden.shape} vs {output.shape}") - return False - golden = golden[:LOGICAL_ELEMS] - output = output[:LOGICAL_ELEMS] - if not np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True): - diff = np.abs(golden.astype(np.float64) - output.astype(np.float64)) - idx = int(np.argmax(diff)) - print(f"[ERROR] Mismatch: idx={idx} golden={golden[idx]} out={output[idx]}") - return False - return True - - -def main(): - strict = os.getenv("COMPARE_STRICT", "1") != "0" - ok = compare_bin("golden_v3.bin", "v3.bin", np.float32, 1e-6) - if not ok: - if strict: - print("[ERROR] compare failed") - sys.exit(2) - print("[WARN] compare failed (non-gating)") - return - print("[INFO] compare passed") - - -if __name__ == "__main__": - main() diff --git a/test/vpto/cases/micro-op/compare-select/vsel-tail/golden.py b/test/vpto/cases/micro-op/compare-select/vsel-tail/golden.py deleted file mode 100644 index a2d6807fa..000000000 --- a/test/vpto/cases/micro-op/compare-select/vsel-tail/golden.py +++ /dev/null @@ -1,48 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) 2026 Huawei Technologies Co., Ltd. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. 
-# See LICENSE in the root of the software repository for the full text of the License. - - -import argparse -from pathlib import Path - -import numpy as np - - -LANES = 64 -SEED = 19 -LOGICAL_ELEMS = 40 -OUT_SENTINEL = np.float32(-123.25) - - -def generate(output_dir: Path, seed: int) -> None: - rng = np.random.default_rng(seed) - v1 = rng.uniform(-3.0, 3.0, size=(LANES,)).astype(np.float32) - v2 = rng.uniform(-3.0, 3.0, size=(LANES,)).astype(np.float32) - golden_v3 = np.full((LANES,), OUT_SENTINEL, dtype=np.float32) - flat = np.where(v1 > v2, v1, v2).astype(np.float32, copy=False) - golden_v3[:LOGICAL_ELEMS] = flat[:LOGICAL_ELEMS] - v3 = np.full((LANES,), OUT_SENTINEL, dtype=np.float32) - - output_dir.mkdir(parents=True, exist_ok=True) - v1.tofile(output_dir / "v1.bin") - v2.tofile(output_dir / "v2.bin") - v3.tofile(output_dir / "v3.bin") - golden_v3.tofile(output_dir / "golden_v3.bin") - - -def main() -> None: - parser = argparse.ArgumentParser(description="Generate inputs/golden for VPTO vsel-tail.") - parser.add_argument("--output-dir", type=Path, default=Path(".")) - parser.add_argument("--seed", type=int, default=SEED) - args = parser.parse_args() - generate(args.output_dir, args.seed) - - -if __name__ == "__main__": - main() diff --git a/test/vpto/cases/micro-op/compare-select/vsel-tail/kernel.pto b/test/vpto/cases/micro-op/compare-select/vsel-tail/kernel.pto deleted file mode 100644 index 2237e5305..000000000 --- a/test/vpto/cases/micro-op/compare-select/vsel-tail/kernel.pto +++ /dev/null @@ -1,54 +0,0 @@ -module attributes {pto.target_arch = "a5", pto.kernel_kind = #pto.kernel_kind} { - func.func @vsel_tail_kernel_2d(%arg0: !pto.ptr, %arg1: !pto.ptr, - %arg2: !pto.ptr) attributes {pto.kernel} { - %c0 = arith.constant 0 : index - %c1 = arith.constant 1 : index - %c0_i64 = arith.constant 0 : i64 - %c1_i64 = arith.constant 1 : i64 - %c40_i32 = arith.constant 40 : i32 - %c64_i64 = arith.constant 64 : i64 - %c256_i64 = arith.constant 256 : i64 - %c512_i64 = 
arith.constant 512 : i64 - %false = arith.constant false - - %ub_lhs = pto.castptr %c0_i64 : i64 -> !pto.ptr - %ub_rhs = pto.castptr %c256_i64 : i64 -> !pto.ptr - %ub_out = pto.castptr %c512_i64 : i64 -> !pto.ptr - pto.set_loop1_stride_outtoub %c256_i64, %c256_i64 : i64, i64 - pto.set_loop2_stride_outtoub %c256_i64, %c256_i64 : i64, i64 - pto.mte_gm_ub %arg0, %ub_lhs, %c0_i64, %c256_i64 - nburst(%c64_i64, %c256_i64, %c256_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 - pto.mte_gm_ub %arg1, %ub_rhs, %c0_i64, %c256_i64 - nburst(%c64_i64, %c256_i64, %c256_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 - pto.mte_gm_ub %arg2, %ub_out, %c0_i64, %c256_i64 - nburst(%c64_i64, %c256_i64, %c256_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 - - pto.set_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] - pto.wait_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] - - pto.vecscope { - %_:1 = scf.for %iv = %c0 to %c1 step %c1 iter_args(%remaining = %c40_i32) -> (i32) { - %active, %next = pto.plt_b32 %remaining : i32 -> !pto.mask, i32 - %lhs = pto.vlds %ub_lhs[%c0] : !pto.ptr -> !pto.vreg<64xf32> - %rhs = pto.vlds %ub_rhs[%c0] : !pto.ptr -> !pto.vreg<64xf32> - %pred = pto.vcmp %lhs, %rhs, %active, "gt" : !pto.vreg<64xf32>, !pto.vreg<64xf32>, !pto.mask -> !pto.mask - %out = pto.vsel %lhs, %rhs, %pred : !pto.vreg<64xf32>, !pto.vreg<64xf32>, !pto.mask -> !pto.vreg<64xf32> - pto.vsts %out, %ub_out[%c0], %active : !pto.vreg<64xf32>, !pto.ptr, !pto.mask - scf.yield %next : i32 - } - } - - pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.set_loop1_stride_ubtoout %c256_i64, %c256_i64 : i64, i64 - pto.set_loop2_stride_ubtoout %c256_i64, %c256_i64 : i64, i64 - pto.mte_ub_gm %ub_out, %arg2, %c256_i64 - nburst(%c64_i64, %c256_i64, %c256_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64 - pto.barrier #pto.pipe - return - } -} diff --git a/test/vpto/cases/micro-op/compare-select/vsel-tail/launch.cpp 
b/test/vpto/cases/micro-op/compare-select/vsel-tail/launch.cpp deleted file mode 100644 index b4e0598e0..000000000 --- a/test/vpto/cases/micro-op/compare-select/vsel-tail/launch.cpp +++ /dev/null @@ -1,50 +0,0 @@ -// Copyright (c) 2026 Huawei Technologies Co., Ltd. -// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). -// Please refer to the License for details. You may not use this file except in compliance with the License. -// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -// See LICENSE in the root of the software repository for the full text of the License. - -// --------------------------------------------------------------------------- -// PTOAS compatibility layer -// --------------------------------------------------------------------------- -#ifndef __VEC_SCOPE__ -#define __VEC_SCOPE__ -#endif - -#if defined(__CCE_AICORE__) && defined(__NPU_ARCH__) && (__NPU_ARCH__ == 2201) -typedef struct { unsigned char v; } hifloat8_t; -typedef struct { unsigned char v; } float8_e4m3_t; -typedef struct { unsigned char v; } float8_e5m2_t; -typedef struct { unsigned char v; } float8_e8m0_t; -typedef struct { unsigned char v; } float4_e1m2x2_t; -typedef struct { unsigned char v; } float4_e2m1x2_t; -#endif -#include - -#if defined(__CCE_AICORE__) && defined(PTOAS_ENABLE_CCE_PRINT) -#include -#endif - -#if !defined(__CCE_AICORE__) && !defined(TMRGSORT_HPP) -struct MrgSortExecutedNumList { - uint16_t mrgSortList0; - uint16_t mrgSortList1; - uint16_t mrgSortList2; - uint16_t mrgSortList3; -}; -#endif -#ifndef __CPU_SIM -#include "acl/acl.h" -#endif - -extern "C" __global__ [aicore] void vsel_tail_kernel_2d(__gm__ float *v1, - __gm__ float *v2, - __gm__ float *v3); - -void 
LaunchVsel_tail_kernel_2d(float *v1, float *v2, float *v3, void *stream) { - vsel_tail_kernel_2d<<<1, nullptr, stream>>>((__gm__ float *)v1, - (__gm__ float *)v2, - (__gm__ float *)v3); -} diff --git a/test/vpto/cases/micro-op/compare-select/vsel-tail/main.cpp b/test/vpto/cases/micro-op/compare-select/vsel-tail/main.cpp deleted file mode 100644 index 323131056..000000000 --- a/test/vpto/cases/micro-op/compare-select/vsel-tail/main.cpp +++ /dev/null @@ -1,102 +0,0 @@ -// Copyright (c) 2026 Huawei Technologies Co., Ltd. -// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). -// Please refer to the License for details. You may not use this file except in compliance with the License. -// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -// See LICENSE in the root of the software repository for the full text of the License. - -/** -Copyright (c) 2025 Huawei Technologies Co., Ltd. 
-*/ - -#include "test_common.h" -#include "acl/acl.h" -#include -#include - -using namespace PtoTestCommon; - -#define ACL_CHECK(expr) \ - do { \ - const aclError _ret = (expr); \ - if (_ret != ACL_SUCCESS) { \ - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", #expr, \ - (int)_ret, __FILE__, __LINE__); \ - const char *_recent = aclGetRecentErrMsg(); \ - if (_recent != nullptr && _recent[0] != '\0') \ - std::fprintf(stderr, "[ERROR] RecentErrMsg: %s\n", _recent); \ - rc = 1; \ - goto cleanup; \ - } \ - } while (0) - -void LaunchVsel_tail_kernel_2d(float *v1, float *v2, float *v3, void *stream); - -int main() { - size_t elemCount_v1 = 64; - size_t fileSize_v1 = elemCount_v1 * sizeof(float); - size_t elemCount_v2 = 64; - size_t fileSize_v2 = elemCount_v2 * sizeof(float); - size_t elemCount_v3 = 64; - size_t fileSize_v3 = elemCount_v3 * sizeof(float); - float *v1Host = nullptr; - float *v1Device = nullptr; - float *v2Host = nullptr; - float *v2Device = nullptr; - float *v3Host = nullptr; - float *v3Device = nullptr; - - int rc = 0; - bool aclInited = false; - bool deviceSet = false; - int deviceId = 0; - aclrtStream stream = nullptr; - - ACL_CHECK(aclInit(nullptr)); - aclInited = true; - if (const char *envDevice = std::getenv("ACL_DEVICE_ID")) - deviceId = std::atoi(envDevice); - ACL_CHECK(aclrtSetDevice(deviceId)); - deviceSet = true; - ACL_CHECK(aclrtCreateStream(&stream)); - - ACL_CHECK(aclrtMallocHost((void **)(&v1Host), fileSize_v1)); - ACL_CHECK(aclrtMallocHost((void **)(&v2Host), fileSize_v2)); - ACL_CHECK(aclrtMallocHost((void **)(&v3Host), fileSize_v3)); - ACL_CHECK(aclrtMalloc((void **)&v1Device, fileSize_v1, ACL_MEM_MALLOC_HUGE_FIRST)); - ACL_CHECK(aclrtMalloc((void **)&v2Device, fileSize_v2, ACL_MEM_MALLOC_HUGE_FIRST)); - ACL_CHECK(aclrtMalloc((void **)&v3Device, fileSize_v3, ACL_MEM_MALLOC_HUGE_FIRST)); - - ReadFile("./v1.bin", fileSize_v1, v1Host, fileSize_v1); - ReadFile("./v2.bin", fileSize_v2, v2Host, fileSize_v2); - ReadFile("./v3.bin", 
fileSize_v3, v3Host, fileSize_v3); - ACL_CHECK(aclrtMemcpy(v1Device, fileSize_v1, v1Host, fileSize_v1, - ACL_MEMCPY_HOST_TO_DEVICE)); - ACL_CHECK(aclrtMemcpy(v2Device, fileSize_v2, v2Host, fileSize_v2, - ACL_MEMCPY_HOST_TO_DEVICE)); - ACL_CHECK(aclrtMemcpy(v3Device, fileSize_v3, v3Host, fileSize_v3, - ACL_MEMCPY_HOST_TO_DEVICE)); - - LaunchVsel_tail_kernel_2d(v1Device, v2Device, v3Device, stream); - - ACL_CHECK(aclrtSynchronizeStream(stream)); - ACL_CHECK(aclrtMemcpy(v3Host, fileSize_v3, v3Device, fileSize_v3, - ACL_MEMCPY_DEVICE_TO_HOST)); - WriteFile("./v3.bin", v3Host, fileSize_v3); - -cleanup: - aclrtFree(v1Device); - aclrtFree(v2Device); - aclrtFree(v3Device); - aclrtFreeHost(v1Host); - aclrtFreeHost(v2Host); - aclrtFreeHost(v3Host); - if (stream != nullptr) - aclrtDestroyStream(stream); - if (deviceSet) - aclrtResetDevice(deviceId); - if (aclInited) - aclFinalize(); - return rc; -} diff --git a/test/vpto/cases/micro-op/compare-select/vsel/kernel.pto b/test/vpto/cases/micro-op/compare-select/vsel/kernel.pto index c8ee34b29..12a4b6506 100644 --- a/test/vpto/cases/micro-op/compare-select/vsel/kernel.pto +++ b/test/vpto/cases/micro-op/compare-select/vsel/kernel.pto @@ -1,51 +1,107 @@ module attributes {pto.target_arch = "a5", pto.kernel_kind = #pto.kernel_kind} { - func.func @vsel_kernel_2d(%arg0: !pto.ptr, %arg1: !pto.ptr, - %arg2: !pto.ptr) attributes {pto.kernel} { - %c0 = arith.constant 0 : index - %c1 = arith.constant 1 : index - %c0_i64 = arith.constant 0 : i64 - %c1_i64 = arith.constant 1 : i64 - %c64_i32 = arith.constant 64 : i32 - %c64_i64 = arith.constant 64 : i64 - %c256_i64 = arith.constant 256 : i64 - %c512_i64 = arith.constant 512 : i64 - %false = arith.constant false - - %ub_lhs = pto.castptr %c0_i64 : i64 -> !pto.ptr - %ub_rhs = pto.castptr %c256_i64 : i64 -> !pto.ptr - %ub_out = pto.castptr %c512_i64 : i64 -> !pto.ptr - pto.set_loop1_stride_outtoub %c256_i64, %c256_i64 : i64, i64 - pto.set_loop2_stride_outtoub %c256_i64, %c256_i64 : i64, i64 - 
pto.mte_gm_ub %arg0, %ub_lhs, %c0_i64, %c256_i64 - nburst(%c64_i64, %c256_i64, %c256_i64) + func.func @vsel_deep_merged_kernel(%arg0: !pto.ptr, %arg1: !pto.ptr, %arg2: !pto.ptr, %arg3: !pto.ptr, %arg4: !pto.ptr, %arg5: !pto.ptr) attributes {pto.kernel} { + %__deep_merge_guard = arith.constant false + // inactive merged from vsel_tail_kernel_2d + scf.if %__deep_merge_guard { + + %c0_m0 = arith.constant 0 : index + %c1_m0 = arith.constant 1 : index + %c0_i64_m0 = arith.constant 0 : i64 + %c1_i64_m0 = arith.constant 1 : i64 + %c40_i32_m0 = arith.constant 40 : i32 + %c64_i64_m0 = arith.constant 64 : i64 + %c256_i64_m0 = arith.constant 256 : i64 + %c512_i64_m0 = arith.constant 512 : i64 + %false_m0 = arith.constant false + + %ub_lhs_m0 = pto.castptr %c0_i64_m0 : i64 -> !pto.ptr + %ub_rhs_m0 = pto.castptr %c256_i64_m0 : i64 -> !pto.ptr + %ub_out_m0 = pto.castptr %c512_i64_m0 : i64 -> !pto.ptr + pto.set_loop1_stride_outtoub %c256_i64_m0, %c256_i64_m0 : i64, i64 + pto.set_loop2_stride_outtoub %c256_i64_m0, %c256_i64_m0 : i64, i64 + pto.mte_gm_ub %arg0, %ub_lhs_m0, %c0_i64_m0, %c256_i64_m0 + nburst(%c64_i64_m0, %c256_i64_m0, %c256_i64_m0) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 + pto.mte_gm_ub %arg1, %ub_rhs_m0, %c0_i64_m0, %c256_i64_m0 + nburst(%c64_i64_m0, %c256_i64_m0, %c256_i64_m0) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 + pto.mte_gm_ub %arg2, %ub_out_m0, %c0_i64_m0, %c256_i64_m0 + nburst(%c64_i64_m0, %c256_i64_m0, %c256_i64_m0) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 + + pto.set_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] + pto.wait_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] + + pto.vecscope { + %__m0:1 = scf.for %iv_m0 = %c0_m0 to %c1_m0 step %c1_m0 iter_args(%remaining_m0 = %c40_i32_m0) -> (i32) { + %active_m0, %next_m0 = pto.plt_b32 %remaining_m0 : i32 -> !pto.mask, i32 + %lhs_m0 = pto.vlds %ub_lhs_m0[%c0_m0] : !pto.ptr -> !pto.vreg<64xf32> + %rhs_m0 = pto.vlds %ub_rhs_m0[%c0_m0] : !pto.ptr -> !pto.vreg<64xf32> + %pred_m0 = pto.vcmp %lhs_m0, 
%rhs_m0, %active_m0, "gt" : !pto.vreg<64xf32>, !pto.vreg<64xf32>, !pto.mask -> !pto.mask + %out_m0 = pto.vsel %lhs_m0, %rhs_m0, %pred_m0 : !pto.vreg<64xf32>, !pto.vreg<64xf32>, !pto.mask -> !pto.vreg<64xf32> + pto.vsts %out_m0, %ub_out_m0[%c0_m0], %active_m0 : !pto.vreg<64xf32>, !pto.ptr, !pto.mask + scf.yield %next_m0 : i32 + } + } + + pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] + pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] + pto.set_loop1_stride_ubtoout %c256_i64_m0, %c256_i64_m0 : i64, i64 + pto.set_loop2_stride_ubtoout %c256_i64_m0, %c256_i64_m0 : i64, i64 + pto.mte_ub_gm %ub_out_m0, %arg2, %c256_i64_m0 + nburst(%c64_i64_m0, %c256_i64_m0, %c256_i64_m0) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64 + pto.barrier #pto.pipe + + } + // active merged from vsel_kernel_2d + + %c0_m1 = arith.constant 0 : index + %c1_m1 = arith.constant 1 : index + %c0_i64_m1 = arith.constant 0 : i64 + %c1_i64_m1 = arith.constant 1 : i64 + %c64_i32_m1 = arith.constant 64 : i32 + %c64_i64_m1 = arith.constant 64 : i64 + %c256_i64_m1 = arith.constant 256 : i64 + %c512_i64_m1 = arith.constant 512 : i64 + %false_m1 = arith.constant false + + %ub_lhs_m1 = pto.castptr %c0_i64_m1 : i64 -> !pto.ptr + %ub_rhs_m1 = pto.castptr %c256_i64_m1 : i64 -> !pto.ptr + %ub_out_m1 = pto.castptr %c512_i64_m1 : i64 -> !pto.ptr + pto.set_loop1_stride_outtoub %c256_i64_m1, %c256_i64_m1 : i64, i64 + pto.set_loop2_stride_outtoub %c256_i64_m1, %c256_i64_m1 : i64, i64 + pto.mte_gm_ub %arg3, %ub_lhs_m1, %c0_i64_m1, %c256_i64_m1 + nburst(%c64_i64_m1, %c256_i64_m1, %c256_i64_m1) : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 - pto.mte_gm_ub %arg1, %ub_rhs, %c0_i64, %c256_i64 - nburst(%c64_i64, %c256_i64, %c256_i64) + pto.mte_gm_ub %arg4, %ub_rhs_m1, %c0_i64_m1, %c256_i64_m1 + nburst(%c64_i64_m1, %c256_i64_m1, %c256_i64_m1) : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 pto.set_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] pto.wait_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] pto.vecscope { - %_:1 = scf.for %iv = %c0 to 
%c1 step %c1 iter_args(%remaining = %c64_i32) -> (i32) { - %active, %next = pto.plt_b32 %remaining : i32 -> !pto.mask, i32 - %lhs = pto.vlds %ub_lhs[%c0] : !pto.ptr -> !pto.vreg<64xf32> - %rhs = pto.vlds %ub_rhs[%c0] : !pto.ptr -> !pto.vreg<64xf32> - %pred = pto.vcmp %lhs, %rhs, %active, "gt" : !pto.vreg<64xf32>, !pto.vreg<64xf32>, !pto.mask -> !pto.mask - %out = pto.vsel %lhs, %rhs, %pred : !pto.vreg<64xf32>, !pto.vreg<64xf32>, !pto.mask -> !pto.vreg<64xf32> - pto.vsts %out, %ub_out[%c0], %active : !pto.vreg<64xf32>, !pto.ptr, !pto.mask - scf.yield %next : i32 + %__m1:1 = scf.for %iv_m1 = %c0_m1 to %c1_m1 step %c1_m1 iter_args(%remaining_m1 = %c64_i32_m1) -> (i32) { + %active_m1, %next_m1 = pto.plt_b32 %remaining_m1 : i32 -> !pto.mask, i32 + %lhs_m1 = pto.vlds %ub_lhs_m1[%c0_m1] : !pto.ptr -> !pto.vreg<64xf32> + %rhs_m1 = pto.vlds %ub_rhs_m1[%c0_m1] : !pto.ptr -> !pto.vreg<64xf32> + %pred_m1 = pto.vcmp %lhs_m1, %rhs_m1, %active_m1, "gt" : !pto.vreg<64xf32>, !pto.vreg<64xf32>, !pto.mask -> !pto.mask + %out_m1 = pto.vsel %lhs_m1, %rhs_m1, %pred_m1 : !pto.vreg<64xf32>, !pto.vreg<64xf32>, !pto.mask -> !pto.vreg<64xf32> + pto.vsts %out_m1, %ub_out_m1[%c0_m1], %active_m1 : !pto.vreg<64xf32>, !pto.ptr, !pto.mask + scf.yield %next_m1 : i32 } } pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.set_loop1_stride_ubtoout %c256_i64, %c256_i64 : i64, i64 - pto.set_loop2_stride_ubtoout %c256_i64, %c256_i64 : i64, i64 - pto.mte_ub_gm %ub_out, %arg2, %c256_i64 - nburst(%c64_i64, %c256_i64, %c256_i64) + pto.set_loop1_stride_ubtoout %c256_i64_m1, %c256_i64_m1 : i64, i64 + pto.set_loop2_stride_ubtoout %c256_i64_m1, %c256_i64_m1 : i64, i64 + pto.mte_ub_gm %ub_out_m1, %arg5, %c256_i64_m1 + nburst(%c64_i64_m1, %c256_i64_m1, %c256_i64_m1) : !pto.ptr, !pto.ptr, i64, i64, i64, i64 pto.barrier #pto.pipe + return } } diff --git a/test/vpto/cases/micro-op/compare-select/vsel/launch.cpp 
b/test/vpto/cases/micro-op/compare-select/vsel/launch.cpp index 269405dee..2070538fc 100644 --- a/test/vpto/cases/micro-op/compare-select/vsel/launch.cpp +++ b/test/vpto/cases/micro-op/compare-select/vsel/launch.cpp @@ -5,14 +5,9 @@ // THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, // INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. // See LICENSE in the root of the software repository for the full text of the License. - -// --------------------------------------------------------------------------- -// PTOAS compatibility layer -// --------------------------------------------------------------------------- #ifndef __VEC_SCOPE__ #define __VEC_SCOPE__ #endif - #if defined(__CCE_AICORE__) && defined(__NPU_ARCH__) && (__NPU_ARCH__ == 2201) typedef struct { unsigned char v; } hifloat8_t; typedef struct { unsigned char v; } float8_e4m3_t; @@ -22,29 +17,30 @@ typedef struct { unsigned char v; } float4_e1m2x2_t; typedef struct { unsigned char v; } float4_e2m1x2_t; #endif #include - #if defined(__CCE_AICORE__) && defined(PTOAS_ENABLE_CCE_PRINT) #include #endif - #if !defined(__CCE_AICORE__) && !defined(TMRGSORT_HPP) -struct MrgSortExecutedNumList { - uint16_t mrgSortList0; - uint16_t mrgSortList1; - uint16_t mrgSortList2; - uint16_t mrgSortList3; -}; +struct MrgSortExecutedNumList { uint16_t mrgSortList0,mrgSortList1,mrgSortList2,mrgSortList3; }; #endif #ifndef __CPU_SIM #include "acl/acl.h" #endif -extern "C" __global__ [aicore] void vsel_kernel_2d(__gm__ float *v1, - __gm__ float *v2, - __gm__ float *v3); +extern "C" __global__ [aicore] void vsel_deep_merged_kernel( + __gm__ float * arg0, + __gm__ float * arg1, + __gm__ float * arg2, + __gm__ float * arg3, + __gm__ float * arg4, + __gm__ float * arg5); -void LaunchVsel_kernel_2d(float *v1, float *v2, float *v3, void *stream) { - vsel_kernel_2d<<<1, nullptr, stream>>>((__gm__ float *)v1, - (__gm__ float *)v2, - 
(__gm__ float *)v3); +void LaunchVselDeepMerged(float * p0, float * p1, float * p2, void *stream) { + vsel_deep_merged_kernel<<<1, nullptr, stream>>>( + (__gm__ float *)p0, + (__gm__ float *)p0, + (__gm__ float *)p0, + (__gm__ float *)p0, + (__gm__ float *)p1, + (__gm__ float *)p2); } diff --git a/test/vpto/cases/micro-op/compare-select/vsel/main.cpp b/test/vpto/cases/micro-op/compare-select/vsel/main.cpp index cf71eb295..e5ddef090 100644 --- a/test/vpto/cases/micro-op/compare-select/vsel/main.cpp +++ b/test/vpto/cases/micro-op/compare-select/vsel/main.cpp @@ -31,8 +31,8 @@ using namespace PtoTestCommon; } \ } while (0) -void LaunchVsel_kernel_2d(float *v1, float *v2, float *v3, void *stream); +void LaunchVselDeepMerged(float * p0, float * p1, float * p2, void *stream); int main() { size_t elemCount_v1 = 64; size_t fileSize_v1 = elemCount_v1 * sizeof(float); @@ -78,7 +78,7 @@ int main() { ACL_CHECK(aclrtMemcpy(v3Device, fileSize_v3, v3Host, fileSize_v3, ACL_MEMCPY_HOST_TO_DEVICE)); - LaunchVsel_kernel_2d(v1Device, v2Device, v3Device, stream); + LaunchVselDeepMerged(v1Device, v2Device, v3Device, stream); ACL_CHECK(aclrtSynchronizeStream(stream)); ACL_CHECK(aclrtMemcpy(v3Host, fileSize_v3, v3Device, fileSize_v3, diff --git a/test/vpto/cases/micro-op/compare-select/vselr-f16/compare.py b/test/vpto/cases/micro-op/compare-select/vselr-f16/compare.py deleted file mode 100644 index b961a3713..000000000 --- a/test/vpto/cases/micro-op/compare-select/vselr-f16/compare.py +++ /dev/null @@ -1,49 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) 2026 Huawei Technologies Co., Ltd. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. 
-# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. - -# case: micro-op/compare-select/vselr-f16 -# family: compare-select -# target_ops: pto.vselr -# scenarios: core-f16, full-mask, explicit-lane-index - -import os -import sys - -import numpy as np - - -def compare_tensor(golden_path: str, output_path: str) -> bool: - if not os.path.exists(golden_path) or not os.path.exists(output_path): - return False - golden = np.fromfile(golden_path, dtype=np.float16) - output = np.fromfile(output_path, dtype=np.float16) - if golden.shape != output.shape: - return False - if not np.allclose(golden, output, rtol=0.0, atol=0.0, equal_nan=True): - diff = np.nonzero(golden != output)[0] - idx = int(diff[0]) if diff.size else 0 - print(f"[ERROR] Mismatch: idx={idx} golden={golden[idx]} out={output[idx]}") - return False - return True - - -def main() -> None: - strict = os.getenv("COMPARE_STRICT", "1") != "0" - ok = compare_tensor("golden_v3.bin", "v3.bin") - if not ok: - if strict: - print("[ERROR] compare failed") - sys.exit(2) - print("[WARN] compare failed (non-gating)") - return - print("[INFO] compare passed") - - -if __name__ == "__main__": - main() diff --git a/test/vpto/cases/micro-op/compare-select/vselr-f16/golden.py b/test/vpto/cases/micro-op/compare-select/vselr-f16/golden.py deleted file mode 100644 index ae0513ecf..000000000 --- a/test/vpto/cases/micro-op/compare-select/vselr-f16/golden.py +++ /dev/null @@ -1,52 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) 2026 Huawei Technologies Co., Ltd. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. 
You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. - -# case: micro-op/compare-select/vselr-f16 -# family: compare-select -# target_ops: pto.vselr -# scenarios: core-f16, full-mask, explicit-lane-index - -import argparse -from pathlib import Path - -import numpy as np - - -ROWS = 8 -COLS = 128 -SEED = 23 - - -def generate(output_dir: Path, seed: int) -> None: - rng = np.random.default_rng(seed) - src = rng.uniform(-6.0, 6.0, size=(ROWS, COLS)).astype(np.float16, copy=False) - lane_ids = np.arange(COLS, dtype=np.uint16) - idx = np.empty((ROWS, COLS), dtype=np.uint16) - for row in range(ROWS): - idx[row] = (lane_ids[::-1] + row * 11 + (lane_ids % 7) * 3) % COLS - golden = np.take_along_axis(src, idx.astype(np.int64, copy=False), axis=1).astype(np.float16, copy=False) - out = np.zeros((ROWS, COLS), dtype=np.float16) - - output_dir.mkdir(parents=True, exist_ok=True) - src.view(np.uint16).reshape(-1).tofile(output_dir / "v1.bin") - idx.reshape(-1).tofile(output_dir / "v2.bin") - out.view(np.uint16).reshape(-1).tofile(output_dir / "v3.bin") - golden.view(np.uint16).reshape(-1).tofile(output_dir / "golden_v3.bin") - - -def main() -> None: - parser = argparse.ArgumentParser(description="Generate inputs/golden for vselr-f16.") - parser.add_argument("--output-dir", type=Path, default=Path(".")) - parser.add_argument("--seed", type=int, default=SEED) - args = parser.parse_args() - generate(args.output_dir, args.seed) - - -if __name__ == "__main__": - main() diff --git a/test/vpto/cases/micro-op/compare-select/vselr-f16/kernel.pto b/test/vpto/cases/micro-op/compare-select/vselr-f16/kernel.pto deleted file mode 100644 index 0c9bf3d51..000000000 --- 
a/test/vpto/cases/micro-op/compare-select/vselr-f16/kernel.pto +++ /dev/null @@ -1,54 +0,0 @@ -// ----------------------------------------------------------------------------- -// case: micro-op/compare-select/vselr-f16 -// family: compare-select -// target_ops: pto.vselr -// scenarios: core-f16, full-mask, explicit-lane-index -// ----------------------------------------------------------------------------- -module attributes {pto.target_arch = "a5", pto.kernel_kind = #pto.kernel_kind} { - func.func @vselr_f16_kernel_2d(%arg0: !pto.ptr, %arg1: !pto.ptr, %arg2: !pto.ptr) attributes {pto.kernel} { - %c0 = arith.constant 0 : index - %c1 = arith.constant 1 : index - %c128 = arith.constant 128 : index - %c1024 = arith.constant 1024 : index - %c0_i64 = arith.constant 0 : i64 - %c1_i64 = arith.constant 1 : i64 - %c8_i64 = arith.constant 8 : i64 - %c256_i64 = arith.constant 256 : i64 - %c2048_i64 = arith.constant 2048 : i64 - %c4096_i64 = arith.constant 4096 : i64 - %c1024_i32 = arith.constant 1024 : i32 - %false = arith.constant false - - %ub_src = pto.castptr %c0_i64 : i64 -> !pto.ptr - %ub_idx = pto.castptr %c2048_i64 : i64 -> !pto.ptr - %ub_out = pto.castptr %c4096_i64 : i64 -> !pto.ptr - pto.mte_gm_ub %arg0, %ub_src, %c0_i64, %c256_i64 - nburst(%c8_i64, %c256_i64, %c256_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 - pto.mte_gm_ub %arg1, %ub_idx, %c0_i64, %c256_i64 - nburst(%c8_i64, %c256_i64, %c256_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 - - pto.set_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] - pto.wait_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] - - pto.vecscope { - %_:1 = scf.for %offset = %c0 to %c1024 step %c128 iter_args(%remaining = %c1024_i32) -> (i32) { - %mask, %next_remaining = pto.plt_b16 %remaining : i32 -> !pto.mask, i32 - %src = pto.vlds %ub_src[%offset] : !pto.ptr -> !pto.vreg<128xf16> - %idx = pto.vlds %ub_idx[%offset] : !pto.ptr -> !pto.vreg<128xui16> - %out = pto.vselr %src, %idx : !pto.vreg<128xf16>, !pto.vreg<128xui16> -> 
!pto.vreg<128xf16> - pto.vsts %out, %ub_out[%offset], %mask : !pto.vreg<128xf16>, !pto.ptr, !pto.mask - scf.yield %next_remaining : i32 - } - } - - pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.mte_ub_gm %ub_out, %arg2, %c256_i64 - nburst(%c8_i64, %c256_i64, %c256_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64 - pto.barrier #pto.pipe - return - } -} diff --git a/test/vpto/cases/micro-op/compare-select/vselr-f16/launch.cpp b/test/vpto/cases/micro-op/compare-select/vselr-f16/launch.cpp deleted file mode 100644 index f00e5672d..000000000 --- a/test/vpto/cases/micro-op/compare-select/vselr-f16/launch.cpp +++ /dev/null @@ -1,54 +0,0 @@ -// Copyright (c) 2026 Huawei Technologies Co., Ltd. -// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). -// Please refer to the License for details. You may not use this file except in compliance with the License. -// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -// See LICENSE in the root of the software repository for the full text of the License. 
- -// ----------------------------------------------------------------------------- -// case: micro-op/compare-select/vselr-f16 -// family: compare-select -// target_ops: pto.vselr -// scenarios: core-f16, full-mask, explicit-lane-index -// ----------------------------------------------------------------------------- -#ifndef __VEC_SCOPE__ -#define __VEC_SCOPE__ -#endif - -#if defined(__CCE_AICORE__) && defined(__NPU_ARCH__) && (__NPU_ARCH__ == 2201) -typedef struct { unsigned char v; } hifloat8_t; -typedef struct { unsigned char v; } float8_e4m3_t; -typedef struct { unsigned char v; } float8_e5m2_t; -typedef struct { unsigned char v; } float8_e8m0_t; -typedef struct { unsigned char v; } float4_e1m2x2_t; -typedef struct { unsigned char v; } float4_e2m1x2_t; -#endif -#include - -#if defined(__CCE_AICORE__) && defined(PTOAS_ENABLE_CCE_PRINT) -#include -#endif - -#if !defined(__CCE_AICORE__) && !defined(TMRGSORT_HPP) -struct MrgSortExecutedNumList { - uint16_t mrgSortList0; - uint16_t mrgSortList1; - uint16_t mrgSortList2; - uint16_t mrgSortList3; -}; -#endif -#ifndef __CPU_SIM -#include "acl/acl.h" -#endif - -extern "C" __global__ [aicore] void vselr_f16_kernel_2d(__gm__ half *v1, - __gm__ uint16_t *v2, - __gm__ half *v3); - -void LaunchVselr_f16_kernel_2d(uint16_t *v1, uint16_t *v2, uint16_t *v3, - void *stream) { - vselr_f16_kernel_2d<<<1, nullptr, stream>>>((__gm__ half *)v1, - (__gm__ uint16_t *)v2, - (__gm__ half *)v3); -} diff --git a/test/vpto/cases/micro-op/compare-select/vselr-f16/main.cpp b/test/vpto/cases/micro-op/compare-select/vselr-f16/main.cpp deleted file mode 100644 index 2002d9ee5..000000000 --- a/test/vpto/cases/micro-op/compare-select/vselr-f16/main.cpp +++ /dev/null @@ -1,106 +0,0 @@ -// Copyright (c) 2026 Huawei Technologies Co., Ltd. -// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). 
-// Please refer to the License for details. You may not use this file except in compliance with the License. -// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -// See LICENSE in the root of the software repository for the full text of the License. - -// ----------------------------------------------------------------------------- -// case: micro-op/compare-select/vselr-f16 -// family: compare-select -// target_ops: pto.vselr -// scenarios: core-f16, full-mask, explicit-lane-index -// ----------------------------------------------------------------------------- -#include "test_common.h" -#include "acl/acl.h" -#include -#include -#include - -using namespace PtoTestCommon; - -#define ACL_CHECK(expr) \ - do { \ - const aclError _ret = (expr); \ - if (_ret != ACL_SUCCESS) { \ - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", #expr, \ - (int)_ret, __FILE__, __LINE__); \ - const char *_recent = aclGetRecentErrMsg(); \ - if (_recent != nullptr && _recent[0] != '\0') \ - std::fprintf(stderr, "[ERROR] RecentErrMsg: %s\n", _recent); \ - rc = 1; \ - goto cleanup; \ - } \ - } while (0) - -void LaunchVselr_f16_kernel_2d(uint16_t *v1, uint16_t *v2, uint16_t *v3, - void *stream); - -int main() { - size_t elemCount_v1 = 1024; - size_t fileSize_v1 = elemCount_v1 * sizeof(uint16_t); - size_t elemCount_v2 = 1024; - size_t fileSize_v2 = elemCount_v2 * sizeof(uint16_t); - size_t elemCount_v3 = 1024; - size_t fileSize_v3 = elemCount_v3 * sizeof(uint16_t); - uint16_t *v1Host = nullptr; - uint16_t *v1Device = nullptr; - uint16_t *v2Host = nullptr; - uint16_t *v2Device = nullptr; - uint16_t *v3Host = nullptr; - uint16_t *v3Device = nullptr; - - int rc = 0; - bool aclInited = false; - bool deviceSet = false; - int deviceId = 0; - aclrtStream stream = nullptr; - - ACL_CHECK(aclInit(nullptr)); - aclInited = true; - if 
(const char *envDevice = std::getenv("ACL_DEVICE_ID")) - deviceId = std::atoi(envDevice); - ACL_CHECK(aclrtSetDevice(deviceId)); - deviceSet = true; - ACL_CHECK(aclrtCreateStream(&stream)); - - ACL_CHECK(aclrtMallocHost((void **)(&v1Host), fileSize_v1)); - ACL_CHECK(aclrtMallocHost((void **)(&v2Host), fileSize_v2)); - ACL_CHECK(aclrtMallocHost((void **)(&v3Host), fileSize_v3)); - ACL_CHECK(aclrtMalloc((void **)&v1Device, fileSize_v1, ACL_MEM_MALLOC_HUGE_FIRST)); - ACL_CHECK(aclrtMalloc((void **)&v2Device, fileSize_v2, ACL_MEM_MALLOC_HUGE_FIRST)); - ACL_CHECK(aclrtMalloc((void **)&v3Device, fileSize_v3, ACL_MEM_MALLOC_HUGE_FIRST)); - - ReadFile("./v1.bin", fileSize_v1, v1Host, fileSize_v1); - ReadFile("./v2.bin", fileSize_v2, v2Host, fileSize_v2); - ReadFile("./v3.bin", fileSize_v3, v3Host, fileSize_v3); - ACL_CHECK(aclrtMemcpy(v1Device, fileSize_v1, v1Host, fileSize_v1, - ACL_MEMCPY_HOST_TO_DEVICE)); - ACL_CHECK(aclrtMemcpy(v2Device, fileSize_v2, v2Host, fileSize_v2, - ACL_MEMCPY_HOST_TO_DEVICE)); - ACL_CHECK(aclrtMemcpy(v3Device, fileSize_v3, v3Host, fileSize_v3, - ACL_MEMCPY_HOST_TO_DEVICE)); - - LaunchVselr_f16_kernel_2d(v1Device, v2Device, v3Device, stream); - - ACL_CHECK(aclrtSynchronizeStream(stream)); - ACL_CHECK(aclrtMemcpy(v3Host, fileSize_v3, v3Device, fileSize_v3, - ACL_MEMCPY_DEVICE_TO_HOST)); - WriteFile("./v3.bin", v3Host, fileSize_v3); - -cleanup: - aclrtFree(v1Device); - aclrtFree(v2Device); - aclrtFree(v3Device); - aclrtFreeHost(v1Host); - aclrtFreeHost(v2Host); - aclrtFreeHost(v3Host); - if (stream != nullptr) - aclrtDestroyStream(stream); - if (deviceSet) - aclrtResetDevice(deviceId); - if (aclInited) - aclFinalize(); - return rc; -} diff --git a/test/vpto/cases/micro-op/compare-select/vselr-u8/compare.py b/test/vpto/cases/micro-op/compare-select/vselr-u8/compare.py deleted file mode 100644 index d48e1a42e..000000000 --- a/test/vpto/cases/micro-op/compare-select/vselr-u8/compare.py +++ /dev/null @@ -1,49 +0,0 @@ -#!/usr/bin/env python3 -# 
Copyright (c) 2026 Huawei Technologies Co., Ltd. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. - -# case: micro-op/compare-select/vselr-u8 -# family: compare-select -# target_ops: pto.vselr -# scenarios: core-u8, full-mask, explicit-lane-index - -import os -import sys - -import numpy as np - - -def compare_tensor(golden_path: str, output_path: str) -> bool: - if not os.path.exists(golden_path) or not os.path.exists(output_path): - return False - golden = np.fromfile(golden_path, dtype=np.uint8) - output = np.fromfile(output_path, dtype=np.uint8) - if golden.shape != output.shape: - return False - if not np.array_equal(golden, output): - diff = np.nonzero(golden != output)[0] - idx = int(diff[0]) if diff.size else 0 - print(f"[ERROR] Mismatch: idx={idx} golden={int(golden[idx])} out={int(output[idx])}") - return False - return True - - -def main() -> None: - strict = os.getenv("COMPARE_STRICT", "1") != "0" - ok = compare_tensor("golden_v3.bin", "v3.bin") - if not ok: - if strict: - print("[ERROR] compare failed") - sys.exit(2) - print("[WARN] compare failed (non-gating)") - return - print("[INFO] compare passed") - - -if __name__ == "__main__": - main() diff --git a/test/vpto/cases/micro-op/compare-select/vselr-u8/golden.py b/test/vpto/cases/micro-op/compare-select/vselr-u8/golden.py deleted file mode 100644 index 2cb03404a..000000000 --- a/test/vpto/cases/micro-op/compare-select/vselr-u8/golden.py +++ /dev/null @@ -1,53 +0,0 @@ 
-#!/usr/bin/env python3 -# Copyright (c) 2026 Huawei Technologies Co., Ltd. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. - -# case: micro-op/compare-select/vselr-u8 -# family: compare-select -# target_ops: pto.vselr -# scenarios: core-u8, full-mask, explicit-lane-index - -import argparse -from pathlib import Path - -import numpy as np - - -ROWS = 4 -COLS = 256 -SEED = 29 - - -def generate(output_dir: Path, seed: int) -> None: - rng = np.random.default_rng(seed) - src = rng.integers(0, 256, size=(ROWS, COLS), dtype=np.uint8) - lane_ids = np.arange(COLS, dtype=np.uint16) - idx = np.empty((ROWS, COLS), dtype=np.uint8) - for row in range(ROWS): - row_idx = (lane_ids[::-1] + row * 19 + (lane_ids % 13) * 5) % COLS - idx[row] = row_idx.astype(np.uint8, copy=False) - golden = np.take_along_axis(src, idx.astype(np.int64, copy=False), axis=1).astype(np.uint8, copy=False) - out = np.zeros((ROWS, COLS), dtype=np.uint8) - - output_dir.mkdir(parents=True, exist_ok=True) - src.reshape(-1).tofile(output_dir / "v1.bin") - idx.reshape(-1).tofile(output_dir / "v2.bin") - out.reshape(-1).tofile(output_dir / "v3.bin") - golden.reshape(-1).tofile(output_dir / "golden_v3.bin") - - -def main() -> None: - parser = argparse.ArgumentParser(description="Generate inputs/golden for vselr-u8.") - parser.add_argument("--output-dir", type=Path, default=Path(".")) - parser.add_argument("--seed", type=int, default=SEED) - args = parser.parse_args() - 
generate(args.output_dir, args.seed) - - -if __name__ == "__main__": - main() diff --git a/test/vpto/cases/micro-op/compare-select/vselr-u8/kernel.pto b/test/vpto/cases/micro-op/compare-select/vselr-u8/kernel.pto deleted file mode 100644 index 3b33248ac..000000000 --- a/test/vpto/cases/micro-op/compare-select/vselr-u8/kernel.pto +++ /dev/null @@ -1,54 +0,0 @@ -// ----------------------------------------------------------------------------- -// case: micro-op/compare-select/vselr-u8 -// family: compare-select -// target_ops: pto.vselr -// scenarios: core-u8, full-mask, explicit-lane-index -// ----------------------------------------------------------------------------- -module attributes {pto.target_arch = "a5", pto.kernel_kind = #pto.kernel_kind} { - func.func @vselr_u8_kernel_2d(%arg0: !pto.ptr, %arg1: !pto.ptr, %arg2: !pto.ptr) attributes {pto.kernel} { - %c0 = arith.constant 0 : index - %c1 = arith.constant 1 : index - %c256 = arith.constant 256 : index - %c1024 = arith.constant 1024 : index - %c0_i64 = arith.constant 0 : i64 - %c1_i64 = arith.constant 1 : i64 - %c4_i64 = arith.constant 4 : i64 - %c256_i64 = arith.constant 256 : i64 - %c1024_i64 = arith.constant 1024 : i64 - %c2048_i64 = arith.constant 2048 : i64 - %c1024_i32 = arith.constant 1024 : i32 - %false = arith.constant false - - %ub_src = pto.castptr %c0_i64 : i64 -> !pto.ptr - %ub_idx = pto.castptr %c1024_i64 : i64 -> !pto.ptr - %ub_out = pto.castptr %c2048_i64 : i64 -> !pto.ptr - pto.mte_gm_ub %arg0, %ub_src, %c0_i64, %c256_i64 - nburst(%c4_i64, %c256_i64, %c256_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 - pto.mte_gm_ub %arg1, %ub_idx, %c0_i64, %c256_i64 - nburst(%c4_i64, %c256_i64, %c256_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 - - pto.set_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] - pto.wait_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] - - pto.vecscope { - %_:1 = scf.for %offset = %c0 to %c1024 step %c256 iter_args(%remaining = %c1024_i32) -> (i32) { - %mask, %next_remaining = 
pto.plt_b8 %remaining : i32 -> !pto.mask, i32 - %src = pto.vlds %ub_src[%offset] : !pto.ptr -> !pto.vreg<256xui8> - %idx = pto.vlds %ub_idx[%offset] : !pto.ptr -> !pto.vreg<256xui8> - %out = pto.vselr %src, %idx : !pto.vreg<256xui8>, !pto.vreg<256xui8> -> !pto.vreg<256xui8> - pto.vsts %out, %ub_out[%offset], %mask : !pto.vreg<256xui8>, !pto.ptr, !pto.mask - scf.yield %next_remaining : i32 - } - } - - pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.mte_ub_gm %ub_out, %arg2, %c256_i64 - nburst(%c4_i64, %c256_i64, %c256_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64 - pto.barrier #pto.pipe - return - } -} diff --git a/test/vpto/cases/micro-op/compare-select/vselr-u8/launch.cpp b/test/vpto/cases/micro-op/compare-select/vselr-u8/launch.cpp deleted file mode 100644 index a8d38e8ea..000000000 --- a/test/vpto/cases/micro-op/compare-select/vselr-u8/launch.cpp +++ /dev/null @@ -1,54 +0,0 @@ -// Copyright (c) 2026 Huawei Technologies Co., Ltd. -// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). -// Please refer to the License for details. You may not use this file except in compliance with the License. -// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -// See LICENSE in the root of the software repository for the full text of the License. 
- -// ----------------------------------------------------------------------------- -// case: micro-op/compare-select/vselr-u8 -// family: compare-select -// target_ops: pto.vselr -// scenarios: core-u8, full-mask, explicit-lane-index -// ----------------------------------------------------------------------------- -#ifndef __VEC_SCOPE__ -#define __VEC_SCOPE__ -#endif - -#if defined(__CCE_AICORE__) && defined(__NPU_ARCH__) && (__NPU_ARCH__ == 2201) -typedef struct { unsigned char v; } hifloat8_t; -typedef struct { unsigned char v; } float8_e4m3_t; -typedef struct { unsigned char v; } float8_e5m2_t; -typedef struct { unsigned char v; } float8_e8m0_t; -typedef struct { unsigned char v; } float4_e1m2x2_t; -typedef struct { unsigned char v; } float4_e2m1x2_t; -#endif -#include - -#if defined(__CCE_AICORE__) && defined(PTOAS_ENABLE_CCE_PRINT) -#include -#endif - -#if !defined(__CCE_AICORE__) && !defined(TMRGSORT_HPP) -struct MrgSortExecutedNumList { - uint16_t mrgSortList0; - uint16_t mrgSortList1; - uint16_t mrgSortList2; - uint16_t mrgSortList3; -}; -#endif -#ifndef __CPU_SIM -#include "acl/acl.h" -#endif - -extern "C" __global__ [aicore] void vselr_u8_kernel_2d(__gm__ uint8_t *v1, - __gm__ uint8_t *v2, - __gm__ uint8_t *v3); - -void LaunchVselr_u8_kernel_2d(uint8_t *v1, uint8_t *v2, uint8_t *v3, - void *stream) { - vselr_u8_kernel_2d<<<1, nullptr, stream>>>((__gm__ uint8_t *)v1, - (__gm__ uint8_t *)v2, - (__gm__ uint8_t *)v3); -} diff --git a/test/vpto/cases/micro-op/compare-select/vselr-u8/main.cpp b/test/vpto/cases/micro-op/compare-select/vselr-u8/main.cpp deleted file mode 100644 index 78f0a8d16..000000000 --- a/test/vpto/cases/micro-op/compare-select/vselr-u8/main.cpp +++ /dev/null @@ -1,106 +0,0 @@ -// Copyright (c) 2026 Huawei Technologies Co., Ltd. -// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). 
-// Please refer to the License for details. You may not use this file except in compliance with the License. -// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -// See LICENSE in the root of the software repository for the full text of the License. - -// ----------------------------------------------------------------------------- -// case: micro-op/compare-select/vselr-u8 -// family: compare-select -// target_ops: pto.vselr -// scenarios: core-u8, full-mask, explicit-lane-index -// ----------------------------------------------------------------------------- -#include "test_common.h" -#include "acl/acl.h" -#include -#include -#include - -using namespace PtoTestCommon; - -#define ACL_CHECK(expr) \ - do { \ - const aclError _ret = (expr); \ - if (_ret != ACL_SUCCESS) { \ - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", #expr, \ - (int)_ret, __FILE__, __LINE__); \ - const char *_recent = aclGetRecentErrMsg(); \ - if (_recent != nullptr && _recent[0] != '\0') \ - std::fprintf(stderr, "[ERROR] RecentErrMsg: %s\n", _recent); \ - rc = 1; \ - goto cleanup; \ - } \ - } while (0) - -void LaunchVselr_u8_kernel_2d(uint8_t *v1, uint8_t *v2, uint8_t *v3, - void *stream); - -int main() { - size_t elemCount_v1 = 1024; - size_t fileSize_v1 = elemCount_v1 * sizeof(uint8_t); - size_t elemCount_v2 = 1024; - size_t fileSize_v2 = elemCount_v2 * sizeof(uint8_t); - size_t elemCount_v3 = 1024; - size_t fileSize_v3 = elemCount_v3 * sizeof(uint8_t); - uint8_t *v1Host = nullptr; - uint8_t *v1Device = nullptr; - uint8_t *v2Host = nullptr; - uint8_t *v2Device = nullptr; - uint8_t *v3Host = nullptr; - uint8_t *v3Device = nullptr; - - int rc = 0; - bool aclInited = false; - bool deviceSet = false; - int deviceId = 0; - aclrtStream stream = nullptr; - - ACL_CHECK(aclInit(nullptr)); - aclInited = true; - if (const char 
*envDevice = std::getenv("ACL_DEVICE_ID")) - deviceId = std::atoi(envDevice); - ACL_CHECK(aclrtSetDevice(deviceId)); - deviceSet = true; - ACL_CHECK(aclrtCreateStream(&stream)); - - ACL_CHECK(aclrtMallocHost((void **)(&v1Host), fileSize_v1)); - ACL_CHECK(aclrtMallocHost((void **)(&v2Host), fileSize_v2)); - ACL_CHECK(aclrtMallocHost((void **)(&v3Host), fileSize_v3)); - ACL_CHECK(aclrtMalloc((void **)&v1Device, fileSize_v1, ACL_MEM_MALLOC_HUGE_FIRST)); - ACL_CHECK(aclrtMalloc((void **)&v2Device, fileSize_v2, ACL_MEM_MALLOC_HUGE_FIRST)); - ACL_CHECK(aclrtMalloc((void **)&v3Device, fileSize_v3, ACL_MEM_MALLOC_HUGE_FIRST)); - - ReadFile("./v1.bin", fileSize_v1, v1Host, fileSize_v1); - ReadFile("./v2.bin", fileSize_v2, v2Host, fileSize_v2); - ReadFile("./v3.bin", fileSize_v3, v3Host, fileSize_v3); - ACL_CHECK(aclrtMemcpy(v1Device, fileSize_v1, v1Host, fileSize_v1, - ACL_MEMCPY_HOST_TO_DEVICE)); - ACL_CHECK(aclrtMemcpy(v2Device, fileSize_v2, v2Host, fileSize_v2, - ACL_MEMCPY_HOST_TO_DEVICE)); - ACL_CHECK(aclrtMemcpy(v3Device, fileSize_v3, v3Host, fileSize_v3, - ACL_MEMCPY_HOST_TO_DEVICE)); - - LaunchVselr_u8_kernel_2d(v1Device, v2Device, v3Device, stream); - - ACL_CHECK(aclrtSynchronizeStream(stream)); - ACL_CHECK(aclrtMemcpy(v3Host, fileSize_v3, v3Device, fileSize_v3, - ACL_MEMCPY_DEVICE_TO_HOST)); - WriteFile("./v3.bin", v3Host, fileSize_v3); - -cleanup: - aclrtFree(v1Device); - aclrtFree(v2Device); - aclrtFree(v3Device); - aclrtFreeHost(v1Host); - aclrtFreeHost(v2Host); - aclrtFreeHost(v3Host); - if (stream != nullptr) - aclrtDestroyStream(stream); - if (deviceSet) - aclrtResetDevice(deviceId); - if (aclInited) - aclFinalize(); - return rc; -} diff --git a/test/vpto/cases/micro-op/compare-select/vselr/kernel.pto b/test/vpto/cases/micro-op/compare-select/vselr/kernel.pto index fb8b47ff0..c736a4221 100644 --- a/test/vpto/cases/micro-op/compare-select/vselr/kernel.pto +++ b/test/vpto/cases/micro-op/compare-select/vselr/kernel.pto @@ -1,74 +1,168 @@ -// 
----------------------------------------------------------------------------- -// case: micro-op/compare-select/vselr -// family: compare-select -// target_ops: pto.vselr -// scenarios: core-f32, full-mask, explicit-lane-index -// ----------------------------------------------------------------------------- module attributes {pto.target_arch = "a5", pto.kernel_kind = #pto.kernel_kind} { - func.func @vselr_kernel_2d(%arg0: !pto.ptr, %arg1: !pto.ptr, %arg2: !pto.ptr) attributes {pto.kernel} { - %c8192_i64 = arith.constant 8192 : i64 - %c4096_i64 = arith.constant 4096 : i64 - %c0_i64 = arith.constant 0 : i64 - %c32 = arith.constant 32 : index - %0 = pto.castptr %c0_i64 : i64 -> !pto.ptr - %1 = arith.index_castui %c32 : index to i64 - %c1_i64 = arith.constant 1 : i64 - %c32_i64 = arith.constant 32 : i64 - %c4_i64 = arith.constant 4 : i64 - %2 = arith.muli %1, %c4_i64 : i64 - %c128_i64 = arith.constant 128 : i64 - %3 = pto.castptr %arg0 : !pto.ptr -> !pto.ptr - %4 = arith.index_castui %c0_i64 : i64 to index + func.func @vselr_deep_merged_kernel(%arg0: !pto.ptr, %arg1: !pto.ptr, %arg2: !pto.ptr, %arg3: !pto.ptr, %arg4: !pto.ptr, %arg5: !pto.ptr, %arg6: !pto.ptr, %arg7: !pto.ptr, %arg8: !pto.ptr) attributes {pto.kernel} { + %__deep_merge_guard = arith.constant false + // inactive merged from vselr_f16_kernel_2d + scf.if %__deep_merge_guard { + + %c0_m0 = arith.constant 0 : index + %c1_m0 = arith.constant 1 : index + %c128_m0 = arith.constant 128 : index + %c1024_m0 = arith.constant 1024 : index + %c0_i64_m0 = arith.constant 0 : i64 + %c1_i64_m0 = arith.constant 1 : i64 + %c8_i64_m0 = arith.constant 8 : i64 + %c256_i64_m0 = arith.constant 256 : i64 + %c2048_i64_m0 = arith.constant 2048 : i64 + %c4096_i64_m0 = arith.constant 4096 : i64 + %c1024_i32_m0 = arith.constant 1024 : i32 + %false_m0 = arith.constant false + + %ub_src_m0 = pto.castptr %c0_i64_m0 : i64 -> !pto.ptr + %ub_idx_m0 = pto.castptr %c2048_i64_m0 : i64 -> !pto.ptr + %ub_out_m0 = pto.castptr %c4096_i64_m0 : i64 
-> !pto.ptr + pto.mte_gm_ub %arg0, %ub_src_m0, %c0_i64_m0, %c256_i64_m0 + nburst(%c8_i64_m0, %c256_i64_m0, %c256_i64_m0) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 + pto.mte_gm_ub %arg1, %ub_idx_m0, %c0_i64_m0, %c256_i64_m0 + nburst(%c8_i64_m0, %c256_i64_m0, %c256_i64_m0) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 + + pto.set_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] + pto.wait_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] + + pto.vecscope { + %__m0:1 = scf.for %offset_m0 = %c0_m0 to %c1024_m0 step %c128_m0 iter_args(%remaining_m0 = %c1024_i32_m0) -> (i32) { + %mask_m0, %next_remaining_m0 = pto.plt_b16 %remaining_m0 : i32 -> !pto.mask, i32 + %src_m0 = pto.vlds %ub_src_m0[%offset_m0] : !pto.ptr -> !pto.vreg<128xf16> + %idx_m0 = pto.vlds %ub_idx_m0[%offset_m0] : !pto.ptr -> !pto.vreg<128xui16> + %out_m0 = pto.vselr %src_m0, %idx_m0 : !pto.vreg<128xf16>, !pto.vreg<128xui16> -> !pto.vreg<128xf16> + pto.vsts %out_m0, %ub_out_m0[%offset_m0], %mask_m0 : !pto.vreg<128xf16>, !pto.ptr, !pto.mask + scf.yield %next_remaining_m0 : i32 + } + } + + pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] + pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] + pto.mte_ub_gm %ub_out_m0, %arg2, %c256_i64_m0 + nburst(%c8_i64_m0, %c256_i64_m0, %c256_i64_m0) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64 + pto.barrier #pto.pipe + + } + // inactive merged from vselr_u8_kernel_2d + scf.if %__deep_merge_guard { + + %c0_m1 = arith.constant 0 : index + %c1_m1 = arith.constant 1 : index + %c256_m1 = arith.constant 256 : index + %c1024_m1 = arith.constant 1024 : index + %c0_i64_m1 = arith.constant 0 : i64 + %c1_i64_m1 = arith.constant 1 : i64 + %c4_i64_m1 = arith.constant 4 : i64 + %c256_i64_m1 = arith.constant 256 : i64 + %c1024_i64_m1 = arith.constant 1024 : i64 + %c2048_i64_m1 = arith.constant 2048 : i64 + %c1024_i32_m1 = arith.constant 1024 : i32 + %false_m1 = arith.constant false + + %ub_src_m1 = pto.castptr %c0_i64_m1 : i64 -> !pto.ptr + %ub_idx_m1 = pto.castptr %c1024_i64_m1 : i64 -> !pto.ptr + 
%ub_out_m1 = pto.castptr %c2048_i64_m1 : i64 -> !pto.ptr + pto.mte_gm_ub %arg3, %ub_src_m1, %c0_i64_m1, %c256_i64_m1 + nburst(%c4_i64_m1, %c256_i64_m1, %c256_i64_m1) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 + pto.mte_gm_ub %arg4, %ub_idx_m1, %c0_i64_m1, %c256_i64_m1 + nburst(%c4_i64_m1, %c256_i64_m1, %c256_i64_m1) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 + + pto.set_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] + pto.wait_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] + + pto.vecscope { + %__m1:1 = scf.for %offset_m1 = %c0_m1 to %c1024_m1 step %c256_m1 iter_args(%remaining_m1 = %c1024_i32_m1) -> (i32) { + %mask_m1, %next_remaining_m1 = pto.plt_b8 %remaining_m1 : i32 -> !pto.mask, i32 + %src_m1 = pto.vlds %ub_src_m1[%offset_m1] : !pto.ptr -> !pto.vreg<256xui8> + %idx_m1 = pto.vlds %ub_idx_m1[%offset_m1] : !pto.ptr -> !pto.vreg<256xui8> + %out_m1 = pto.vselr %src_m1, %idx_m1 : !pto.vreg<256xui8>, !pto.vreg<256xui8> -> !pto.vreg<256xui8> + pto.vsts %out_m1, %ub_out_m1[%offset_m1], %mask_m1 : !pto.vreg<256xui8>, !pto.ptr, !pto.mask + scf.yield %next_remaining_m1 : i32 + } + } + + pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] + pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] + pto.mte_ub_gm %ub_out_m1, %arg5, %c256_i64_m1 + nburst(%c4_i64_m1, %c256_i64_m1, %c256_i64_m1) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64 + pto.barrier #pto.pipe + + } + // active merged from vselr_kernel_2d + + %c8192_i64_m2 = arith.constant 8192 : i64 + %c4096_i64_m2 = arith.constant 4096 : i64 + %c0_i64_m2 = arith.constant 0 : i64 + %c32_m2 = arith.constant 32 : index + %0 = pto.castptr %c0_i64_m2 : i64 -> !pto.ptr + %1 = arith.index_castui %c32_m2 : index to i64 + %c1_i64_m2 = arith.constant 1 : i64 + %c32_i64_m2 = arith.constant 32 : i64 + %c4_i64_m2 = arith.constant 4 : i64 + %2 = arith.muli %1, %c4_i64_m2 : i64 + %c128_i64_m2 = arith.constant 128 : i64 + %3 = pto.castptr %arg6 : !pto.ptr -> !pto.ptr + %4 = arith.index_castui %c0_i64_m2 : i64 to index %5 = pto.addptr %3, %4 : -> - 
pto.set_loop2_stride_outtoub %c4096_i64, %c4096_i64 : i64, i64 - pto.set_loop1_stride_outtoub %c4096_i64, %c4096_i64 : i64, i64 + pto.set_loop2_stride_outtoub %c4096_i64_m2, %c4096_i64_m2 : i64, i64 + pto.set_loop1_stride_outtoub %c4096_i64_m2, %c4096_i64_m2 : i64, i64 %6 = pto.castptr %5 : !pto.ptr -> !pto.ptr - %false = arith.constant false - pto.mte_gm_ub %6, %0, %c0_i64, %2 - nburst(%c32_i64, %c128_i64, %c128_i64) + %false_m2 = arith.constant false + pto.mte_gm_ub %6, %0, %c0_i64_m2, %2 + nburst(%c32_i64_m2, %c128_i64_m2, %c128_i64_m2) : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 - %7 = pto.castptr %c4096_i64 : i64 -> !pto.ptr - %8 = pto.castptr %arg1 : !pto.ptr -> !pto.ptr + %7 = pto.castptr %c4096_i64_m2 : i64 -> !pto.ptr + %8 = pto.castptr %arg7 : !pto.ptr -> !pto.ptr %9 = pto.addptr %8, %4 : -> - pto.set_loop2_stride_outtoub %c4096_i64, %c4096_i64 : i64, i64 - pto.set_loop1_stride_outtoub %c4096_i64, %c4096_i64 : i64, i64 + pto.set_loop2_stride_outtoub %c4096_i64_m2, %c4096_i64_m2 : i64, i64 + pto.set_loop1_stride_outtoub %c4096_i64_m2, %c4096_i64_m2 : i64, i64 %10 = pto.castptr %9 : !pto.ptr -> !pto.ptr - pto.mte_gm_ub %10, %7, %c0_i64, %2 - nburst(%c32_i64, %c128_i64, %c128_i64) + pto.mte_gm_ub %10, %7, %c0_i64_m2, %2 + nburst(%c32_i64_m2, %c128_i64_m2, %c128_i64_m2) : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 pto.set_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] pto.wait_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] - %11 = pto.castptr %c8192_i64 : i64 -> !pto.ptr - %c0 = arith.constant 0 : index - %c1 = arith.constant 1 : index - %c16 = arith.constant 16 : index - %c64 = arith.constant 64 : index - %c128 = arith.constant 128 : index - %c1024_i32 = arith.constant 1024 : i32 + %11 = pto.castptr %c8192_i64_m2 : i64 -> !pto.ptr + %c0_m2 = arith.constant 0 : index + %c1_m2 = arith.constant 1 : index + %c16_m2 = arith.constant 16 : index + %c64_m2 = arith.constant 64 : index + %c128_m2 = arith.constant 128 : index + %c1024_i32_m2 = arith.constant 1024 : i32 
pto.vecscope { - %16 = scf.for %arg4 = %c0 to %c16 step %c1 iter_args(%arg5 = %c1024_i32) -> (i32) { - %17 = arith.muli %arg4, %c64 : index - %mask, %scalar_out = pto.plt_b32 %arg5 : i32 -> !pto.mask, i32 + %16 = scf.for %arg4_m2 = %c0_m2 to %c16_m2 step %c1_m2 iter_args(%arg5_m2 = %c1024_i32_m2) -> (i32) { + %17 = arith.muli %arg4_m2, %c64_m2 : index + %mask_m2, %scalar_out_m2 = pto.plt_b32 %arg5_m2 : i32 -> !pto.mask, i32 %25 = pto.vlds %0[%17] : !pto.ptr -> !pto.vreg<64xf32> %26 = pto.vlds %7[%17] : !pto.ptr -> !pto.vreg<64xi32> %27 = pto.vselr %25, %26 : !pto.vreg<64xf32>, !pto.vreg<64xi32> -> !pto.vreg<64xf32> - pto.vsts %27, %11[%17], %mask : !pto.vreg<64xf32>, !pto.ptr, !pto.mask - scf.yield %scalar_out : i32 + pto.vsts %27, %11[%17], %mask_m2 : !pto.vreg<64xf32>, !pto.ptr, !pto.mask + scf.yield %scalar_out_m2 : i32 } } pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - %c1024_i64 = arith.constant 1024 : i64 - %12 = arith.muli %1, %c4_i64 : i64 - %13 = pto.castptr %arg2 : !pto.ptr -> !pto.ptr + %c1024_i64_m2 = arith.constant 1024 : i64 + %12 = arith.muli %1, %c4_i64_m2 : i64 + %13 = pto.castptr %arg8 : !pto.ptr -> !pto.ptr %14 = pto.addptr %13, %4 : -> - pto.set_loop1_stride_ubtoout %c4096_i64, %c4096_i64 : i64, i64 - pto.set_loop2_stride_ubtoout %c4096_i64, %c4096_i64 : i64, i64 + pto.set_loop1_stride_ubtoout %c4096_i64_m2, %c4096_i64_m2 : i64, i64 + pto.set_loop2_stride_ubtoout %c4096_i64_m2, %c4096_i64_m2 : i64, i64 %15 = pto.castptr %14 : !pto.ptr -> !pto.ptr pto.mte_ub_gm %11, %15, %12 - nburst(%c32_i64, %c128_i64, %c128_i64) + nburst(%c32_i64_m2, %c128_i64_m2, %c128_i64_m2) : !pto.ptr, !pto.ptr, i64, i64, i64, i64 pto.barrier #pto.pipe + return } } diff --git a/test/vpto/cases/micro-op/compare-select/vselr/launch.cpp b/test/vpto/cases/micro-op/compare-select/vselr/launch.cpp index 68e4c6169..2dd059762 100644 --- a/test/vpto/cases/micro-op/compare-select/vselr/launch.cpp +++ 
b/test/vpto/cases/micro-op/compare-select/vselr/launch.cpp @@ -5,20 +5,9 @@ // THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, // INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. // See LICENSE in the root of the software repository for the full text of the License. - -// ----------------------------------------------------------------------------- -// case: micro-op/compare-select/vselr -// family: compare-select -// target_ops: pto.vselr -// scenarios: core-f32, full-mask, explicit-lane-index -// ----------------------------------------------------------------------------- -// --------------------------------------------------------------------------- -// PTOAS compatibility layer -// --------------------------------------------------------------------------- #ifndef __VEC_SCOPE__ #define __VEC_SCOPE__ #endif - #if defined(__CCE_AICORE__) && defined(__NPU_ARCH__) && (__NPU_ARCH__ == 2201) typedef struct { unsigned char v; } hifloat8_t; typedef struct { unsigned char v; } float8_e4m3_t; @@ -28,30 +17,36 @@ typedef struct { unsigned char v; } float4_e1m2x2_t; typedef struct { unsigned char v; } float4_e2m1x2_t; #endif #include - #if defined(__CCE_AICORE__) && defined(PTOAS_ENABLE_CCE_PRINT) #include #endif - #if !defined(__CCE_AICORE__) && !defined(TMRGSORT_HPP) -struct MrgSortExecutedNumList { - uint16_t mrgSortList0; - uint16_t mrgSortList1; - uint16_t mrgSortList2; - uint16_t mrgSortList3; -}; +struct MrgSortExecutedNumList { uint16_t mrgSortList0,mrgSortList1,mrgSortList2,mrgSortList3; }; #endif #ifndef __CPU_SIM #include "acl/acl.h" #endif -extern "C" __global__ [aicore] void vselr_kernel_2d(__gm__ float *v1, - __gm__ int *v2, - __gm__ float *v3); +extern "C" __global__ [aicore] void vselr_deep_merged_kernel( + __gm__ half * arg0, + __gm__ uint16_t * arg1, + __gm__ half * arg2, + __gm__ uint8_t * arg3, + __gm__ uint8_t * arg4, + __gm__ uint8_t * arg5, 
+ __gm__ float * arg6, + __gm__ int32_t * arg7, + __gm__ float * arg8); -void LaunchVselr_kernel_2d(float *v1, int *v2, float *v3, - void *stream) { - vselr_kernel_2d<<<1, nullptr, stream>>>((__gm__ float *)v1, - (__gm__ int *)v2, - (__gm__ float *)v3); +void LaunchVselrDeepMerged(float * p0, int * p1, float * p2, void *stream) { + vselr_deep_merged_kernel<<<1, nullptr, stream>>>( + (__gm__ half *)p0, + (__gm__ uint16_t *)p0, + (__gm__ half *)p0, + (__gm__ uint8_t *)p0, + (__gm__ uint8_t *)p0, + (__gm__ uint8_t *)p0, + (__gm__ float *)p0, + (__gm__ int32_t *)p1, + (__gm__ float *)p2); } diff --git a/test/vpto/cases/micro-op/compare-select/vselr/main.cpp b/test/vpto/cases/micro-op/compare-select/vselr/main.cpp index 62fd4bebf..6b4d9c0d6 100644 --- a/test/vpto/cases/micro-op/compare-select/vselr/main.cpp +++ b/test/vpto/cases/micro-op/compare-select/vselr/main.cpp @@ -37,9 +37,8 @@ using namespace PtoTestCommon; } \ } while (0) -void LaunchVselr_kernel_2d(float *v1, int *v2, float *v3, - void *stream); +void LaunchVselrDeepMerged(float * p0, int * p1, float * p2, void *stream); int main() { size_t elemCount_v1 = 1024; size_t fileSize_v1 = elemCount_v1 * sizeof(float); @@ -85,7 +84,7 @@ int main() { ACL_CHECK(aclrtMemcpy(v3Device, fileSize_v3, v3Host, fileSize_v3, ACL_MEMCPY_HOST_TO_DEVICE)); - LaunchVselr_kernel_2d(v1Device, v2Device, v3Device, stream); + LaunchVselrDeepMerged(v1Device, v2Device, v3Device, stream); ACL_CHECK(aclrtSynchronizeStream(stream)); ACL_CHECK(aclrtMemcpy(v3Host, fileSize_v3, v3Device, fileSize_v3, diff --git a/test/vpto/cases/micro-op/conversion/vcvt-f16-to-f32-part-even/compare.py b/test/vpto/cases/micro-op/conversion/vcvt-f16-to-f32-part-even/compare.py deleted file mode 100755 index 751000b6f..000000000 --- a/test/vpto/cases/micro-op/conversion/vcvt-f16-to-f32-part-even/compare.py +++ /dev/null @@ -1,40 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) 2026 Huawei Technologies Co., Ltd. 
-# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. - - -import os -import sys - -import numpy as np - - -def compare_bin(golden_path, output_path, dtype, eps): - if not os.path.exists(golden_path) or not os.path.exists(output_path): - return False - golden = np.fromfile(golden_path, dtype=dtype) - output = np.fromfile(output_path, dtype=dtype) - return golden.shape == output.shape and np.allclose( - golden, output, atol=eps, rtol=eps, equal_nan=True - ) - - -def main(): - strict = os.getenv("COMPARE_STRICT", "1") != "0" - ok = compare_bin("golden_v2.bin", "v2.bin", np.float32, 1e-4) - if not ok: - if strict: - print("[ERROR] compare failed") - sys.exit(2) - print("[WARN] compare failed (non-gating)") - return - print("[INFO] compare passed") - - -if __name__ == "__main__": - main() diff --git a/test/vpto/cases/micro-op/conversion/vcvt-f16-to-f32-part-even/golden.py b/test/vpto/cases/micro-op/conversion/vcvt-f16-to-f32-part-even/golden.py deleted file mode 100644 index e071074e7..000000000 --- a/test/vpto/cases/micro-op/conversion/vcvt-f16-to-f32-part-even/golden.py +++ /dev/null @@ -1,54 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) 2026 Huawei Technologies Co., Ltd. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. 
You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. - -# case: micro-op/conversion/vcvt-f16-to-f32-part-even -# family: conversion -# target_ops: pto.vcvt -# scenarios: f16-to-f32, full-mask, part-even - -import argparse -from pathlib import Path - -import numpy as np - - -ELEMS = 1024 -SEED = 19 - - -def generate(output_dir: Path, seed: int) -> None: - rng = np.random.default_rng(seed) - v1 = rng.uniform(-8.0, 8.0, size=ELEMS).astype(np.float16) - # Kernel writes 8 chunks (offset 0..448, step 64), each chunk converts the - # lower 16-bit half (PART_EVEN) from packed f16 pairs in a 128-lane load. - out_elems = 512 - v2 = np.zeros(out_elems, dtype=np.float32) - golden_v2 = np.empty(out_elems, dtype=np.float32) - for block in range(0, out_elems, 64): - src = v1[block : block + 128 : 2].astype(np.float32, copy=False) - golden_v2[block : block + 64] = src - - output_dir.mkdir(parents=True, exist_ok=True) - v1.tofile(output_dir / "v1.bin") - v2.tofile(output_dir / "v2.bin") - golden_v2.tofile(output_dir / "golden_v2.bin") - - -def main() -> None: - parser = argparse.ArgumentParser( - description="Generate numpy-based inputs/golden for VPTO micro-op vcvt-f16-to-f32 part-even validation." 
- ) - parser.add_argument("--output-dir", type=Path, default=Path(".")) - parser.add_argument("--seed", type=int, default=SEED) - args = parser.parse_args() - generate(args.output_dir, args.seed) - - -if __name__ == "__main__": - main() diff --git a/test/vpto/cases/micro-op/conversion/vcvt-f16-to-f32-part-even/kernel.pto b/test/vpto/cases/micro-op/conversion/vcvt-f16-to-f32-part-even/kernel.pto deleted file mode 100644 index 05742cdee..000000000 --- a/test/vpto/cases/micro-op/conversion/vcvt-f16-to-f32-part-even/kernel.pto +++ /dev/null @@ -1,47 +0,0 @@ -module attributes {pto.target_arch = "a5", pto.kernel_kind = #pto.kernel_kind} { - func.func @vcvt_f16_to_f32_part_even_kernel_2d(%arg0: !pto.ptr, - %arg1: !pto.ptr) attributes {pto.kernel} { - %c0 = arith.constant 0 : index - %c1 = arith.constant 1 : index - %c64 = arith.constant 64 : index - %c512 = arith.constant 512 : index - %c1024 = arith.constant 1024 : index - %c0_i64 = arith.constant 0 : i64 - %c1_i64 = arith.constant 1 : i64 - %c16_i64 = arith.constant 16 : i64 - %c32_i64 = arith.constant 32 : i64 - %c64_i64 = arith.constant 64 : i64 - %c128_i64 = arith.constant 128 : i64 - %c2048_i64 = arith.constant 2048 : i64 - %ub_in = pto.castptr %c0_i64 : i64 -> !pto.ptr - %ub_out = pto.castptr %c2048_i64 : i64 -> !pto.ptr - - %false = arith.constant false - pto.mte_gm_ub %arg0, %ub_in, %c0_i64, %c64_i64 - nburst(%c32_i64, %c64_i64, %c64_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 - - pto.set_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] - pto.wait_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] - - pto.vecscope { - %full_mask = pto.pset_b32 "PAT_ALL" : !pto.mask - %cvt_mask = pto.pset_b16 "PAT_ALL" : !pto.mask - // Use packed f16 load (no UNPK): PART_EVEN selects the lower 16-bit - // element from each f16 pair inside a b32 lane. 
- scf.for %offset = %c0 to %c512 step %c64 { - %loaded = pto.vlds %ub_in[%offset] : !pto.ptr -> !pto.vreg<128xf16> - %out = pto.vcvt %loaded, %cvt_mask {part = "EVEN"} : !pto.vreg<128xf16>, !pto.mask -> !pto.vreg<64xf32> - pto.vsts %out, %ub_out[%offset], %full_mask : !pto.vreg<64xf32>, !pto.ptr, !pto.mask - } - } - - pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.mte_ub_gm %ub_out, %arg1, %c128_i64 - nburst(%c16_i64, %c128_i64, %c128_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64 - pto.barrier #pto.pipe - return - } -} diff --git a/test/vpto/cases/micro-op/conversion/vcvt-f16-to-f32-part-even/launch.cpp b/test/vpto/cases/micro-op/conversion/vcvt-f16-to-f32-part-even/launch.cpp deleted file mode 100644 index 8e321d1a5..000000000 --- a/test/vpto/cases/micro-op/conversion/vcvt-f16-to-f32-part-even/launch.cpp +++ /dev/null @@ -1,71 +0,0 @@ -// Copyright (c) 2026 Huawei Technologies Co., Ltd. -// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). -// Please refer to the License for details. You may not use this file except in compliance with the License. -// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -// See LICENSE in the root of the software repository for the full text of the License. - -// ----------------------------------------------------------------------------- -// case: micro-op/conversion/vcvt-f16-to-f32-part-even -// family: conversion -// target_ops: pto.vcvt -// scenarios: f16-to-f32, full-mask, part-even -// NOTE: bulk-generated coverage skeleton. Parser/verifier/lowering failure is -// still a valid test conclusion in the current coverage-first phase. 
-// ----------------------------------------------------------------------------- -// --------------------------------------------------------------------------- -// PTOAS compatibility layer -// -// The upstream pto-isa headers reference some FP8/FP4 types and the -// __VEC_SCOPE__ marker that are not available on every AICore arch/toolchain -// combination (e.g. __NPU_ARCH__==2201). -// -// For our PTOAS-generated kernels we don't rely on these types today, but the -// headers still mention them in templates/static_asserts. Provide minimal -// fallbacks to keep compilation working on dav-c220. -// --------------------------------------------------------------------------- -#ifndef __VEC_SCOPE__ -#define __VEC_SCOPE__ -#endif - -#if defined(__CCE_AICORE__) && defined(__NPU_ARCH__) && (__NPU_ARCH__ == 2201) -typedef struct { unsigned char v; } hifloat8_t; -typedef struct { unsigned char v; } float8_e4m3_t; -typedef struct { unsigned char v; } float8_e5m2_t; -typedef struct { unsigned char v; } float8_e8m0_t; -typedef struct { unsigned char v; } float4_e1m2x2_t; -typedef struct { unsigned char v; } float4_e2m1x2_t; -#endif -#include - -// AICore printf support is gated behind `--cce-enable-print` on some -// toolchains. When enabled, include the CCE print header so `cce::printf` -// resolves in device compilation. -#if defined(__CCE_AICORE__) && defined(PTOAS_ENABLE_CCE_PRINT) -#include -#endif - -// Some PTO-ISA types are only available in the __CCE_AICORE__ compilation -// path, but `bisheng -xcce` still performs a host-side parse pass. -// Provide minimal fallbacks only when the corresponding header wasn't -// pulled in by the selected arch implementation. 
-#if !defined(__CCE_AICORE__) && !defined(TMRGSORT_HPP) -struct MrgSortExecutedNumList { - uint16_t mrgSortList0; - uint16_t mrgSortList1; - uint16_t mrgSortList2; - uint16_t mrgSortList3; -}; -#endif -#ifndef __CPU_SIM -#include "acl/acl.h" -#endif - -extern "C" __global__ [aicore] void vcvt_f16_to_f32_part_even_kernel_2d(__gm__ half *v1, - __gm__ float *v2); - -void LaunchVcvt_f16_to_f32_part_even_kernel_2d(uint16_t *v1, float *v2, void *stream) { - vcvt_f16_to_f32_part_even_kernel_2d<<<1, nullptr, stream>>>((__gm__ half *)v1, - (__gm__ float *)v2); -} diff --git a/test/vpto/cases/micro-op/conversion/vcvt-f16-to-f32-part-even/main.cpp b/test/vpto/cases/micro-op/conversion/vcvt-f16-to-f32-part-even/main.cpp deleted file mode 100644 index 1925124c1..000000000 --- a/test/vpto/cases/micro-op/conversion/vcvt-f16-to-f32-part-even/main.cpp +++ /dev/null @@ -1,130 +0,0 @@ -// Copyright (c) 2026 Huawei Technologies Co., Ltd. -// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). -// Please refer to the License for details. You may not use this file except in compliance with the License. -// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -// See LICENSE in the root of the software repository for the full text of the License. - -// ----------------------------------------------------------------------------- -// case: micro-op/conversion/vcvt-f16-to-f32-part-even -// family: conversion -// target_ops: pto.vcvt -// scenarios: f16-to-f32, full-mask, part-even -// NOTE: bulk-generated coverage skeleton. Parser/verifier/lowering failure is -// still a valid test conclusion in the current coverage-first phase. 
-// ----------------------------------------------------------------------------- -/** -Copyright (c) 2025 Huawei Technologies Co., Ltd. -This program is free software, you can redistribute it and/or modify it under the terms and conditions of -CANN Open Software License Agreement Version 2.0 (the "License"). -Please refer to the License for details. You may not use this file except in compliance with the License. -THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -See LICENSE in the root of the software repository for the full text of the License. -*/ - -#include "test_common.h" -#include "acl/acl.h" -#include -#include -#include - -using namespace PtoTestCommon; - -#ifndef TMRGSORT_HPP -struct MrgSortExecutedNumList { - uint16_t mrgSortList0; - uint16_t mrgSortList1; - uint16_t mrgSortList2; - uint16_t mrgSortList3; -}; -#endif - -#define ACL_CHECK(expr) \ - do { \ - const aclError _ret = (expr); \ - if (_ret != ACL_SUCCESS) { \ - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", #expr, (int)_ret, __FILE__, __LINE__); \ - const char *_recent = aclGetRecentErrMsg(); \ - if (_recent != nullptr && _recent[0] != '\0') { \ - std::fprintf(stderr, "[ERROR] RecentErrMsg: %s\n", _recent); \ - } \ - rc = 1; \ - goto cleanup; \ - } \ - } while (0) - -void LaunchVcvt_f16_to_f32_part_even_kernel_2d(uint16_t *v1, float *v2, void *stream); - -int main() { - size_t elemCount_v1 = 1024; - size_t fileSize_v1 = elemCount_v1 * sizeof(uint16_t); - size_t elemCount_v2 = 512; - size_t fileSize_v2 = elemCount_v2 * sizeof(float); - uint16_t *v1Host = nullptr; - uint16_t *v1Device = nullptr; - float *v2Host = nullptr; - float *v2Device = nullptr; - - int rc = 0; - bool aclInited = false; - bool deviceSet = false; - int deviceId = 0; - aclrtStream stream = nullptr; - - ACL_CHECK(aclInit(nullptr)); - aclInited = true; - if (const 
char *envDevice = std::getenv("ACL_DEVICE_ID")) { - deviceId = std::atoi(envDevice); - } - ACL_CHECK(aclrtSetDevice(deviceId)); - deviceSet = true; - ACL_CHECK(aclrtCreateStream(&stream)); - - ACL_CHECK(aclrtMallocHost((void **)(&v1Host), fileSize_v1)); - ACL_CHECK(aclrtMallocHost((void **)(&v2Host), fileSize_v2)); - ACL_CHECK(aclrtMalloc((void **)&v1Device, fileSize_v1, ACL_MEM_MALLOC_HUGE_FIRST)); - ACL_CHECK(aclrtMalloc((void **)&v2Device, fileSize_v2, ACL_MEM_MALLOC_HUGE_FIRST)); - - ReadFile("./v1.bin", fileSize_v1, v1Host, fileSize_v1); - ReadFile("./v2.bin", fileSize_v2, v2Host, fileSize_v2); - ACL_CHECK(aclrtMemcpy(v1Device, fileSize_v1, v1Host, fileSize_v1, ACL_MEMCPY_HOST_TO_DEVICE)); - ACL_CHECK(aclrtMemcpy(v2Device, fileSize_v2, v2Host, fileSize_v2, ACL_MEMCPY_HOST_TO_DEVICE)); - LaunchVcvt_f16_to_f32_part_even_kernel_2d(v1Device, v2Device, stream); - - ACL_CHECK(aclrtSynchronizeStream(stream)); - ACL_CHECK(aclrtMemcpy(v2Host, fileSize_v2, v2Device, fileSize_v2, ACL_MEMCPY_DEVICE_TO_HOST)); - - WriteFile("./v2.bin", v2Host, fileSize_v2); - -cleanup: - aclrtFree(v1Device); - aclrtFree(v2Device); - aclrtFreeHost(v1Host); - aclrtFreeHost(v2Host); - if (stream != nullptr) { - const aclError _ret = aclrtDestroyStream(stream); - if (_ret != ACL_SUCCESS) { - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", - "aclrtDestroyStream(stream)", (int)_ret, __FILE__, __LINE__); - } - stream = nullptr; - } - if (deviceSet) { - const aclError _ret = aclrtResetDevice(deviceId); - if (_ret != ACL_SUCCESS) { - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", - "aclrtResetDevice(deviceId)", (int)_ret, __FILE__, __LINE__); - } - } - if (aclInited) { - const aclError _ret = aclFinalize(); - if (_ret != ACL_SUCCESS) { - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", - "aclFinalize()", (int)_ret, __FILE__, __LINE__); - } - } - - return rc; -} diff --git a/test/vpto/cases/micro-op/conversion/vcvt-f16-to-f32-part-odd/compare.py 
b/test/vpto/cases/micro-op/conversion/vcvt-f16-to-f32-part-odd/compare.py deleted file mode 100755 index 751000b6f..000000000 --- a/test/vpto/cases/micro-op/conversion/vcvt-f16-to-f32-part-odd/compare.py +++ /dev/null @@ -1,40 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) 2026 Huawei Technologies Co., Ltd. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. - - -import os -import sys - -import numpy as np - - -def compare_bin(golden_path, output_path, dtype, eps): - if not os.path.exists(golden_path) or not os.path.exists(output_path): - return False - golden = np.fromfile(golden_path, dtype=dtype) - output = np.fromfile(output_path, dtype=dtype) - return golden.shape == output.shape and np.allclose( - golden, output, atol=eps, rtol=eps, equal_nan=True - ) - - -def main(): - strict = os.getenv("COMPARE_STRICT", "1") != "0" - ok = compare_bin("golden_v2.bin", "v2.bin", np.float32, 1e-4) - if not ok: - if strict: - print("[ERROR] compare failed") - sys.exit(2) - print("[WARN] compare failed (non-gating)") - return - print("[INFO] compare passed") - - -if __name__ == "__main__": - main() diff --git a/test/vpto/cases/micro-op/conversion/vcvt-f16-to-f32-part-odd/golden.py b/test/vpto/cases/micro-op/conversion/vcvt-f16-to-f32-part-odd/golden.py deleted file mode 100644 index 2b7822bc8..000000000 --- a/test/vpto/cases/micro-op/conversion/vcvt-f16-to-f32-part-odd/golden.py +++ /dev/null @@ -1,54 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) 
2026 Huawei Technologies Co., Ltd. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. - -# case: micro-op/conversion/vcvt-f16-to-f32-part-odd -# family: conversion -# target_ops: pto.vcvt -# scenarios: f16-to-f32, full-mask, part-odd - -import argparse -from pathlib import Path - -import numpy as np - - -ELEMS = 1024 -SEED = 19 - - -def generate(output_dir: Path, seed: int) -> None: - rng = np.random.default_rng(seed) - v1 = rng.uniform(-8.0, 8.0, size=ELEMS).astype(np.float16) - # Kernel writes 8 chunks (offset 0..448, step 64), each chunk converts the - # upper 16-bit half (PART_ODD) from packed f16 pairs in a 128-lane load. - out_elems = 512 - v2 = np.zeros(out_elems, dtype=np.float32) - golden_v2 = np.empty(out_elems, dtype=np.float32) - for block in range(0, out_elems, 64): - src = v1[block + 1 : block + 128 : 2].astype(np.float32, copy=False) - golden_v2[block : block + 64] = src - - output_dir.mkdir(parents=True, exist_ok=True) - v1.tofile(output_dir / "v1.bin") - v2.tofile(output_dir / "v2.bin") - golden_v2.tofile(output_dir / "golden_v2.bin") - - -def main() -> None: - parser = argparse.ArgumentParser( - description="Generate numpy-based inputs/golden for VPTO micro-op vcvt-f16-to-f32 part-odd validation." 
- ) - parser.add_argument("--output-dir", type=Path, default=Path(".")) - parser.add_argument("--seed", type=int, default=SEED) - args = parser.parse_args() - generate(args.output_dir, args.seed) - - -if __name__ == "__main__": - main() diff --git a/test/vpto/cases/micro-op/conversion/vcvt-f16-to-f32-part-odd/kernel.pto b/test/vpto/cases/micro-op/conversion/vcvt-f16-to-f32-part-odd/kernel.pto deleted file mode 100644 index 29b2d7065..000000000 --- a/test/vpto/cases/micro-op/conversion/vcvt-f16-to-f32-part-odd/kernel.pto +++ /dev/null @@ -1,47 +0,0 @@ -module attributes {pto.target_arch = "a5", pto.kernel_kind = #pto.kernel_kind} { - func.func @vcvt_f16_to_f32_part_odd_kernel_2d(%arg0: !pto.ptr, - %arg1: !pto.ptr) attributes {pto.kernel} { - %c0 = arith.constant 0 : index - %c1 = arith.constant 1 : index - %c64 = arith.constant 64 : index - %c512 = arith.constant 512 : index - %c1024 = arith.constant 1024 : index - %c0_i64 = arith.constant 0 : i64 - %c1_i64 = arith.constant 1 : i64 - %c16_i64 = arith.constant 16 : i64 - %c32_i64 = arith.constant 32 : i64 - %c64_i64 = arith.constant 64 : i64 - %c128_i64 = arith.constant 128 : i64 - %c2048_i64 = arith.constant 2048 : i64 - %ub_in = pto.castptr %c0_i64 : i64 -> !pto.ptr - %ub_out = pto.castptr %c2048_i64 : i64 -> !pto.ptr - - %false = arith.constant false - pto.mte_gm_ub %arg0, %ub_in, %c0_i64, %c64_i64 - nburst(%c32_i64, %c64_i64, %c64_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 - - pto.set_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] - pto.wait_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] - - pto.vecscope { - %full_mask = pto.pset_b32 "PAT_ALL" : !pto.mask - %cvt_mask = pto.pset_b16 "PAT_ALL" : !pto.mask - // Use packed f16 load (no UNPK): PART_ODD then selects the upper 16-bit - // element from each f16 pair inside a b32 lane. 
- scf.for %offset = %c0 to %c512 step %c64 { - %loaded = pto.vlds %ub_in[%offset] : !pto.ptr -> !pto.vreg<128xf16> - %out = pto.vcvt %loaded, %cvt_mask {part = "ODD"} : !pto.vreg<128xf16>, !pto.mask -> !pto.vreg<64xf32> - pto.vsts %out, %ub_out[%offset], %full_mask : !pto.vreg<64xf32>, !pto.ptr, !pto.mask - } - } - - pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.mte_ub_gm %ub_out, %arg1, %c128_i64 - nburst(%c16_i64, %c128_i64, %c128_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64 - pto.barrier #pto.pipe - return - } -} diff --git a/test/vpto/cases/micro-op/conversion/vcvt-f16-to-f32-part-odd/launch.cpp b/test/vpto/cases/micro-op/conversion/vcvt-f16-to-f32-part-odd/launch.cpp deleted file mode 100644 index db23cbbf4..000000000 --- a/test/vpto/cases/micro-op/conversion/vcvt-f16-to-f32-part-odd/launch.cpp +++ /dev/null @@ -1,71 +0,0 @@ -// Copyright (c) 2026 Huawei Technologies Co., Ltd. -// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). -// Please refer to the License for details. You may not use this file except in compliance with the License. -// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -// See LICENSE in the root of the software repository for the full text of the License. - -// ----------------------------------------------------------------------------- -// case: micro-op/conversion/vcvt-f16-to-f32-part-odd -// family: conversion -// target_ops: pto.vcvt -// scenarios: f16-to-f32, full-mask, part-odd -// NOTE: bulk-generated coverage skeleton. Parser/verifier/lowering failure is -// still a valid test conclusion in the current coverage-first phase. 
-// ----------------------------------------------------------------------------- -// --------------------------------------------------------------------------- -// PTOAS compatibility layer -// -// The upstream pto-isa headers reference some FP8/FP4 types and the -// __VEC_SCOPE__ marker that are not available on every AICore arch/toolchain -// combination (e.g. __NPU_ARCH__==2201). -// -// For our PTOAS-generated kernels we don't rely on these types today, but the -// headers still mention them in templates/static_asserts. Provide minimal -// fallbacks to keep compilation working on dav-c220. -// --------------------------------------------------------------------------- -#ifndef __VEC_SCOPE__ -#define __VEC_SCOPE__ -#endif - -#if defined(__CCE_AICORE__) && defined(__NPU_ARCH__) && (__NPU_ARCH__ == 2201) -typedef struct { unsigned char v; } hifloat8_t; -typedef struct { unsigned char v; } float8_e4m3_t; -typedef struct { unsigned char v; } float8_e5m2_t; -typedef struct { unsigned char v; } float8_e8m0_t; -typedef struct { unsigned char v; } float4_e1m2x2_t; -typedef struct { unsigned char v; } float4_e2m1x2_t; -#endif -#include - -// AICore printf support is gated behind `--cce-enable-print` on some -// toolchains. When enabled, include the CCE print header so `cce::printf` -// resolves in device compilation. -#if defined(__CCE_AICORE__) && defined(PTOAS_ENABLE_CCE_PRINT) -#include -#endif - -// Some PTO-ISA types are only available in the __CCE_AICORE__ compilation -// path, but `bisheng -xcce` still performs a host-side parse pass. -// Provide minimal fallbacks only when the corresponding header wasn't -// pulled in by the selected arch implementation. 
-#if !defined(__CCE_AICORE__) && !defined(TMRGSORT_HPP) -struct MrgSortExecutedNumList { - uint16_t mrgSortList0; - uint16_t mrgSortList1; - uint16_t mrgSortList2; - uint16_t mrgSortList3; -}; -#endif -#ifndef __CPU_SIM -#include "acl/acl.h" -#endif - -extern "C" __global__ [aicore] void vcvt_f16_to_f32_part_odd_kernel_2d(__gm__ half *v1, - __gm__ float *v2); - -void LaunchVcvt_f16_to_f32_part_odd_kernel_2d(uint16_t *v1, float *v2, void *stream) { - vcvt_f16_to_f32_part_odd_kernel_2d<<<1, nullptr, stream>>>((__gm__ half *)v1, - (__gm__ float *)v2); -} diff --git a/test/vpto/cases/micro-op/conversion/vcvt-f16-to-f32-part-odd/main.cpp b/test/vpto/cases/micro-op/conversion/vcvt-f16-to-f32-part-odd/main.cpp deleted file mode 100644 index 567aafa0a..000000000 --- a/test/vpto/cases/micro-op/conversion/vcvt-f16-to-f32-part-odd/main.cpp +++ /dev/null @@ -1,130 +0,0 @@ -// Copyright (c) 2026 Huawei Technologies Co., Ltd. -// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). -// Please refer to the License for details. You may not use this file except in compliance with the License. -// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -// See LICENSE in the root of the software repository for the full text of the License. - -// ----------------------------------------------------------------------------- -// case: micro-op/conversion/vcvt-f16-to-f32-part-odd -// family: conversion -// target_ops: pto.vcvt -// scenarios: f16-to-f32, full-mask, part-odd -// NOTE: bulk-generated coverage skeleton. Parser/verifier/lowering failure is -// still a valid test conclusion in the current coverage-first phase. 
-// ----------------------------------------------------------------------------- -/** -Copyright (c) 2025 Huawei Technologies Co., Ltd. -This program is free software, you can redistribute it and/or modify it under the terms and conditions of -CANN Open Software License Agreement Version 2.0 (the "License"). -Please refer to the License for details. You may not use this file except in compliance with the License. -THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -See LICENSE in the root of the software repository for the full text of the License. -*/ - -#include "test_common.h" -#include "acl/acl.h" -#include -#include -#include - -using namespace PtoTestCommon; - -#ifndef TMRGSORT_HPP -struct MrgSortExecutedNumList { - uint16_t mrgSortList0; - uint16_t mrgSortList1; - uint16_t mrgSortList2; - uint16_t mrgSortList3; -}; -#endif - -#define ACL_CHECK(expr) \ - do { \ - const aclError _ret = (expr); \ - if (_ret != ACL_SUCCESS) { \ - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", #expr, (int)_ret, __FILE__, __LINE__); \ - const char *_recent = aclGetRecentErrMsg(); \ - if (_recent != nullptr && _recent[0] != '\0') { \ - std::fprintf(stderr, "[ERROR] RecentErrMsg: %s\n", _recent); \ - } \ - rc = 1; \ - goto cleanup; \ - } \ - } while (0) - -void LaunchVcvt_f16_to_f32_part_odd_kernel_2d(uint16_t *v1, float *v2, void *stream); - -int main() { - size_t elemCount_v1 = 1024; - size_t fileSize_v1 = elemCount_v1 * sizeof(uint16_t); - size_t elemCount_v2 = 512; - size_t fileSize_v2 = elemCount_v2 * sizeof(float); - uint16_t *v1Host = nullptr; - uint16_t *v1Device = nullptr; - float *v2Host = nullptr; - float *v2Device = nullptr; - - int rc = 0; - bool aclInited = false; - bool deviceSet = false; - int deviceId = 0; - aclrtStream stream = nullptr; - - ACL_CHECK(aclInit(nullptr)); - aclInited = true; - if (const 
char *envDevice = std::getenv("ACL_DEVICE_ID")) { - deviceId = std::atoi(envDevice); - } - ACL_CHECK(aclrtSetDevice(deviceId)); - deviceSet = true; - ACL_CHECK(aclrtCreateStream(&stream)); - - ACL_CHECK(aclrtMallocHost((void **)(&v1Host), fileSize_v1)); - ACL_CHECK(aclrtMallocHost((void **)(&v2Host), fileSize_v2)); - ACL_CHECK(aclrtMalloc((void **)&v1Device, fileSize_v1, ACL_MEM_MALLOC_HUGE_FIRST)); - ACL_CHECK(aclrtMalloc((void **)&v2Device, fileSize_v2, ACL_MEM_MALLOC_HUGE_FIRST)); - - ReadFile("./v1.bin", fileSize_v1, v1Host, fileSize_v1); - ReadFile("./v2.bin", fileSize_v2, v2Host, fileSize_v2); - ACL_CHECK(aclrtMemcpy(v1Device, fileSize_v1, v1Host, fileSize_v1, ACL_MEMCPY_HOST_TO_DEVICE)); - ACL_CHECK(aclrtMemcpy(v2Device, fileSize_v2, v2Host, fileSize_v2, ACL_MEMCPY_HOST_TO_DEVICE)); - LaunchVcvt_f16_to_f32_part_odd_kernel_2d(v1Device, v2Device, stream); - - ACL_CHECK(aclrtSynchronizeStream(stream)); - ACL_CHECK(aclrtMemcpy(v2Host, fileSize_v2, v2Device, fileSize_v2, ACL_MEMCPY_DEVICE_TO_HOST)); - - WriteFile("./v2.bin", v2Host, fileSize_v2); - -cleanup: - aclrtFree(v1Device); - aclrtFree(v2Device); - aclrtFreeHost(v1Host); - aclrtFreeHost(v2Host); - if (stream != nullptr) { - const aclError _ret = aclrtDestroyStream(stream); - if (_ret != ACL_SUCCESS) { - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", - "aclrtDestroyStream(stream)", (int)_ret, __FILE__, __LINE__); - } - stream = nullptr; - } - if (deviceSet) { - const aclError _ret = aclrtResetDevice(deviceId); - if (_ret != ACL_SUCCESS) { - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", - "aclrtResetDevice(deviceId)", (int)_ret, __FILE__, __LINE__); - } - } - if (aclInited) { - const aclError _ret = aclFinalize(); - if (_ret != ACL_SUCCESS) { - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", - "aclFinalize()", (int)_ret, __FILE__, __LINE__); - } - } - - return rc; -} diff --git a/test/vpto/cases/micro-op/conversion/vcvt-f16-to-f32/kernel.pto 
b/test/vpto/cases/micro-op/conversion/vcvt-f16-to-f32/kernel.pto index 5d57a648e..5744f42fb 100644 --- a/test/vpto/cases/micro-op/conversion/vcvt-f16-to-f32/kernel.pto +++ b/test/vpto/cases/micro-op/conversion/vcvt-f16-to-f32/kernel.pto @@ -1,43 +1,138 @@ module attributes {pto.target_arch = "a5", pto.kernel_kind = #pto.kernel_kind} { - func.func @vcvt_f16_to_f32_kernel_2d(%arg0: !pto.ptr, - %arg1: !pto.ptr) attributes {pto.kernel} { - %c0 = arith.constant 0 : index - %c1 = arith.constant 1 : index - %c64 = arith.constant 64 : index - %c1024 = arith.constant 1024 : index - %c0_i64 = arith.constant 0 : i64 - %c1_i64 = arith.constant 1 : i64 - %c32_i64 = arith.constant 32 : i64 - %c64_i64 = arith.constant 64 : i64 - %c128_i64 = arith.constant 128 : i64 - %c2048_i64 = arith.constant 2048 : i64 - %ub_in = pto.castptr %c0_i64 : i64 -> !pto.ptr - %ub_out = pto.castptr %c2048_i64 : i64 -> !pto.ptr - - %false = arith.constant false - pto.mte_gm_ub %arg0, %ub_in, %c0_i64, %c64_i64 - nburst(%c32_i64, %c64_i64, %c64_i64) + func.func @vcvt_f16_to_f32_deep_merged_kernel(%arg0: !pto.ptr, %arg1: !pto.ptr, %arg2: !pto.ptr, %arg3: !pto.ptr, %arg4: !pto.ptr, %arg5: !pto.ptr) attributes {pto.kernel} { + %__deep_merge_guard = arith.constant false + // inactive merged from vcvt_f16_to_f32_part_even_kernel_2d + scf.if %__deep_merge_guard { + + %c0_m0 = arith.constant 0 : index + %c1_m0 = arith.constant 1 : index + %c64_m0 = arith.constant 64 : index + %c512_m0 = arith.constant 512 : index + %c1024_m0 = arith.constant 1024 : index + %c0_i64_m0 = arith.constant 0 : i64 + %c1_i64_m0 = arith.constant 1 : i64 + %c16_i64_m0 = arith.constant 16 : i64 + %c32_i64_m0 = arith.constant 32 : i64 + %c64_i64_m0 = arith.constant 64 : i64 + %c128_i64_m0 = arith.constant 128 : i64 + %c2048_i64_m0 = arith.constant 2048 : i64 + %ub_in_m0 = pto.castptr %c0_i64_m0 : i64 -> !pto.ptr + %ub_out_m0 = pto.castptr %c2048_i64_m0 : i64 -> !pto.ptr + + %false_m0 = arith.constant false + pto.mte_gm_ub %arg0, 
%ub_in_m0, %c0_i64_m0, %c64_i64_m0 + nburst(%c32_i64_m0, %c64_i64_m0, %c64_i64_m0) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 + + pto.set_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] + pto.wait_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] + + pto.vecscope { + %full_mask_m0 = pto.pset_b32 "PAT_ALL" : !pto.mask + %cvt_mask_m0 = pto.pset_b16 "PAT_ALL" : !pto.mask + // Use packed f16 load (no UNPK): PART_EVEN selects the lower 16-bit + // element from each f16 pair inside a b32 lane. + scf.for %offset_m0 = %c0_m0 to %c512_m0 step %c64_m0 { + %loaded_m0 = pto.vlds %ub_in_m0[%offset_m0] : !pto.ptr -> !pto.vreg<128xf16> + %out_m0 = pto.vcvt %loaded_m0, %cvt_mask_m0 {part = "EVEN"} : !pto.vreg<128xf16>, !pto.mask -> !pto.vreg<64xf32> + pto.vsts %out_m0, %ub_out_m0[%offset_m0], %full_mask_m0 : !pto.vreg<64xf32>, !pto.ptr, !pto.mask + } + } + + pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] + pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] + pto.mte_ub_gm %ub_out_m0, %arg1, %c128_i64_m0 + nburst(%c16_i64_m0, %c128_i64_m0, %c128_i64_m0) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64 + pto.barrier #pto.pipe + + } + // inactive merged from vcvt_f16_to_f32_part_odd_kernel_2d + scf.if %__deep_merge_guard { + + %c0_m1 = arith.constant 0 : index + %c1_m1 = arith.constant 1 : index + %c64_m1 = arith.constant 64 : index + %c512_m1 = arith.constant 512 : index + %c1024_m1 = arith.constant 1024 : index + %c0_i64_m1 = arith.constant 0 : i64 + %c1_i64_m1 = arith.constant 1 : i64 + %c16_i64_m1 = arith.constant 16 : i64 + %c32_i64_m1 = arith.constant 32 : i64 + %c64_i64_m1 = arith.constant 64 : i64 + %c128_i64_m1 = arith.constant 128 : i64 + %c2048_i64_m1 = arith.constant 2048 : i64 + %ub_in_m1 = pto.castptr %c0_i64_m1 : i64 -> !pto.ptr + %ub_out_m1 = pto.castptr %c2048_i64_m1 : i64 -> !pto.ptr + + %false_m1 = arith.constant false + pto.mte_gm_ub %arg2, %ub_in_m1, %c0_i64_m1, %c64_i64_m1 + nburst(%c32_i64_m1, %c64_i64_m1, %c64_i64_m1) : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 
pto.set_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] pto.wait_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] pto.vecscope { - %full_mask = pto.pset_b32 "PAT_ALL" : !pto.mask - %cvt_mask = pto.pset_b16 "PAT_ALL" : !pto.mask - scf.for %offset = %c0 to %c1024 step %c64 { - %loaded = pto.vlds %ub_in[%offset] {dist = "UNPK_B16"} : !pto.ptr -> !pto.vreg<128xf16> - %out = pto.vcvt %loaded, %cvt_mask {part = "EVEN"} : !pto.vreg<128xf16>, !pto.mask -> !pto.vreg<64xf32> - pto.vsts %out, %ub_out[%offset], %full_mask : !pto.vreg<64xf32>, !pto.ptr, !pto.mask + %full_mask_m1 = pto.pset_b32 "PAT_ALL" : !pto.mask + %cvt_mask_m1 = pto.pset_b16 "PAT_ALL" : !pto.mask + // Use packed f16 load (no UNPK): PART_ODD then selects the upper 16-bit + // element from each f16 pair inside a b32 lane. + scf.for %offset_m1 = %c0_m1 to %c512_m1 step %c64_m1 { + %loaded_m1 = pto.vlds %ub_in_m1[%offset_m1] : !pto.ptr -> !pto.vreg<128xf16> + %out_m1 = pto.vcvt %loaded_m1, %cvt_mask_m1 {part = "ODD"} : !pto.vreg<128xf16>, !pto.mask -> !pto.vreg<64xf32> + pto.vsts %out_m1, %ub_out_m1[%offset_m1], %full_mask_m1 : !pto.vreg<64xf32>, !pto.ptr, !pto.mask } } pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.mte_ub_gm %ub_out, %arg1, %c128_i64 - nburst(%c32_i64, %c128_i64, %c128_i64) + pto.mte_ub_gm %ub_out_m1, %arg3, %c128_i64_m1 + nburst(%c16_i64_m1, %c128_i64_m1, %c128_i64_m1) : !pto.ptr, !pto.ptr, i64, i64, i64, i64 pto.barrier #pto.pipe + + } + // active merged from vcvt_f16_to_f32_kernel_2d + + %c0_m2 = arith.constant 0 : index + %c1_m2 = arith.constant 1 : index + %c64_m2 = arith.constant 64 : index + %c1024_m2 = arith.constant 1024 : index + %c0_i64_m2 = arith.constant 0 : i64 + %c1_i64_m2 = arith.constant 1 : i64 + %c32_i64_m2 = arith.constant 32 : i64 + %c64_i64_m2 = arith.constant 64 : i64 + %c128_i64_m2 = arith.constant 128 : i64 + %c2048_i64_m2 = arith.constant 2048 : i64 + %ub_in_m2 = pto.castptr %c0_i64_m2 : i64 -> !pto.ptr + %ub_out_m2 = 
pto.castptr %c2048_i64_m2 : i64 -> !pto.ptr + + %false_m2 = arith.constant false + pto.mte_gm_ub %arg4, %ub_in_m2, %c0_i64_m2, %c64_i64_m2 + nburst(%c32_i64_m2, %c64_i64_m2, %c64_i64_m2) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 + + pto.set_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] + pto.wait_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] + + pto.vecscope { + %full_mask_m2 = pto.pset_b32 "PAT_ALL" : !pto.mask + %cvt_mask_m2 = pto.pset_b16 "PAT_ALL" : !pto.mask + scf.for %offset_m2 = %c0_m2 to %c1024_m2 step %c64_m2 { + %loaded_m2 = pto.vlds %ub_in_m2[%offset_m2] {dist = "UNPK_B16"} : !pto.ptr -> !pto.vreg<128xf16> + %out_m2 = pto.vcvt %loaded_m2, %cvt_mask_m2 {part = "EVEN"} : !pto.vreg<128xf16>, !pto.mask -> !pto.vreg<64xf32> + pto.vsts %out_m2, %ub_out_m2[%offset_m2], %full_mask_m2 : !pto.vreg<64xf32>, !pto.ptr, !pto.mask + } + } + + pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] + pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] + pto.mte_ub_gm %ub_out_m2, %arg5, %c128_i64_m2 + nburst(%c32_i64_m2, %c128_i64_m2, %c128_i64_m2) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64 + pto.barrier #pto.pipe + return } } diff --git a/test/vpto/cases/micro-op/conversion/vcvt-f16-to-f32/launch.cpp b/test/vpto/cases/micro-op/conversion/vcvt-f16-to-f32/launch.cpp index 4998ce110..a0a4788c6 100644 --- a/test/vpto/cases/micro-op/conversion/vcvt-f16-to-f32/launch.cpp +++ b/test/vpto/cases/micro-op/conversion/vcvt-f16-to-f32/launch.cpp @@ -5,30 +5,9 @@ // THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, // INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. // See LICENSE in the root of the software repository for the full text of the License. 
- -// ----------------------------------------------------------------------------- -// case: micro-op/conversion/vcvt-f16-to-f32 -// family: conversion -// target_ops: pto.vcvt -// scenarios: f16-to-f32, full-mask -// NOTE: bulk-generated coverage skeleton. Parser/verifier/lowering failure is -// still a valid test conclusion in the current coverage-first phase. -// ----------------------------------------------------------------------------- -// --------------------------------------------------------------------------- -// PTOAS compatibility layer -// -// The upstream pto-isa headers reference some FP8/FP4 types and the -// __VEC_SCOPE__ marker that are not available on every AICore arch/toolchain -// combination (e.g. __NPU_ARCH__==2201). -// -// For our PTOAS-generated kernels we don't rely on these types today, but the -// headers still mention them in templates/static_asserts. Provide minimal -// fallbacks to keep compilation working on dav-c220. -// --------------------------------------------------------------------------- #ifndef __VEC_SCOPE__ #define __VEC_SCOPE__ #endif - #if defined(__CCE_AICORE__) && defined(__NPU_ARCH__) && (__NPU_ARCH__ == 2201) typedef struct { unsigned char v; } hifloat8_t; typedef struct { unsigned char v; } float8_e4m3_t; @@ -38,34 +17,30 @@ typedef struct { unsigned char v; } float4_e1m2x2_t; typedef struct { unsigned char v; } float4_e2m1x2_t; #endif #include - -// AICore printf support is gated behind `--cce-enable-print` on some -// toolchains. When enabled, include the CCE print header so `cce::printf` -// resolves in device compilation. #if defined(__CCE_AICORE__) && defined(PTOAS_ENABLE_CCE_PRINT) #include #endif - -// Some PTO-ISA types are only available in the __CCE_AICORE__ compilation -// path, but `bisheng -xcce` still performs a host-side parse pass. -// Provide minimal fallbacks only when the corresponding header wasn't -// pulled in by the selected arch implementation. 
#if !defined(__CCE_AICORE__) && !defined(TMRGSORT_HPP) -struct MrgSortExecutedNumList { - uint16_t mrgSortList0; - uint16_t mrgSortList1; - uint16_t mrgSortList2; - uint16_t mrgSortList3; -}; +struct MrgSortExecutedNumList { uint16_t mrgSortList0,mrgSortList1,mrgSortList2,mrgSortList3; }; #endif #ifndef __CPU_SIM #include "acl/acl.h" #endif -extern "C" __global__ [aicore] void vcvt_f16_to_f32_kernel_2d(__gm__ half *v1, - __gm__ float *v2); +extern "C" __global__ [aicore] void vcvt_f16_to_f32_deep_merged_kernel( + __gm__ half * arg0, + __gm__ float * arg1, + __gm__ half * arg2, + __gm__ float * arg3, + __gm__ half * arg4, + __gm__ float * arg5); -void LaunchVcvt_f16_to_f32_kernel_2d(uint16_t *v1, float *v2, void *stream) { - vcvt_f16_to_f32_kernel_2d<<<1, nullptr, stream>>>((__gm__ half *)v1, - (__gm__ float *)v2); +void LaunchVcvtF16ToF32DeepMerged(uint16_t * p0, float * p1, void *stream) { + vcvt_f16_to_f32_deep_merged_kernel<<<1, nullptr, stream>>>( + (__gm__ half *)p0, + (__gm__ float *)p0, + (__gm__ half *)p0, + (__gm__ float *)p0, + (__gm__ half *)p0, + (__gm__ float *)p1); } diff --git a/test/vpto/cases/micro-op/conversion/vcvt-f16-to-f32/main.cpp b/test/vpto/cases/micro-op/conversion/vcvt-f16-to-f32/main.cpp index 17f92c862..f690aa532 100644 --- a/test/vpto/cases/micro-op/conversion/vcvt-f16-to-f32/main.cpp +++ b/test/vpto/cases/micro-op/conversion/vcvt-f16-to-f32/main.cpp @@ -55,8 +55,8 @@ struct MrgSortExecutedNumList { } \ } while (0) -void LaunchVcvt_f16_to_f32_kernel_2d(uint16_t *v1, float *v2, void *stream); +void LaunchVcvtF16ToF32DeepMerged(uint16_t * p0, float * p1, void *stream); int main() { size_t elemCount_v1 = 1024; size_t fileSize_v1 = elemCount_v1 * sizeof(uint16_t); @@ -91,7 +91,7 @@ int main() { ReadFile("./v2.bin", fileSize_v2, v2Host, fileSize_v2); ACL_CHECK(aclrtMemcpy(v1Device, fileSize_v1, v1Host, fileSize_v1, ACL_MEMCPY_HOST_TO_DEVICE)); ACL_CHECK(aclrtMemcpy(v2Device, fileSize_v2, v2Host, fileSize_v2, ACL_MEMCPY_HOST_TO_DEVICE)); - 
LaunchVcvt_f16_to_f32_kernel_2d(v1Device, v2Device, stream); + LaunchVcvtF16ToF32DeepMerged(v1Device, v2Device, stream); ACL_CHECK(aclrtSynchronizeStream(stream)); ACL_CHECK(aclrtMemcpy(v2Host, fileSize_v2, v2Device, fileSize_v2, ACL_MEMCPY_DEVICE_TO_HOST)); diff --git a/test/vpto/cases/micro-op/conversion/vcvt-f32-to-f16-pk-b32/compare.py b/test/vpto/cases/micro-op/conversion/vcvt-f32-to-f16-pk-b32/compare.py deleted file mode 100644 index d2d022505..000000000 --- a/test/vpto/cases/micro-op/conversion/vcvt-f32-to-f16-pk-b32/compare.py +++ /dev/null @@ -1,40 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) 2026 Huawei Technologies Co., Ltd. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. 
- - -import os -import sys - -import numpy as np - - -def compare_bin(golden_path, output_path, dtype, eps): - if not os.path.exists(golden_path) or not os.path.exists(output_path): - return False - golden = np.fromfile(golden_path, dtype=dtype) - output = np.fromfile(output_path, dtype=dtype) - return golden.shape == output.shape and np.allclose( - golden, output, atol=eps, rtol=eps, equal_nan=True - ) - - -def main(): - strict = os.getenv("COMPARE_STRICT", "1") != "0" - ok = compare_bin("golden_v2.bin", "v2.bin", np.float16, 1e-3) - if not ok: - if strict: - print("[ERROR] compare failed") - sys.exit(2) - print("[WARN] compare failed (non-gating)") - return - print("[INFO] compare passed") - - -if __name__ == "__main__": - main() diff --git a/test/vpto/cases/micro-op/conversion/vcvt-f32-to-f16-pk-b32/golden.py b/test/vpto/cases/micro-op/conversion/vcvt-f32-to-f16-pk-b32/golden.py deleted file mode 100644 index ee8fd3890..000000000 --- a/test/vpto/cases/micro-op/conversion/vcvt-f32-to-f16-pk-b32/golden.py +++ /dev/null @@ -1,48 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) 2026 Huawei Technologies Co., Ltd. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. 
- -# case: micro-op/conversion/vcvt-f32-to-f16-pk-b32 -# family: conversion -# target_ops: pto.vcvt, pto.vsts -# scenarios: f32-to-f16, pk-b32-store, full-mask - -import argparse -from pathlib import Path - -import numpy as np - - -ELEMS = 1024 -SEED = 19 - - -def generate(output_dir: Path, seed: int) -> None: - rng = np.random.default_rng(seed) - v1 = rng.uniform(-8.0, 8.0, size=ELEMS).astype(np.float32) - v2 = np.zeros(ELEMS, dtype=np.float16) - golden_v2 = v1.astype(np.float16) - - output_dir.mkdir(parents=True, exist_ok=True) - v1.tofile(output_dir / "v1.bin") - v2.tofile(output_dir / "v2.bin") - golden_v2.tofile(output_dir / "golden_v2.bin") - - -def main() -> None: - parser = argparse.ArgumentParser( - description="Generate numpy-based inputs/golden for VPTO micro-op vcvt-f32-to-f16-pk-b32 validation." - ) - parser.add_argument("--output-dir", type=Path, default=Path(".")) - parser.add_argument("--seed", type=int, default=SEED) - args = parser.parse_args() - generate(args.output_dir, args.seed) - - -if __name__ == "__main__": - main() diff --git a/test/vpto/cases/micro-op/conversion/vcvt-f32-to-f16-pk-b32/kernel.pto b/test/vpto/cases/micro-op/conversion/vcvt-f32-to-f16-pk-b32/kernel.pto deleted file mode 100644 index 2e12d41be..000000000 --- a/test/vpto/cases/micro-op/conversion/vcvt-f32-to-f16-pk-b32/kernel.pto +++ /dev/null @@ -1,48 +0,0 @@ -// ----------------------------------------------------------------------------- -// case: micro-op/conversion/vcvt-f32-to-f16-pk-b32 -// family: conversion -// target_ops: pto.vcvt, pto.vsts -// scenarios: f32-to-f16, pk-b32-store, full-mask -// ----------------------------------------------------------------------------- -module attributes {pto.target_arch = "a5", pto.kernel_kind = #pto.kernel_kind} { - func.func @vcvt_f32_to_f16_pk_b32_kernel(%arg0: !pto.ptr, - %arg1: !pto.ptr) attributes {pto.kernel} { - %c0 = arith.constant 0 : index - %c64 = arith.constant 64 : index - %c1024 = arith.constant 1024 : index - 
%c0_i64 = arith.constant 0 : i64 - %c1_i64 = arith.constant 1 : i64 - %c32_i64 = arith.constant 32 : i64 - %c64_i64 = arith.constant 64 : i64 - %c128_i64 = arith.constant 128 : i64 - %c4096_i64 = arith.constant 4096 : i64 - - %ub_in = pto.castptr %c0_i64 : i64 -> !pto.ptr - %ub_out = pto.castptr %c4096_i64 : i64 -> !pto.ptr - - %false = arith.constant false - pto.mte_gm_ub %arg0, %ub_in, %c0_i64, %c128_i64 - nburst(%c32_i64, %c128_i64, %c128_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 - - pto.set_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] - pto.wait_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] - - pto.vecscope { - %mask = pto.pset_b32 "PAT_ALL" : !pto.mask - scf.for %offset = %c0 to %c1024 step %c64 { - %loaded = pto.vlds %ub_in[%offset] : !pto.ptr -> !pto.vreg<64xf32> - %converted = pto.vcvt %loaded, %mask {rnd = "R", sat = "SAT", part = "EVEN"} : !pto.vreg<64xf32>, !pto.mask -> !pto.vreg<128xf16> - pto.vsts %converted, %ub_out[%offset], %mask {dist = "PK_B32"} : !pto.vreg<128xf16>, !pto.ptr, !pto.mask - } - } - - pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.mte_ub_gm %ub_out, %arg1, %c64_i64 - nburst(%c32_i64, %c64_i64, %c64_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64 - pto.barrier #pto.pipe - return - } -} diff --git a/test/vpto/cases/micro-op/conversion/vcvt-f32-to-f16-pk-b32/launch.cpp b/test/vpto/cases/micro-op/conversion/vcvt-f32-to-f16-pk-b32/launch.cpp deleted file mode 100644 index 77055836f..000000000 --- a/test/vpto/cases/micro-op/conversion/vcvt-f32-to-f16-pk-b32/launch.cpp +++ /dev/null @@ -1,44 +0,0 @@ -// Copyright (c) 2026 Huawei Technologies Co., Ltd. -// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). -// Please refer to the License for details. You may not use this file except in compliance with the License. 
-// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -// See LICENSE in the root of the software repository for the full text of the License. - -#ifndef __VEC_SCOPE__ -#define __VEC_SCOPE__ -#endif - -#if defined(__CCE_AICORE__) && defined(__NPU_ARCH__) && (__NPU_ARCH__ == 2201) -typedef struct { unsigned char v; } hifloat8_t; -typedef struct { unsigned char v; } float8_e4m3_t; -typedef struct { unsigned char v; } float8_e5m2_t; -typedef struct { unsigned char v; } float8_e8m0_t; -typedef struct { unsigned char v; } float4_e1m2x2_t; -typedef struct { unsigned char v; } float4_e2m1x2_t; -#endif -#include - -#if defined(__CCE_AICORE__) && defined(PTOAS_ENABLE_CCE_PRINT) -#include -#endif - -#if !defined(__CCE_AICORE__) && !defined(TMRGSORT_HPP) -struct MrgSortExecutedNumList { - uint16_t mrgSortList0; - uint16_t mrgSortList1; - uint16_t mrgSortList2; - uint16_t mrgSortList3; -}; -#endif -#ifndef __CPU_SIM -#include "acl/acl.h" -#endif - -extern "C" __global__ [aicore] void vcvt_f32_to_f16_pk_b32_kernel(__gm__ float *v1, - __gm__ half *v2); - -void LaunchVcvt_f32_to_f16_pk_b32_kernel(float *v1, aclFloat16 *v2, void *stream) { - vcvt_f32_to_f16_pk_b32_kernel<<<1, nullptr, stream>>>((__gm__ float *)v1, (__gm__ half *)v2); -} diff --git a/test/vpto/cases/micro-op/conversion/vcvt-f32-to-f16-pk-b32/main.cpp b/test/vpto/cases/micro-op/conversion/vcvt-f32-to-f16-pk-b32/main.cpp deleted file mode 100644 index 8b7886671..000000000 --- a/test/vpto/cases/micro-op/conversion/vcvt-f32-to-f16-pk-b32/main.cpp +++ /dev/null @@ -1,122 +0,0 @@ -// Copyright (c) 2026 Huawei Technologies Co., Ltd. -// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). -// Please refer to the License for details. 
You may not use this file except in compliance with the License. -// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -// See LICENSE in the root of the software repository for the full text of the License. - -/** -Copyright (c) 2025 Huawei Technologies Co., Ltd. -This program is free software, you can redistribute it and/or modify it under the terms and conditions of -CANN Open Software License Agreement Version 2.0 (the "License"). -Please refer to the License for details. You may not use this file except in compliance with the License. -THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -See LICENSE in the root of the software repository for the full text of the License. 
-*/ - -#include "test_common.h" -#include "acl/acl.h" -#include -#include -#include - -using namespace PtoTestCommon; - -#ifndef TMRGSORT_HPP -struct MrgSortExecutedNumList { - uint16_t mrgSortList0; - uint16_t mrgSortList1; - uint16_t mrgSortList2; - uint16_t mrgSortList3; -}; -#endif - -#define ACL_CHECK(expr) \ - do { \ - const aclError _ret = (expr); \ - if (_ret != ACL_SUCCESS) { \ - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", #expr, (int)_ret, __FILE__, __LINE__); \ - const char *_recent = aclGetRecentErrMsg(); \ - if (_recent != nullptr && _recent[0] != '\0') { \ - std::fprintf(stderr, "[ERROR] RecentErrMsg: %s\n", _recent); \ - } \ - rc = 1; \ - goto cleanup; \ - } \ - } while (0) - -void LaunchVcvt_f32_to_f16_pk_b32_kernel(float *v1, aclFloat16 *v2, void *stream); - -int main() { - size_t elemCount_v1 = 1024; - size_t fileSize_v1 = elemCount_v1 * sizeof(float); - size_t elemCount_v2 = 1024; - size_t fileSize_v2 = elemCount_v2 * sizeof(aclFloat16); - float *v1Host = nullptr; - float *v1Device = nullptr; - aclFloat16 *v2Host = nullptr; - aclFloat16 *v2Device = nullptr; - - int rc = 0; - bool aclInited = false; - bool deviceSet = false; - int deviceId = 0; - aclrtStream stream = nullptr; - - ACL_CHECK(aclInit(nullptr)); - aclInited = true; - if (const char *envDevice = std::getenv("ACL_DEVICE_ID")) { - deviceId = std::atoi(envDevice); - } - ACL_CHECK(aclrtSetDevice(deviceId)); - deviceSet = true; - ACL_CHECK(aclrtCreateStream(&stream)); - - ACL_CHECK(aclrtMallocHost((void **)(&v1Host), fileSize_v1)); - ACL_CHECK(aclrtMallocHost((void **)(&v2Host), fileSize_v2)); - ACL_CHECK(aclrtMalloc((void **)&v1Device, fileSize_v1, ACL_MEM_MALLOC_HUGE_FIRST)); - ACL_CHECK(aclrtMalloc((void **)&v2Device, fileSize_v2, ACL_MEM_MALLOC_HUGE_FIRST)); - - ReadFile("./v1.bin", fileSize_v1, v1Host, fileSize_v1); - ReadFile("./v2.bin", fileSize_v2, v2Host, fileSize_v2); - ACL_CHECK(aclrtMemcpy(v1Device, fileSize_v1, v1Host, fileSize_v1, ACL_MEMCPY_HOST_TO_DEVICE)); - 
ACL_CHECK(aclrtMemcpy(v2Device, fileSize_v2, v2Host, fileSize_v2, ACL_MEMCPY_HOST_TO_DEVICE)); - LaunchVcvt_f32_to_f16_pk_b32_kernel(v1Device, v2Device, stream); - - ACL_CHECK(aclrtSynchronizeStream(stream)); - ACL_CHECK(aclrtMemcpy(v2Host, fileSize_v2, v2Device, fileSize_v2, ACL_MEMCPY_DEVICE_TO_HOST)); - - WriteFile("./v2.bin", v2Host, fileSize_v2); - -cleanup: - aclrtFree(v1Device); - aclrtFree(v2Device); - aclrtFreeHost(v1Host); - aclrtFreeHost(v2Host); - if (stream != nullptr) { - const aclError _ret = aclrtDestroyStream(stream); - if (_ret != ACL_SUCCESS) { - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", - "aclrtDestroyStream(stream)", (int)_ret, __FILE__, __LINE__); - } - stream = nullptr; - } - if (deviceSet) { - const aclError _ret = aclrtResetDevice(deviceId); - if (_ret != ACL_SUCCESS) { - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", - "aclrtResetDevice(deviceId)", (int)_ret, __FILE__, __LINE__); - } - } - if (aclInited) { - const aclError _ret = aclFinalize(); - if (_ret != ACL_SUCCESS) { - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", - "aclFinalize()", (int)_ret, __FILE__, __LINE__); - } - } - - return rc; -} diff --git a/test/vpto/cases/micro-op/conversion/vcvt-f32-to-f16/kernel.pto b/test/vpto/cases/micro-op/conversion/vcvt-f32-to-f16/kernel.pto index 01ab9c588..e78d165de 100644 --- a/test/vpto/cases/micro-op/conversion/vcvt-f32-to-f16/kernel.pto +++ b/test/vpto/cases/micro-op/conversion/vcvt-f32-to-f16/kernel.pto @@ -1,52 +1,202 @@ module attributes {pto.target_arch = "a5", pto.kernel_kind = #pto.kernel_kind} { - func.func @vcvt_f32_to_f16_kernel_2d(%arg0: !pto.ptr, - %arg1: !pto.ptr) attributes {pto.kernel} { - %c0 = arith.constant 0 : index - %c1 = arith.constant 1 : index - %c64 = arith.constant 64 : index - %c128 = arith.constant 128 : index - %c1024 = arith.constant 1024 : index - %c0_i64 = arith.constant 0 : i64 - %c1_i64 = arith.constant 1 : i64 - %c32_i64 = arith.constant 32 : i64 - %c64_i64 = arith.constant 
64 : i64 - %c128_i64 = arith.constant 128 : i64 - %c4096_i64 = arith.constant 4096 : i64 - %c1024_i32 = arith.constant 1024 : i32 - - %ub_in = pto.castptr %c0_i64 : i64 -> !pto.ptr - %ub_out = pto.castptr %c4096_i64 : i64 -> !pto.ptr - - %false = arith.constant false - pto.mte_gm_ub %arg0, %ub_in, %c0_i64, %c128_i64 - nburst(%c32_i64, %c128_i64, %c128_i64) + func.func @vcvt_f32_to_f16_deep_merged_kernel(%arg0: !pto.ptr, %arg1: !pto.ptr, %arg2: !pto.ptr, %arg3: !pto.ptr) attributes {pto.kernel} { + %__deep_merge_guard = arith.constant false + // inactive merged from vcvt_f32_to_f16_pk_b32_kernel + scf.if %__deep_merge_guard { + + %c0_m0 = arith.constant 0 : index + %c64_m0 = arith.constant 64 : index + %c1024_m0 = arith.constant 1024 : index + %c0_i64_m0 = arith.constant 0 : i64 + %c1_i64_m0 = arith.constant 1 : i64 + %c32_i64_m0 = arith.constant 32 : i64 + %c64_i64_m0 = arith.constant 64 : i64 + %c128_i64_m0 = arith.constant 128 : i64 + %c4096_i64_m0 = arith.constant 4096 : i64 + + %ub_in_m0 = pto.castptr %c0_i64_m0 : i64 -> !pto.ptr + %ub_out_m0 = pto.castptr %c4096_i64_m0 : i64 -> !pto.ptr + + %false_m0 = arith.constant false + pto.mte_gm_ub %arg0, %ub_in_m0, %c0_i64_m0, %c128_i64_m0 + nburst(%c32_i64_m0, %c128_i64_m0, %c128_i64_m0) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 + + pto.set_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] + pto.wait_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] + + pto.vecscope { + %mask_m0 = pto.pset_b32 "PAT_ALL" : !pto.mask + scf.for %offset_m0 = %c0_m0 to %c1024_m0 step %c64_m0 { + %loaded_m0 = pto.vlds %ub_in_m0[%offset_m0] : !pto.ptr -> !pto.vreg<64xf32> + %converted_m0 = pto.vcvt %loaded_m0, %mask_m0 {rnd = "R", sat = "SAT", part = "EVEN"} : !pto.vreg<64xf32>, !pto.mask -> !pto.vreg<128xf16> + pto.vsts %converted_m0, %ub_out_m0[%offset_m0], %mask_m0 {dist = "PK_B32"} : !pto.vreg<128xf16>, !pto.ptr, !pto.mask + } + } + + pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] + pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] + pto.mte_ub_gm 
%ub_out_m0, %arg1, %c64_i64_m0 + nburst(%c32_i64_m0, %c64_i64_m0, %c64_i64_m0) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64 + pto.barrier #pto.pipe + + } + // active merged from vcvt_f32_to_f16_kernel_2d + + %c0_m1 = arith.constant 0 : index + %c1_m1 = arith.constant 1 : index + %c64_m1 = arith.constant 64 : index + %c128_m1 = arith.constant 128 : index + %c1024_m1 = arith.constant 1024 : index + %c0_i64_m1 = arith.constant 0 : i64 + %c1_i64_m1 = arith.constant 1 : i64 + %c32_i64_m1 = arith.constant 32 : i64 + %c64_i64_m1 = arith.constant 64 : i64 + %c128_i64_m1 = arith.constant 128 : i64 + %c4096_i64_m1 = arith.constant 4096 : i64 + %c1024_i32_m1 = arith.constant 1024 : i32 + + %ub_in_m1 = pto.castptr %c0_i64_m1 : i64 -> !pto.ptr + %ub_out_m1 = pto.castptr %c4096_i64_m1 : i64 -> !pto.ptr + + %false_m1 = arith.constant false + pto.mte_gm_ub %arg2, %ub_in_m1, %c0_i64_m1, %c128_i64_m1 + nburst(%c32_i64_m1, %c128_i64_m1, %c128_i64_m1) : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 pto.set_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] pto.wait_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] pto.vecscope { - %_:1 = scf.for %offset = %c0 to %c1024 step %c128 iter_args(%remaining = %c1024_i32) -> (i32) { - %mask, %next_remaining = pto.plt_b16 %remaining : i32 -> !pto.mask, i32 - %lower_mask, %upper_remaining = pto.plt_b32 %remaining : i32 -> !pto.mask, i32 - %upper_mask, %_next_b32 = pto.plt_b32 %upper_remaining : i32 -> !pto.mask, i32 - %upper_offset = arith.addi %offset, %c64 : index - %lower = pto.vlds %ub_in[%offset] : !pto.ptr -> !pto.vreg<64xf32> - %upper = pto.vlds %ub_in[%upper_offset] : !pto.ptr -> !pto.vreg<64xf32> - %even = pto.vcvt %lower, %lower_mask {rnd = "R", sat = "SAT", part = "EVEN"} : !pto.vreg<64xf32>, !pto.mask -> !pto.vreg<128xf16> - %odd = pto.vcvt %upper, %upper_mask {rnd = "R", sat = "SAT", part = "ODD"} : !pto.vreg<64xf32>, !pto.mask -> !pto.vreg<128xf16> - %merged = pto.vor %even, %odd, %mask : !pto.vreg<128xf16>, !pto.vreg<128xf16>, !pto.mask -> 
!pto.vreg<128xf16> - pto.vsts %merged, %ub_out[%offset], %mask : !pto.vreg<128xf16>, !pto.ptr, !pto.mask - scf.yield %next_remaining : i32 + %__m1:1 = scf.for %offset_m1 = %c0_m1 to %c1024_m1 step %c128_m1 iter_args(%remaining_m1 = %c1024_i32_m1) -> (i32) { + %mask_m1, %next_remaining_m1 = pto.plt_b16 %remaining_m1 : i32 -> !pto.mask, i32 + %lower_mask_m1, %upper_remaining_m1 = pto.plt_b32 %remaining_m1 : i32 -> !pto.mask, i32 + %upper_mask_m1, %_next_b32_m1 = pto.plt_b32 %upper_remaining_m1 : i32 -> !pto.mask, i32 + %upper_offset_m1 = arith.addi %offset_m1, %c64_m1 : index + %lower_m1 = pto.vlds %ub_in_m1[%offset_m1] : !pto.ptr -> !pto.vreg<64xf32> + %upper_m1 = pto.vlds %ub_in_m1[%upper_offset_m1] : !pto.ptr -> !pto.vreg<64xf32> + %even_m1 = pto.vcvt %lower_m1, %lower_mask_m1 {rnd = "R", sat = "SAT", part = "EVEN"} : !pto.vreg<64xf32>, !pto.mask -> !pto.vreg<128xf16> + %odd_m1 = pto.vcvt %upper_m1, %upper_mask_m1 {rnd = "R", sat = "SAT", part = "ODD"} : !pto.vreg<64xf32>, !pto.mask -> !pto.vreg<128xf16> + %merged_m1 = pto.vor %even_m1, %odd_m1, %mask_m1 : !pto.vreg<128xf16>, !pto.vreg<128xf16>, !pto.mask -> !pto.vreg<128xf16> + pto.vsts %merged_m1, %ub_out_m1[%offset_m1], %mask_m1 : !pto.vreg<128xf16>, !pto.ptr, !pto.mask + scf.yield %next_remaining_m1 : i32 } } pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.mte_ub_gm %ub_out, %arg1, %c64_i64 - nburst(%c32_i64, %c64_i64, %c64_i64) + pto.mte_ub_gm %ub_out_m1, %arg3, %c64_i64_m1 + nburst(%c32_i64_m1, %c64_i64_m1, %c64_i64_m1) : !pto.ptr, !pto.ptr, i64, i64, i64, i64 pto.barrier #pto.pipe + %__case_merge_guard = arith.constant false + + // inactive cross-case merge from micro-op/conversion/vcvt-tail + scf.if %__case_merge_guard { + + %__deep_merge_guard_cmg5_1 = arith.constant false + // inactive merged from vcvt_tail_special_kernel_2d + scf.if %__deep_merge_guard_cmg5_1 { + + %c0_m0_cmg5_1 = arith.constant 0 : index + %c1_m0_cmg5_1 = arith.constant 1 : index 
+ %c64_m0_cmg5_1 = arith.constant 64 : index + %c128_m0_cmg5_1 = arith.constant 128 : index + %c1024_m0_cmg5_1 = arith.constant 1024 : index + %c0_i64_m0_cmg5_1 = arith.constant 0 : i64 + %c1_i64_m0_cmg5_1 = arith.constant 1 : i64 + %c32_i64_m0_cmg5_1 = arith.constant 32 : i64 + %c64_i64_m0_cmg5_1 = arith.constant 64 : i64 + %c128_i64_m0_cmg5_1 = arith.constant 128 : i64 + %c4096_i64_m0_cmg5_1 = arith.constant 4096 : i64 + %c1000_i32_m0_cmg5_1 = arith.constant 1000 : i32 + + %ub_in_m0_cmg5_1 = pto.castptr %c0_i64_m0_cmg5_1 : i64 -> !pto.ptr + %ub_out_m0_cmg5_1 = pto.castptr %c4096_i64_m0_cmg5_1 : i64 -> !pto.ptr + + %false_m0_cmg5_1 = arith.constant false + pto.mte_gm_ub %arg0, %ub_in_m0_cmg5_1, %c0_i64_m0_cmg5_1, %c128_i64_m0_cmg5_1 + nburst(%c32_i64_m0_cmg5_1, %c128_i64_m0_cmg5_1, %c128_i64_m0_cmg5_1) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 + + pto.set_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] + pto.wait_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] + + pto.vecscope { + %__m0_cmg5_1:1 = scf.for %offset_m0_cmg5_1 = %c0_m0_cmg5_1 to %c1024_m0_cmg5_1 step %c128_m0_cmg5_1 iter_args(%remaining_m0_cmg5_1 = %c1000_i32_m0_cmg5_1) -> (i32) { + %mask_m0_cmg5_1, %next_remaining_m0_cmg5_1 = pto.plt_b16 %remaining_m0_cmg5_1 : i32 -> !pto.mask, i32 + %lower_mask_m0_cmg5_1, %upper_remaining_m0_cmg5_1 = pto.plt_b32 %remaining_m0_cmg5_1 : i32 -> !pto.mask, i32 + %upper_mask_m0_cmg5_1, %_next_b32_m0_cmg5_1 = pto.plt_b32 %upper_remaining_m0_cmg5_1 : i32 -> !pto.mask, i32 + %upper_offset_m0_cmg5_1 = arith.addi %offset_m0_cmg5_1, %c64_m0_cmg5_1 : index + %lower_m0_cmg5_1 = pto.vlds %ub_in_m0_cmg5_1[%offset_m0_cmg5_1] : !pto.ptr -> !pto.vreg<64xf32> + %upper_m0_cmg5_1 = pto.vlds %ub_in_m0_cmg5_1[%upper_offset_m0_cmg5_1] : !pto.ptr -> !pto.vreg<64xf32> + %even_m0_cmg5_1 = pto.vcvt %lower_m0_cmg5_1, %lower_mask_m0_cmg5_1 {rnd = "R", sat = "SAT", part = "EVEN"} : !pto.vreg<64xf32>, !pto.mask -> !pto.vreg<128xf16> + %odd_m0_cmg5_1 = pto.vcvt %upper_m0_cmg5_1, %upper_mask_m0_cmg5_1 {rnd 
= "R", sat = "SAT", part = "ODD"} : !pto.vreg<64xf32>, !pto.mask -> !pto.vreg<128xf16> + %merged_m0_cmg5_1 = pto.vor %even_m0_cmg5_1, %odd_m0_cmg5_1, %mask_m0_cmg5_1 : !pto.vreg<128xf16>, !pto.vreg<128xf16>, !pto.mask -> !pto.vreg<128xf16> + pto.vsts %merged_m0_cmg5_1, %ub_out_m0_cmg5_1[%offset_m0_cmg5_1], %mask_m0_cmg5_1 : !pto.vreg<128xf16>, !pto.ptr, !pto.mask + scf.yield %next_remaining_m0_cmg5_1 : i32 + } + } + + pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] + pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] + pto.mte_ub_gm %ub_out_m0_cmg5_1, %arg1, %c64_i64_m0_cmg5_1 + nburst(%c32_i64_m0_cmg5_1, %c64_i64_m0_cmg5_1, %c64_i64_m0_cmg5_1) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64 + pto.barrier #pto.pipe + + } + // active merged from vcvt_tail_kernel_2d + + %c0_m1_cmg5_1 = arith.constant 0 : index + %c1_m1_cmg5_1 = arith.constant 1 : index + %c64_m1_cmg5_1 = arith.constant 64 : index + %c128_m1_cmg5_1 = arith.constant 128 : index + %c1024_m1_cmg5_1 = arith.constant 1024 : index + %c0_i64_m1_cmg5_1 = arith.constant 0 : i64 + %c1_i64_m1_cmg5_1 = arith.constant 1 : i64 + %c32_i64_m1_cmg5_1 = arith.constant 32 : i64 + %c64_i64_m1_cmg5_1 = arith.constant 64 : i64 + %c128_i64_m1_cmg5_1 = arith.constant 128 : i64 + %c4096_i64_m1_cmg5_1 = arith.constant 4096 : i64 + %c1000_i32_m1_cmg5_1 = arith.constant 1000 : i32 + + %ub_in_m1_cmg5_1 = pto.castptr %c0_i64_m1_cmg5_1 : i64 -> !pto.ptr + %ub_out_m1_cmg5_1 = pto.castptr %c4096_i64_m1_cmg5_1 : i64 -> !pto.ptr + + %false_m1_cmg5_1 = arith.constant false + pto.mte_gm_ub %arg2, %ub_in_m1_cmg5_1, %c0_i64_m1_cmg5_1, %c128_i64_m1_cmg5_1 + nburst(%c32_i64_m1_cmg5_1, %c128_i64_m1_cmg5_1, %c128_i64_m1_cmg5_1) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 + + pto.set_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] + pto.wait_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] + + pto.vecscope { + %__m1_cmg5_1:1 = scf.for %offset_m1_cmg5_1 = %c0_m1_cmg5_1 to %c1024_m1_cmg5_1 step %c128_m1_cmg5_1 iter_args(%remaining_m1_cmg5_1 = 
%c1000_i32_m1_cmg5_1) -> (i32) { + %mask_m1_cmg5_1, %next_remaining_m1_cmg5_1 = pto.plt_b16 %remaining_m1_cmg5_1 : i32 -> !pto.mask, i32 + %lower_mask_m1_cmg5_1, %upper_remaining_m1_cmg5_1 = pto.plt_b32 %remaining_m1_cmg5_1 : i32 -> !pto.mask, i32 + %upper_mask_m1_cmg5_1, %_next_b32_m1_cmg5_1 = pto.plt_b32 %upper_remaining_m1_cmg5_1 : i32 -> !pto.mask, i32 + %upper_offset_m1_cmg5_1 = arith.addi %offset_m1_cmg5_1, %c64_m1_cmg5_1 : index + %lower_m1_cmg5_1 = pto.vlds %ub_in_m1_cmg5_1[%offset_m1_cmg5_1] : !pto.ptr -> !pto.vreg<64xf32> + %upper_m1_cmg5_1 = pto.vlds %ub_in_m1_cmg5_1[%upper_offset_m1_cmg5_1] : !pto.ptr -> !pto.vreg<64xf32> + %even_m1_cmg5_1 = pto.vcvt %lower_m1_cmg5_1, %lower_mask_m1_cmg5_1 {rnd = "R", sat = "SAT", part = "EVEN"} : !pto.vreg<64xf32>, !pto.mask -> !pto.vreg<128xf16> + %odd_m1_cmg5_1 = pto.vcvt %upper_m1_cmg5_1, %upper_mask_m1_cmg5_1 {rnd = "R", sat = "SAT", part = "ODD"} : !pto.vreg<64xf32>, !pto.mask -> !pto.vreg<128xf16> + %merged_m1_cmg5_1 = pto.vor %even_m1_cmg5_1, %odd_m1_cmg5_1, %mask_m1_cmg5_1 : !pto.vreg<128xf16>, !pto.vreg<128xf16>, !pto.mask -> !pto.vreg<128xf16> + pto.vsts %merged_m1_cmg5_1, %ub_out_m1_cmg5_1[%offset_m1_cmg5_1], %mask_m1_cmg5_1 : !pto.vreg<128xf16>, !pto.ptr, !pto.mask + scf.yield %next_remaining_m1_cmg5_1 : i32 + } + } + + pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] + pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] + pto.mte_ub_gm %ub_out_m1_cmg5_1, %arg3, %c64_i64_m1_cmg5_1 + nburst(%c32_i64_m1_cmg5_1, %c64_i64_m1_cmg5_1, %c64_i64_m1_cmg5_1) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64 + pto.barrier #pto.pipe + } + return } } diff --git a/test/vpto/cases/micro-op/conversion/vcvt-f32-to-f16/launch.cpp b/test/vpto/cases/micro-op/conversion/vcvt-f32-to-f16/launch.cpp index 8dcc00348..9ca5830c4 100644 --- a/test/vpto/cases/micro-op/conversion/vcvt-f32-to-f16/launch.cpp +++ b/test/vpto/cases/micro-op/conversion/vcvt-f32-to-f16/launch.cpp @@ -5,30 +5,9 @@ // THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, 
WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, // INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. // See LICENSE in the root of the software repository for the full text of the License. - -// ----------------------------------------------------------------------------- -// case: micro-op/conversion/vcvt-f32-to-f16 -// family: conversion -// target_ops: pto.vcvt -// scenarios: f32-to-f16, full-mask -// NOTE: bulk-generated coverage skeleton. Parser/verifier/lowering failure is -// still a valid test conclusion in the current coverage-first phase. -// ----------------------------------------------------------------------------- -// --------------------------------------------------------------------------- -// PTOAS compatibility layer -// -// The upstream pto-isa headers reference some FP8/FP4 types and the -// __VEC_SCOPE__ marker that are not available on every AICore arch/toolchain -// combination (e.g. __NPU_ARCH__==2201). -// -// For our PTOAS-generated kernels we don't rely on these types today, but the -// headers still mention them in templates/static_asserts. Provide minimal -// fallbacks to keep compilation working on dav-c220. -// --------------------------------------------------------------------------- #ifndef __VEC_SCOPE__ #define __VEC_SCOPE__ #endif - #if defined(__CCE_AICORE__) && defined(__NPU_ARCH__) && (__NPU_ARCH__ == 2201) typedef struct { unsigned char v; } hifloat8_t; typedef struct { unsigned char v; } float8_e4m3_t; @@ -38,34 +17,26 @@ typedef struct { unsigned char v; } float4_e1m2x2_t; typedef struct { unsigned char v; } float4_e2m1x2_t; #endif #include - -// AICore printf support is gated behind `--cce-enable-print` on some -// toolchains. When enabled, include the CCE print header so `cce::printf` -// resolves in device compilation. 
#if defined(__CCE_AICORE__) && defined(PTOAS_ENABLE_CCE_PRINT) #include #endif - -// Some PTO-ISA types are only available in the __CCE_AICORE__ compilation -// path, but `bisheng -xcce` still performs a host-side parse pass. -// Provide minimal fallbacks only when the corresponding header wasn't -// pulled in by the selected arch implementation. #if !defined(__CCE_AICORE__) && !defined(TMRGSORT_HPP) -struct MrgSortExecutedNumList { - uint16_t mrgSortList0; - uint16_t mrgSortList1; - uint16_t mrgSortList2; - uint16_t mrgSortList3; -}; +struct MrgSortExecutedNumList { uint16_t mrgSortList0,mrgSortList1,mrgSortList2,mrgSortList3; }; #endif #ifndef __CPU_SIM #include "acl/acl.h" #endif -extern "C" __global__ [aicore] void vcvt_f32_to_f16_kernel_2d(__gm__ float *v1, - __gm__ half *v2); +extern "C" __global__ [aicore] void vcvt_f32_to_f16_deep_merged_kernel( + __gm__ float * arg0, + __gm__ half * arg1, + __gm__ float * arg2, + __gm__ half * arg3); -void LaunchVcvt_f32_to_f16_kernel_2d(float *v1, uint16_t *v2, void *stream) { - vcvt_f32_to_f16_kernel_2d<<<1, nullptr, stream>>>((__gm__ float *)v1, - (__gm__ half *)v2); +void LaunchVcvtF32ToF16DeepMerged(float * p0, uint16_t * p1, void *stream) { + vcvt_f32_to_f16_deep_merged_kernel<<<1, nullptr, stream>>>( + (__gm__ float *)p0, + (__gm__ half *)p0, + (__gm__ float *)p0, + (__gm__ half *)p1); } diff --git a/test/vpto/cases/micro-op/conversion/vcvt-f32-to-f16/main.cpp b/test/vpto/cases/micro-op/conversion/vcvt-f32-to-f16/main.cpp index cf7a6c2de..dd189117a 100644 --- a/test/vpto/cases/micro-op/conversion/vcvt-f32-to-f16/main.cpp +++ b/test/vpto/cases/micro-op/conversion/vcvt-f32-to-f16/main.cpp @@ -55,8 +55,8 @@ struct MrgSortExecutedNumList { } \ } while (0) -void LaunchVcvt_f32_to_f16_kernel_2d(float *v1, uint16_t *v2, void *stream); +void LaunchVcvtF32ToF16DeepMerged(float * p0, uint16_t * p1, void *stream); int main() { size_t elemCount_v1 = 1024; size_t fileSize_v1 = elemCount_v1 * sizeof(float); @@ -91,7 +91,7 @@ int 
main() { ReadFile("./v2.bin", fileSize_v2, v2Host, fileSize_v2); ACL_CHECK(aclrtMemcpy(v1Device, fileSize_v1, v1Host, fileSize_v1, ACL_MEMCPY_HOST_TO_DEVICE)); ACL_CHECK(aclrtMemcpy(v2Device, fileSize_v2, v2Host, fileSize_v2, ACL_MEMCPY_HOST_TO_DEVICE)); - LaunchVcvt_f32_to_f16_kernel_2d(v1Device, v2Device, stream); + LaunchVcvtF32ToF16DeepMerged(v1Device, v2Device, stream); ACL_CHECK(aclrtSynchronizeStream(stream)); ACL_CHECK(aclrtMemcpy(v2Host, fileSize_v2, v2Device, fileSize_v2, ACL_MEMCPY_DEVICE_TO_HOST)); diff --git a/test/vpto/cases/micro-op/conversion/vcvt-tail-special/compare.py b/test/vpto/cases/micro-op/conversion/vcvt-tail-special/compare.py deleted file mode 100644 index 166196a8e..000000000 --- a/test/vpto/cases/micro-op/conversion/vcvt-tail-special/compare.py +++ /dev/null @@ -1,43 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) 2026 Huawei Technologies Co., Ltd. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. 
- - -import os -import sys - -import numpy as np - - -LOGICAL_ELEMS = 1000 - - -def compare_bin_prefix(golden_path, output_path, dtype, eps, count): - if not os.path.exists(golden_path) or not os.path.exists(output_path): - return False - golden = np.fromfile(golden_path, dtype=dtype, count=count) - output = np.fromfile(output_path, dtype=dtype, count=count) - return golden.shape == output.shape and np.allclose( - golden, output, atol=eps, rtol=eps, equal_nan=True - ) - - -def main(): - strict = os.getenv("COMPARE_STRICT", "1") != "0" - ok = compare_bin_prefix("golden_v2.bin", "v2.bin", np.float16, 1e-3, LOGICAL_ELEMS) - if not ok: - if strict: - print("[ERROR] compare failed") - sys.exit(2) - print("[WARN] compare failed (non-gating)") - return - print("[INFO] compare passed") - - -if __name__ == "__main__": - main() diff --git a/test/vpto/cases/micro-op/conversion/vcvt-tail-special/golden.py b/test/vpto/cases/micro-op/conversion/vcvt-tail-special/golden.py deleted file mode 100755 index 0b4417baf..000000000 --- a/test/vpto/cases/micro-op/conversion/vcvt-tail-special/golden.py +++ /dev/null @@ -1,95 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) 2026 Huawei Technologies Co., Ltd. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. - -# case: micro-op/conversion/vcvt-tail-special -# family: conversion -# target_ops: pto.vcvt -# scenarios: f32-to-f16, tail-mask, exceptional-values -# NOTE: bulk-generated coverage skeleton. 
-# coding=utf-8 - -import argparse -from pathlib import Path - -import numpy as np - - -ROWS = 32 -COLS = 32 -SEED = 19 -LOGICAL_ELEMS = 1000 -F16_MAX_FINITE = np.float32(65504.0) - - -def sat_cast_f32_to_f16(values: np.ndarray) -> np.ndarray: - values = np.where(np.isnan(values), np.float32(0.0), values) - values = np.clip(values, -F16_MAX_FINITE, F16_MAX_FINITE) - return values.astype(np.float16) - - -def generate(output_dir: Path, seed: int) -> None: - del seed - special = np.array( - [ - 0.0, - -0.0, - 1.0, - -1.0, - np.inf, - -np.inf, - np.nan, - 65504.0, - -65504.0, - 1.0e-8, - -1.0e-8, - 1.0e-4, - -1.0e-4, - 123.75, - -123.75, - 0.33333334, - ], - dtype=np.float32, - ) - flat = np.resize(special, ROWS * COLS).astype(np.float32) - flat[LOGICAL_ELEMS:] = 0.0 - v1 = flat.reshape(ROWS, COLS) - v2 = np.zeros((ROWS, COLS), dtype=np.float16) - golden_flat = np.zeros(ROWS * COLS, dtype=np.float16) - - remaining = LOGICAL_ELEMS - for offset in range(0, ROWS * COLS, 128): - lower = sat_cast_f32_to_f16(flat[offset : offset + 64]) - upper = sat_cast_f32_to_f16(flat[offset + 64 : offset + 128]) - merged = np.empty(128, dtype=np.float16) - merged[0::2] = lower - merged[1::2] = upper - active = min(remaining, 128) - golden_flat[offset : offset + active] = merged[:active] - remaining = max(remaining - 128, 0) - - golden_v2 = golden_flat.reshape(ROWS, COLS) - - output_dir.mkdir(parents=True, exist_ok=True) - v1.reshape(-1).tofile(output_dir / "v1.bin") - v2.reshape(-1).tofile(output_dir / "v2.bin") - golden_v2.reshape(-1).tofile(output_dir / "golden_v2.bin") - - -def main() -> None: - parser = argparse.ArgumentParser( - description="Generate numpy-based inputs/golden for VPTO micro-op vcvt-tail-special validation." 
- ) - parser.add_argument("--output-dir", type=Path, default=Path(".")) - parser.add_argument("--seed", type=int, default=SEED) - args = parser.parse_args() - generate(args.output_dir, args.seed) - - -if __name__ == "__main__": - main() diff --git a/test/vpto/cases/micro-op/conversion/vcvt-tail-special/kernel.pto b/test/vpto/cases/micro-op/conversion/vcvt-tail-special/kernel.pto deleted file mode 100644 index 2b5cebf1b..000000000 --- a/test/vpto/cases/micro-op/conversion/vcvt-tail-special/kernel.pto +++ /dev/null @@ -1,52 +0,0 @@ -module attributes {pto.target_arch = "a5", pto.kernel_kind = #pto.kernel_kind} { - func.func @vcvt_tail_special_kernel_2d(%arg0: !pto.ptr, - %arg1: !pto.ptr) attributes {pto.kernel} { - %c0 = arith.constant 0 : index - %c1 = arith.constant 1 : index - %c64 = arith.constant 64 : index - %c128 = arith.constant 128 : index - %c1024 = arith.constant 1024 : index - %c0_i64 = arith.constant 0 : i64 - %c1_i64 = arith.constant 1 : i64 - %c32_i64 = arith.constant 32 : i64 - %c64_i64 = arith.constant 64 : i64 - %c128_i64 = arith.constant 128 : i64 - %c4096_i64 = arith.constant 4096 : i64 - %c1000_i32 = arith.constant 1000 : i32 - - %ub_in = pto.castptr %c0_i64 : i64 -> !pto.ptr - %ub_out = pto.castptr %c4096_i64 : i64 -> !pto.ptr - - %false = arith.constant false - pto.mte_gm_ub %arg0, %ub_in, %c0_i64, %c128_i64 - nburst(%c32_i64, %c128_i64, %c128_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 - - pto.set_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] - pto.wait_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] - - pto.vecscope { - %_:1 = scf.for %offset = %c0 to %c1024 step %c128 iter_args(%remaining = %c1000_i32) -> (i32) { - %mask, %next_remaining = pto.plt_b16 %remaining : i32 -> !pto.mask, i32 - %lower_mask, %upper_remaining = pto.plt_b32 %remaining : i32 -> !pto.mask, i32 - %upper_mask, %_next_b32 = pto.plt_b32 %upper_remaining : i32 -> !pto.mask, i32 - %upper_offset = arith.addi %offset, %c64 : index - %lower = pto.vlds %ub_in[%offset] : !pto.ptr -> 
!pto.vreg<64xf32> - %upper = pto.vlds %ub_in[%upper_offset] : !pto.ptr -> !pto.vreg<64xf32> - %even = pto.vcvt %lower, %lower_mask {rnd = "R", sat = "SAT", part = "EVEN"} : !pto.vreg<64xf32>, !pto.mask -> !pto.vreg<128xf16> - %odd = pto.vcvt %upper, %upper_mask {rnd = "R", sat = "SAT", part = "ODD"} : !pto.vreg<64xf32>, !pto.mask -> !pto.vreg<128xf16> - %merged = pto.vor %even, %odd, %mask : !pto.vreg<128xf16>, !pto.vreg<128xf16>, !pto.mask -> !pto.vreg<128xf16> - pto.vsts %merged, %ub_out[%offset], %mask : !pto.vreg<128xf16>, !pto.ptr, !pto.mask - scf.yield %next_remaining : i32 - } - } - - pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.mte_ub_gm %ub_out, %arg1, %c64_i64 - nburst(%c32_i64, %c64_i64, %c64_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64 - pto.barrier #pto.pipe - return - } -} diff --git a/test/vpto/cases/micro-op/conversion/vcvt-tail-special/launch.cpp b/test/vpto/cases/micro-op/conversion/vcvt-tail-special/launch.cpp deleted file mode 100644 index 128254d29..000000000 --- a/test/vpto/cases/micro-op/conversion/vcvt-tail-special/launch.cpp +++ /dev/null @@ -1,71 +0,0 @@ -// Copyright (c) 2026 Huawei Technologies Co., Ltd. -// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). -// Please refer to the License for details. You may not use this file except in compliance with the License. -// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -// See LICENSE in the root of the software repository for the full text of the License. 
- -// ----------------------------------------------------------------------------- -// case: micro-op/conversion/vcvt-tail-special -// family: conversion -// target_ops: pto.vcvt -// scenarios: f32-to-f16, tail-mask, exceptional-values -// NOTE: bulk-generated coverage skeleton. Parser/verifier/lowering failure is -// still a valid test conclusion in the current coverage-first phase. -// ----------------------------------------------------------------------------- -// --------------------------------------------------------------------------- -// PTOAS compatibility layer -// -// The upstream pto-isa headers reference some FP8/FP4 types and the -// __VEC_SCOPE__ marker that are not available on every AICore arch/toolchain -// combination (e.g. __NPU_ARCH__==2201). -// -// For our PTOAS-generated kernels we don't rely on these types today, but the -// headers still mention them in templates/static_asserts. Provide minimal -// fallbacks to keep compilation working on dav-c220. -// --------------------------------------------------------------------------- -#ifndef __VEC_SCOPE__ -#define __VEC_SCOPE__ -#endif - -#if defined(__CCE_AICORE__) && defined(__NPU_ARCH__) && (__NPU_ARCH__ == 2201) -typedef struct { unsigned char v; } hifloat8_t; -typedef struct { unsigned char v; } float8_e4m3_t; -typedef struct { unsigned char v; } float8_e5m2_t; -typedef struct { unsigned char v; } float8_e8m0_t; -typedef struct { unsigned char v; } float4_e1m2x2_t; -typedef struct { unsigned char v; } float4_e2m1x2_t; -#endif -#include - -// AICore printf support is gated behind `--cce-enable-print` on some -// toolchains. When enabled, include the CCE print header so `cce::printf` -// resolves in device compilation. -#if defined(__CCE_AICORE__) && defined(PTOAS_ENABLE_CCE_PRINT) -#include -#endif - -// Some PTO-ISA types are only available in the __CCE_AICORE__ compilation -// path, but `bisheng -xcce` still performs a host-side parse pass. 
-// Provide minimal fallbacks only when the corresponding header wasn't -// pulled in by the selected arch implementation. -#if !defined(__CCE_AICORE__) && !defined(TMRGSORT_HPP) -struct MrgSortExecutedNumList { - uint16_t mrgSortList0; - uint16_t mrgSortList1; - uint16_t mrgSortList2; - uint16_t mrgSortList3; -}; -#endif -#ifndef __CPU_SIM -#include "acl/acl.h" -#endif - -extern "C" __global__ [aicore] void vcvt_tail_special_kernel_2d(__gm__ float *v1, - __gm__ half *v2); - -void LaunchVcvt_tail_special_kernel_2d(float *v1, uint16_t *v2, void *stream) { - vcvt_tail_special_kernel_2d<<<1, nullptr, stream>>>((__gm__ float *)v1, - (__gm__ half *)v2); -} diff --git a/test/vpto/cases/micro-op/conversion/vcvt-tail-special/main.cpp b/test/vpto/cases/micro-op/conversion/vcvt-tail-special/main.cpp deleted file mode 100644 index 155e88b98..000000000 --- a/test/vpto/cases/micro-op/conversion/vcvt-tail-special/main.cpp +++ /dev/null @@ -1,130 +0,0 @@ -// Copyright (c) 2026 Huawei Technologies Co., Ltd. -// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). -// Please refer to the License for details. You may not use this file except in compliance with the License. -// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -// See LICENSE in the root of the software repository for the full text of the License. - -// ----------------------------------------------------------------------------- -// case: micro-op/conversion/vcvt-tail-special -// family: conversion -// target_ops: pto.vcvt -// scenarios: f32-to-f16, tail-mask, exceptional-values -// NOTE: bulk-generated coverage skeleton. Parser/verifier/lowering failure is -// still a valid test conclusion in the current coverage-first phase. 
-// ----------------------------------------------------------------------------- -/** -Copyright (c) 2025 Huawei Technologies Co., Ltd. -This program is free software, you can redistribute it and/or modify it under the terms and conditions of -CANN Open Software License Agreement Version 2.0 (the "License"). -Please refer to the License for details. You may not use this file except in compliance with the License. -THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -See LICENSE in the root of the software repository for the full text of the License. -*/ - -#include "test_common.h" -#include "acl/acl.h" -#include -#include -#include - -using namespace PtoTestCommon; - -#ifndef TMRGSORT_HPP -struct MrgSortExecutedNumList { - uint16_t mrgSortList0; - uint16_t mrgSortList1; - uint16_t mrgSortList2; - uint16_t mrgSortList3; -}; -#endif - -#define ACL_CHECK(expr) \ - do { \ - const aclError _ret = (expr); \ - if (_ret != ACL_SUCCESS) { \ - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", #expr, (int)_ret, __FILE__, __LINE__); \ - const char *_recent = aclGetRecentErrMsg(); \ - if (_recent != nullptr && _recent[0] != '\0') { \ - std::fprintf(stderr, "[ERROR] RecentErrMsg: %s\n", _recent); \ - } \ - rc = 1; \ - goto cleanup; \ - } \ - } while (0) - -void LaunchVcvt_tail_special_kernel_2d(float *v1, uint16_t *v2, void *stream); - -int main() { - size_t elemCount_v1 = 1024; - size_t fileSize_v1 = elemCount_v1 * sizeof(float); - size_t elemCount_v2 = 1024; - size_t fileSize_v2 = elemCount_v2 * sizeof(uint16_t); - float *v1Host = nullptr; - float *v1Device = nullptr; - uint16_t *v2Host = nullptr; - uint16_t *v2Device = nullptr; - - int rc = 0; - bool aclInited = false; - bool deviceSet = false; - int deviceId = 0; - aclrtStream stream = nullptr; - - ACL_CHECK(aclInit(nullptr)); - aclInited = true; - if (const char 
*envDevice = std::getenv("ACL_DEVICE_ID")) { - deviceId = std::atoi(envDevice); - } - ACL_CHECK(aclrtSetDevice(deviceId)); - deviceSet = true; - ACL_CHECK(aclrtCreateStream(&stream)); - - ACL_CHECK(aclrtMallocHost((void **)(&v1Host), fileSize_v1)); - ACL_CHECK(aclrtMallocHost((void **)(&v2Host), fileSize_v2)); - ACL_CHECK(aclrtMalloc((void **)&v1Device, fileSize_v1, ACL_MEM_MALLOC_HUGE_FIRST)); - ACL_CHECK(aclrtMalloc((void **)&v2Device, fileSize_v2, ACL_MEM_MALLOC_HUGE_FIRST)); - - ReadFile("./v1.bin", fileSize_v1, v1Host, fileSize_v1); - ReadFile("./v2.bin", fileSize_v2, v2Host, fileSize_v2); - ACL_CHECK(aclrtMemcpy(v1Device, fileSize_v1, v1Host, fileSize_v1, ACL_MEMCPY_HOST_TO_DEVICE)); - ACL_CHECK(aclrtMemcpy(v2Device, fileSize_v2, v2Host, fileSize_v2, ACL_MEMCPY_HOST_TO_DEVICE)); - LaunchVcvt_tail_special_kernel_2d(v1Device, v2Device, stream); - - ACL_CHECK(aclrtSynchronizeStream(stream)); - ACL_CHECK(aclrtMemcpy(v2Host, fileSize_v2, v2Device, fileSize_v2, ACL_MEMCPY_DEVICE_TO_HOST)); - - WriteFile("./v2.bin", v2Host, fileSize_v2); - -cleanup: - aclrtFree(v1Device); - aclrtFree(v2Device); - aclrtFreeHost(v1Host); - aclrtFreeHost(v2Host); - if (stream != nullptr) { - const aclError _ret = aclrtDestroyStream(stream); - if (_ret != ACL_SUCCESS) { - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", - "aclrtDestroyStream(stream)", (int)_ret, __FILE__, __LINE__); - } - stream = nullptr; - } - if (deviceSet) { - const aclError _ret = aclrtResetDevice(deviceId); - if (_ret != ACL_SUCCESS) { - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", - "aclrtResetDevice(deviceId)", (int)_ret, __FILE__, __LINE__); - } - } - if (aclInited) { - const aclError _ret = aclFinalize(); - if (_ret != ACL_SUCCESS) { - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", - "aclFinalize()", (int)_ret, __FILE__, __LINE__); - } - } - - return rc; -} diff --git a/test/vpto/cases/micro-op/conversion/vcvt-tail/compare.py b/test/vpto/cases/micro-op/conversion/vcvt-tail/compare.py 
deleted file mode 100644 index 166196a8e..000000000 --- a/test/vpto/cases/micro-op/conversion/vcvt-tail/compare.py +++ /dev/null @@ -1,43 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) 2026 Huawei Technologies Co., Ltd. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. - - -import os -import sys - -import numpy as np - - -LOGICAL_ELEMS = 1000 - - -def compare_bin_prefix(golden_path, output_path, dtype, eps, count): - if not os.path.exists(golden_path) or not os.path.exists(output_path): - return False - golden = np.fromfile(golden_path, dtype=dtype, count=count) - output = np.fromfile(output_path, dtype=dtype, count=count) - return golden.shape == output.shape and np.allclose( - golden, output, atol=eps, rtol=eps, equal_nan=True - ) - - -def main(): - strict = os.getenv("COMPARE_STRICT", "1") != "0" - ok = compare_bin_prefix("golden_v2.bin", "v2.bin", np.float16, 1e-3, LOGICAL_ELEMS) - if not ok: - if strict: - print("[ERROR] compare failed") - sys.exit(2) - print("[WARN] compare failed (non-gating)") - return - print("[INFO] compare passed") - - -if __name__ == "__main__": - main() diff --git a/test/vpto/cases/micro-op/conversion/vcvt-tail/golden.py b/test/vpto/cases/micro-op/conversion/vcvt-tail/golden.py deleted file mode 100755 index b121f1e29..000000000 --- a/test/vpto/cases/micro-op/conversion/vcvt-tail/golden.py +++ /dev/null @@ -1,68 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) 2026 Huawei Technologies Co., Ltd. 
-# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. - -# case: micro-op/conversion/vcvt-tail -# family: conversion -# target_ops: pto.vcvt -# scenarios: f32-to-f16, tail-mask -# NOTE: bulk-generated coverage skeleton. -# coding=utf-8 - -import argparse -from pathlib import Path - -import numpy as np - - -ROWS = 32 -COLS = 32 -SEED = 19 -LOGICAL_ELEMS = 1000 -OUT_SENTINEL = np.float16(-123.25) - - -def generate(output_dir: Path, seed: int) -> None: - rng = np.random.default_rng(seed) - flat = rng.uniform(-8.0, 8.0, size=ROWS * COLS).astype(np.float32) - flat[LOGICAL_ELEMS:] = 0.0 - v1 = flat.reshape(ROWS, COLS) - v2 = np.full((ROWS, COLS), OUT_SENTINEL, dtype=np.float16) - golden_flat = np.full(ROWS * COLS, OUT_SENTINEL, dtype=np.float16) - - remaining = LOGICAL_ELEMS - for offset in range(0, ROWS * COLS, 128): - lower = flat[offset : offset + 64].astype(np.float16) - upper = flat[offset + 64 : offset + 128].astype(np.float16) - merged = np.empty(128, dtype=np.float16) - merged[0::2] = lower - merged[1::2] = upper - active = min(remaining, 128) - golden_flat[offset : offset + active] = merged[:active] - remaining = max(remaining - 128, 0) - - golden_v2 = golden_flat.reshape(ROWS, COLS) - - output_dir.mkdir(parents=True, exist_ok=True) - v1.reshape(-1).tofile(output_dir / "v1.bin") - v2.reshape(-1).tofile(output_dir / "v2.bin") - golden_v2.reshape(-1).tofile(output_dir / "golden_v2.bin") - - -def main() -> None: - parser = 
argparse.ArgumentParser( - description="Generate numpy-based inputs/golden for VPTO micro-op vcvt-tail validation." - ) - parser.add_argument("--output-dir", type=Path, default=Path(".")) - parser.add_argument("--seed", type=int, default=SEED) - args = parser.parse_args() - generate(args.output_dir, args.seed) - - -if __name__ == "__main__": - main() diff --git a/test/vpto/cases/micro-op/conversion/vcvt-tail/kernel.pto b/test/vpto/cases/micro-op/conversion/vcvt-tail/kernel.pto deleted file mode 100644 index e724ee2b1..000000000 --- a/test/vpto/cases/micro-op/conversion/vcvt-tail/kernel.pto +++ /dev/null @@ -1,52 +0,0 @@ -module attributes {pto.target_arch = "a5", pto.kernel_kind = #pto.kernel_kind} { - func.func @vcvt_tail_kernel_2d(%arg0: !pto.ptr, - %arg1: !pto.ptr) attributes {pto.kernel} { - %c0 = arith.constant 0 : index - %c1 = arith.constant 1 : index - %c64 = arith.constant 64 : index - %c128 = arith.constant 128 : index - %c1024 = arith.constant 1024 : index - %c0_i64 = arith.constant 0 : i64 - %c1_i64 = arith.constant 1 : i64 - %c32_i64 = arith.constant 32 : i64 - %c64_i64 = arith.constant 64 : i64 - %c128_i64 = arith.constant 128 : i64 - %c4096_i64 = arith.constant 4096 : i64 - %c1000_i32 = arith.constant 1000 : i32 - - %ub_in = pto.castptr %c0_i64 : i64 -> !pto.ptr - %ub_out = pto.castptr %c4096_i64 : i64 -> !pto.ptr - - %false = arith.constant false - pto.mte_gm_ub %arg0, %ub_in, %c0_i64, %c128_i64 - nburst(%c32_i64, %c128_i64, %c128_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 - - pto.set_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] - pto.wait_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] - - pto.vecscope { - %_:1 = scf.for %offset = %c0 to %c1024 step %c128 iter_args(%remaining = %c1000_i32) -> (i32) { - %mask, %next_remaining = pto.plt_b16 %remaining : i32 -> !pto.mask, i32 - %lower_mask, %upper_remaining = pto.plt_b32 %remaining : i32 -> !pto.mask, i32 - %upper_mask, %_next_b32 = pto.plt_b32 %upper_remaining : i32 -> !pto.mask, i32 - %upper_offset = 
arith.addi %offset, %c64 : index - %lower = pto.vlds %ub_in[%offset] : !pto.ptr -> !pto.vreg<64xf32> - %upper = pto.vlds %ub_in[%upper_offset] : !pto.ptr -> !pto.vreg<64xf32> - %even = pto.vcvt %lower, %lower_mask {rnd = "R", sat = "SAT", part = "EVEN"} : !pto.vreg<64xf32>, !pto.mask -> !pto.vreg<128xf16> - %odd = pto.vcvt %upper, %upper_mask {rnd = "R", sat = "SAT", part = "ODD"} : !pto.vreg<64xf32>, !pto.mask -> !pto.vreg<128xf16> - %merged = pto.vor %even, %odd, %mask : !pto.vreg<128xf16>, !pto.vreg<128xf16>, !pto.mask -> !pto.vreg<128xf16> - pto.vsts %merged, %ub_out[%offset], %mask : !pto.vreg<128xf16>, !pto.ptr, !pto.mask - scf.yield %next_remaining : i32 - } - } - - pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.mte_ub_gm %ub_out, %arg1, %c64_i64 - nburst(%c32_i64, %c64_i64, %c64_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64 - pto.barrier #pto.pipe - return - } -} diff --git a/test/vpto/cases/micro-op/conversion/vcvt-tail/launch.cpp b/test/vpto/cases/micro-op/conversion/vcvt-tail/launch.cpp deleted file mode 100644 index 5773d5044..000000000 --- a/test/vpto/cases/micro-op/conversion/vcvt-tail/launch.cpp +++ /dev/null @@ -1,71 +0,0 @@ -// Copyright (c) 2026 Huawei Technologies Co., Ltd. -// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). -// Please refer to the License for details. You may not use this file except in compliance with the License. -// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -// See LICENSE in the root of the software repository for the full text of the License. 
- -// ----------------------------------------------------------------------------- -// case: micro-op/conversion/vcvt-tail -// family: conversion -// target_ops: pto.vcvt -// scenarios: f32-to-f16, tail-mask -// NOTE: bulk-generated coverage skeleton. Parser/verifier/lowering failure is -// still a valid test conclusion in the current coverage-first phase. -// ----------------------------------------------------------------------------- -// --------------------------------------------------------------------------- -// PTOAS compatibility layer -// -// The upstream pto-isa headers reference some FP8/FP4 types and the -// __VEC_SCOPE__ marker that are not available on every AICore arch/toolchain -// combination (e.g. __NPU_ARCH__==2201). -// -// For our PTOAS-generated kernels we don't rely on these types today, but the -// headers still mention them in templates/static_asserts. Provide minimal -// fallbacks to keep compilation working on dav-c220. -// --------------------------------------------------------------------------- -#ifndef __VEC_SCOPE__ -#define __VEC_SCOPE__ -#endif - -#if defined(__CCE_AICORE__) && defined(__NPU_ARCH__) && (__NPU_ARCH__ == 2201) -typedef struct { unsigned char v; } hifloat8_t; -typedef struct { unsigned char v; } float8_e4m3_t; -typedef struct { unsigned char v; } float8_e5m2_t; -typedef struct { unsigned char v; } float8_e8m0_t; -typedef struct { unsigned char v; } float4_e1m2x2_t; -typedef struct { unsigned char v; } float4_e2m1x2_t; -#endif -#include - -// AICore printf support is gated behind `--cce-enable-print` on some -// toolchains. When enabled, include the CCE print header so `cce::printf` -// resolves in device compilation. -#if defined(__CCE_AICORE__) && defined(PTOAS_ENABLE_CCE_PRINT) -#include -#endif - -// Some PTO-ISA types are only available in the __CCE_AICORE__ compilation -// path, but `bisheng -xcce` still performs a host-side parse pass. 
-// Provide minimal fallbacks only when the corresponding header wasn't -// pulled in by the selected arch implementation. -#if !defined(__CCE_AICORE__) && !defined(TMRGSORT_HPP) -struct MrgSortExecutedNumList { - uint16_t mrgSortList0; - uint16_t mrgSortList1; - uint16_t mrgSortList2; - uint16_t mrgSortList3; -}; -#endif -#ifndef __CPU_SIM -#include "acl/acl.h" -#endif - -extern "C" __global__ [aicore] void vcvt_tail_kernel_2d(__gm__ float *v1, - __gm__ half *v2); - -void LaunchVcvt_tail_kernel_2d(float *v1, uint16_t *v2, void *stream) { - vcvt_tail_kernel_2d<<<1, nullptr, stream>>>((__gm__ float *)v1, - (__gm__ half *)v2); -} diff --git a/test/vpto/cases/micro-op/conversion/vcvt-tail/main.cpp b/test/vpto/cases/micro-op/conversion/vcvt-tail/main.cpp deleted file mode 100644 index 9a0abf5cb..000000000 --- a/test/vpto/cases/micro-op/conversion/vcvt-tail/main.cpp +++ /dev/null @@ -1,130 +0,0 @@ -// Copyright (c) 2026 Huawei Technologies Co., Ltd. -// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). -// Please refer to the License for details. You may not use this file except in compliance with the License. -// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -// See LICENSE in the root of the software repository for the full text of the License. - -// ----------------------------------------------------------------------------- -// case: micro-op/conversion/vcvt-tail -// family: conversion -// target_ops: pto.vcvt -// scenarios: f32-to-f16, tail-mask -// NOTE: bulk-generated coverage skeleton. Parser/verifier/lowering failure is -// still a valid test conclusion in the current coverage-first phase. 
-// ----------------------------------------------------------------------------- -/** -Copyright (c) 2025 Huawei Technologies Co., Ltd. -This program is free software, you can redistribute it and/or modify it under the terms and conditions of -CANN Open Software License Agreement Version 2.0 (the "License"). -Please refer to the License for details. You may not use this file except in compliance with the License. -THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -See LICENSE in the root of the software repository for the full text of the License. -*/ - -#include "test_common.h" -#include "acl/acl.h" -#include -#include -#include - -using namespace PtoTestCommon; - -#ifndef TMRGSORT_HPP -struct MrgSortExecutedNumList { - uint16_t mrgSortList0; - uint16_t mrgSortList1; - uint16_t mrgSortList2; - uint16_t mrgSortList3; -}; -#endif - -#define ACL_CHECK(expr) \ - do { \ - const aclError _ret = (expr); \ - if (_ret != ACL_SUCCESS) { \ - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", #expr, (int)_ret, __FILE__, __LINE__); \ - const char *_recent = aclGetRecentErrMsg(); \ - if (_recent != nullptr && _recent[0] != '\0') { \ - std::fprintf(stderr, "[ERROR] RecentErrMsg: %s\n", _recent); \ - } \ - rc = 1; \ - goto cleanup; \ - } \ - } while (0) - -void LaunchVcvt_tail_kernel_2d(float *v1, uint16_t *v2, void *stream); - -int main() { - size_t elemCount_v1 = 1024; - size_t fileSize_v1 = elemCount_v1 * sizeof(float); - size_t elemCount_v2 = 1024; - size_t fileSize_v2 = elemCount_v2 * sizeof(uint16_t); - float *v1Host = nullptr; - float *v1Device = nullptr; - uint16_t *v2Host = nullptr; - uint16_t *v2Device = nullptr; - - int rc = 0; - bool aclInited = false; - bool deviceSet = false; - int deviceId = 0; - aclrtStream stream = nullptr; - - ACL_CHECK(aclInit(nullptr)); - aclInited = true; - if (const char 
*envDevice = std::getenv("ACL_DEVICE_ID")) { - deviceId = std::atoi(envDevice); - } - ACL_CHECK(aclrtSetDevice(deviceId)); - deviceSet = true; - ACL_CHECK(aclrtCreateStream(&stream)); - - ACL_CHECK(aclrtMallocHost((void **)(&v1Host), fileSize_v1)); - ACL_CHECK(aclrtMallocHost((void **)(&v2Host), fileSize_v2)); - ACL_CHECK(aclrtMalloc((void **)&v1Device, fileSize_v1, ACL_MEM_MALLOC_HUGE_FIRST)); - ACL_CHECK(aclrtMalloc((void **)&v2Device, fileSize_v2, ACL_MEM_MALLOC_HUGE_FIRST)); - - ReadFile("./v1.bin", fileSize_v1, v1Host, fileSize_v1); - ReadFile("./v2.bin", fileSize_v2, v2Host, fileSize_v2); - ACL_CHECK(aclrtMemcpy(v1Device, fileSize_v1, v1Host, fileSize_v1, ACL_MEMCPY_HOST_TO_DEVICE)); - ACL_CHECK(aclrtMemcpy(v2Device, fileSize_v2, v2Host, fileSize_v2, ACL_MEMCPY_HOST_TO_DEVICE)); - LaunchVcvt_tail_kernel_2d(v1Device, v2Device, stream); - - ACL_CHECK(aclrtSynchronizeStream(stream)); - ACL_CHECK(aclrtMemcpy(v2Host, fileSize_v2, v2Device, fileSize_v2, ACL_MEMCPY_DEVICE_TO_HOST)); - - WriteFile("./v2.bin", v2Host, fileSize_v2); - -cleanup: - aclrtFree(v1Device); - aclrtFree(v2Device); - aclrtFreeHost(v1Host); - aclrtFreeHost(v2Host); - if (stream != nullptr) { - const aclError _ret = aclrtDestroyStream(stream); - if (_ret != ACL_SUCCESS) { - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", - "aclrtDestroyStream(stream)", (int)_ret, __FILE__, __LINE__); - } - stream = nullptr; - } - if (deviceSet) { - const aclError _ret = aclrtResetDevice(deviceId); - if (_ret != ACL_SUCCESS) { - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", - "aclrtResetDevice(deviceId)", (int)_ret, __FILE__, __LINE__); - } - } - if (aclInited) { - const aclError _ret = aclFinalize(); - if (_ret != ACL_SUCCESS) { - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", - "aclFinalize()", (int)_ret, __FILE__, __LINE__); - } - } - - return rc; -} diff --git a/test/vpto/cases/micro-op/conversion/vtrc-f32-rounding/kernel.pto 
b/test/vpto/cases/micro-op/conversion/vtrc-f32-rounding/kernel.pto index e1756ace6..4f296b5be 100644 --- a/test/vpto/cases/micro-op/conversion/vtrc-f32-rounding/kernel.pto +++ b/test/vpto/cases/micro-op/conversion/vtrc-f32-rounding/kernel.pto @@ -1,79 +1,160 @@ -// ============================================================================= -// abs_kernel_2d: Element-wise absolute value on a 32x32 f32 tile -// ============================================================================= -// This kernel computes abs(input) for a 32x32 float32 matrix (1024 elements). -// -// Memory Layout: -// - Input: arg0 -> GM (Global Memory) -// - Output: arg1 -> GM (Global Memory) -// - UB (Unified Buffer) at offset 0: input tile (4096 bytes = 32*32*4) -// - UB at offset 4096: output tile (4096 bytes = 32*32*4) -// -// Pipeline: -// 1. DMA: GM -> UB (MTE2 pipe) - copy input tile to UB -// 2. Sync: wait for MTE2 -> V pipe handoff -// 3. Compute: vabs on 64-element vectors (V pipe) - 16 iterations for 1024 elements -// 4. Sync: wait for V -> MTE3 pipe handoff -// 5. 
DMA: UB -> GM (MTE3 pipe) - copy result tile back to GM -// ============================================================================= - module attributes {pto.target_arch = "a5", pto.kernel_kind = #pto.kernel_kind} { - func.func @vtrc_f32_rounding_kernel_2d(%arg0: !pto.ptr, - %arg1: !pto.ptr, - %arg2: !pto.ptr, - %arg3: !pto.ptr) attributes {pto.kernel} { - %c0 = arith.constant 0 : index - %c1 = arith.constant 1 : index - %c64 = arith.constant 64 : index - %c1024 = arith.constant 1024 : index - %c0_i64 = arith.constant 0 : i64 - %c1_i64 = arith.constant 1 : i64 - %c32_i64 = arith.constant 32 : i64 - %c128_i64 = arith.constant 128 : i64 - %c4096_i64 = arith.constant 4096 : i64 - %c1024_i32 = arith.constant 1024 : i32 - - %ub_in = pto.castptr %c0_i64 : i64 -> !pto.ptr - %ub_r = pto.castptr %c4096_i64 : i64 -> !pto.ptr - %c8192_i64 = arith.constant 8192 : i64 - %c12288_i64 = arith.constant 12288 : i64 - %ub_z = pto.castptr %c8192_i64 : i64 -> !pto.ptr - %ub_f = pto.castptr %c12288_i64 : i64 -> !pto.ptr - - %false = arith.constant false - pto.mte_gm_ub %arg0, %ub_in, %c0_i64, %c128_i64 - nburst(%c32_i64, %c128_i64, %c128_i64) + func.func @vtrc_f32_rounding_deep_merged_kernel(%arg0: !pto.ptr, %arg1: !pto.ptr, %arg2: !pto.ptr, %arg3: !pto.ptr, %arg4: !pto.ptr, %arg5: !pto.ptr, %arg6: !pto.ptr, %arg7: !pto.ptr, %arg8: !pto.ptr, %arg9: !pto.ptr) attributes {pto.kernel} { + %__deep_merge_guard = arith.constant false + // inactive merged from vtrc_f32_special_kernel_2d + scf.if %__deep_merge_guard { + + %c0_m0 = arith.constant 0 : index + %c64_m0 = arith.constant 64 : index + %c1024_m0 = arith.constant 1024 : index + %c0_i64_m0 = arith.constant 0 : i64 + %c1_i64_m0 = arith.constant 1 : i64 + %c32_i64_m0 = arith.constant 32 : i64 + %c128_i64_m0 = arith.constant 128 : i64 + %c4096_i64_m0 = arith.constant 4096 : i64 + %c1024_i32_m0 = arith.constant 1024 : i32 + + %ub_in_m0 = pto.castptr %c0_i64_m0 : i64 -> !pto.ptr + %ub_out_m0 = pto.castptr %c4096_i64_m0 : i64 -> !pto.ptr 
+ + %false_m0 = arith.constant false + pto.mte_gm_ub %arg0, %ub_in_m0, %c0_i64_m0, %c128_i64_m0 + nburst(%c32_i64_m0, %c128_i64_m0, %c128_i64_m0) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 + + pto.set_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] + pto.wait_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] + + pto.vecscope { + %__m0:1 = scf.for %offset_m0 = %c0_m0 to %c1024_m0 step %c64_m0 iter_args(%remaining_m0 = %c1024_i32_m0) -> (i32) { + %mask_m0, %next_remaining_m0 = pto.plt_b32 %remaining_m0 : i32 -> !pto.mask, i32 + %vec_m0 = pto.vlds %ub_in_m0[%offset_m0] : !pto.ptr -> !pto.vreg<64xf32> + %out_m0 = pto.vtrc %vec_m0, %mask_m0, "Z" : !pto.vreg<64xf32>, !pto.mask -> !pto.vreg<64xf32> + pto.vsts %out_m0, %ub_out_m0[%offset_m0], %mask_m0 : !pto.vreg<64xf32>, !pto.ptr, !pto.mask + scf.yield %next_remaining_m0 : i32 + } + } + + pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] + pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] + pto.mte_ub_gm %ub_out_m0, %arg1, %c128_i64_m0 + nburst(%c32_i64_m0, %c128_i64_m0, %c128_i64_m0) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64 + pto.barrier #pto.pipe + + } + // inactive merged from vtrc_f32_rounding_kernel_2d_vtrc_rounding_boundary + scf.if %__deep_merge_guard { + + %c0_m1 = arith.constant 0 : index + %c1_m1 = arith.constant 1 : index + %c64_m1 = arith.constant 64 : index + %c1024_m1 = arith.constant 1024 : index + %c0_i64_m1 = arith.constant 0 : i64 + %c1_i64_m1 = arith.constant 1 : i64 + %c32_i64_m1 = arith.constant 32 : i64 + %c128_i64_m1 = arith.constant 128 : i64 + %c4096_i64_m1 = arith.constant 4096 : i64 + %c1024_i32_m1 = arith.constant 1024 : i32 + + %ub_in_m1 = pto.castptr %c0_i64_m1 : i64 -> !pto.ptr + %ub_r_m1 = pto.castptr %c4096_i64_m1 : i64 -> !pto.ptr + %c8192_i64_m1 = arith.constant 8192 : i64 + %c12288_i64_m1 = arith.constant 12288 : i64 + %ub_z_m1 = pto.castptr %c8192_i64_m1 : i64 -> !pto.ptr + %ub_f_m1 = pto.castptr %c12288_i64_m1 : i64 -> !pto.ptr + + %false_m1 = arith.constant false + pto.mte_gm_ub %arg2, 
%ub_in_m1, %c0_i64_m1, %c128_i64_m1 + nburst(%c32_i64_m1, %c128_i64_m1, %c128_i64_m1) : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 pto.set_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] pto.wait_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] pto.vecscope { - %_:1 = scf.for %offset = %c0 to %c1024 step %c64 iter_args(%remaining = %c1024_i32) -> (i32) { - %mask, %next_remaining = pto.plt_b32 %remaining : i32 -> !pto.mask, i32 - %vec = pto.vlds %ub_in[%offset] : !pto.ptr -> !pto.vreg<64xf32> - %out_r = pto.vtrc %vec, %mask, "R" : !pto.vreg<64xf32>, !pto.mask -> !pto.vreg<64xf32> - %out_z = pto.vtrc %vec, %mask, "Z" : !pto.vreg<64xf32>, !pto.mask -> !pto.vreg<64xf32> - %out_f = pto.vtrc %vec, %mask, "F" : !pto.vreg<64xf32>, !pto.mask -> !pto.vreg<64xf32> - pto.vsts %out_r, %ub_r[%offset], %mask : !pto.vreg<64xf32>, !pto.ptr, !pto.mask - pto.vsts %out_z, %ub_z[%offset], %mask : !pto.vreg<64xf32>, !pto.ptr, !pto.mask - pto.vsts %out_f, %ub_f[%offset], %mask : !pto.vreg<64xf32>, !pto.ptr, !pto.mask - scf.yield %next_remaining : i32 + %__m1:1 = scf.for %offset_m1 = %c0_m1 to %c1024_m1 step %c64_m1 iter_args(%remaining_m1 = %c1024_i32_m1) -> (i32) { + %mask_m1, %next_remaining_m1 = pto.plt_b32 %remaining_m1 : i32 -> !pto.mask, i32 + %vec_m1 = pto.vlds %ub_in_m1[%offset_m1] : !pto.ptr -> !pto.vreg<64xf32> + %out_r_m1 = pto.vtrc %vec_m1, %mask_m1, "R" : !pto.vreg<64xf32>, !pto.mask -> !pto.vreg<64xf32> + %out_z_m1 = pto.vtrc %vec_m1, %mask_m1, "Z" : !pto.vreg<64xf32>, !pto.mask -> !pto.vreg<64xf32> + %out_f_m1 = pto.vtrc %vec_m1, %mask_m1, "F" : !pto.vreg<64xf32>, !pto.mask -> !pto.vreg<64xf32> + pto.vsts %out_r_m1, %ub_r_m1[%offset_m1], %mask_m1 : !pto.vreg<64xf32>, !pto.ptr, !pto.mask + pto.vsts %out_z_m1, %ub_z_m1[%offset_m1], %mask_m1 : !pto.vreg<64xf32>, !pto.ptr, !pto.mask + pto.vsts %out_f_m1, %ub_f_m1[%offset_m1], %mask_m1 : !pto.vreg<64xf32>, !pto.ptr, !pto.mask + scf.yield %next_remaining_m1 : i32 } } pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] pto.wait_flag["PIPE_V", 
"PIPE_MTE3", "EVENT_ID0"] - pto.mte_ub_gm %ub_r, %arg1, %c128_i64 - nburst(%c32_i64, %c128_i64, %c128_i64) + pto.mte_ub_gm %ub_r_m1, %arg3, %c128_i64_m1 + nburst(%c32_i64_m1, %c128_i64_m1, %c128_i64_m1) : !pto.ptr, !pto.ptr, i64, i64, i64, i64 - pto.mte_ub_gm %ub_z, %arg2, %c128_i64 - nburst(%c32_i64, %c128_i64, %c128_i64) + pto.mte_ub_gm %ub_z_m1, %arg4, %c128_i64_m1 + nburst(%c32_i64_m1, %c128_i64_m1, %c128_i64_m1) : !pto.ptr, !pto.ptr, i64, i64, i64, i64 - pto.mte_ub_gm %ub_f, %arg3, %c128_i64 - nburst(%c32_i64, %c128_i64, %c128_i64) + pto.mte_ub_gm %ub_f_m1, %arg5, %c128_i64_m1 + nburst(%c32_i64_m1, %c128_i64_m1, %c128_i64_m1) : !pto.ptr, !pto.ptr, i64, i64, i64, i64 pto.barrier #pto.pipe + + } + // active merged from vtrc_f32_rounding_kernel_2d + + %c0_m2 = arith.constant 0 : index + %c1_m2 = arith.constant 1 : index + %c64_m2 = arith.constant 64 : index + %c1024_m2 = arith.constant 1024 : index + %c0_i64_m2 = arith.constant 0 : i64 + %c1_i64_m2 = arith.constant 1 : i64 + %c32_i64_m2 = arith.constant 32 : i64 + %c128_i64_m2 = arith.constant 128 : i64 + %c4096_i64_m2 = arith.constant 4096 : i64 + %c1024_i32_m2 = arith.constant 1024 : i32 + + %ub_in_m2 = pto.castptr %c0_i64_m2 : i64 -> !pto.ptr + %ub_r_m2 = pto.castptr %c4096_i64_m2 : i64 -> !pto.ptr + %c8192_i64_m2 = arith.constant 8192 : i64 + %c12288_i64_m2 = arith.constant 12288 : i64 + %ub_z_m2 = pto.castptr %c8192_i64_m2 : i64 -> !pto.ptr + %ub_f_m2 = pto.castptr %c12288_i64_m2 : i64 -> !pto.ptr + + %false_m2 = arith.constant false + pto.mte_gm_ub %arg6, %ub_in_m2, %c0_i64_m2, %c128_i64_m2 + nburst(%c32_i64_m2, %c128_i64_m2, %c128_i64_m2) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 + + pto.set_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] + pto.wait_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] + + pto.vecscope { + %__m2:1 = scf.for %offset_m2 = %c0_m2 to %c1024_m2 step %c64_m2 iter_args(%remaining_m2 = %c1024_i32_m2) -> (i32) { + %mask_m2, %next_remaining_m2 = pto.plt_b32 %remaining_m2 : i32 -> !pto.mask, i32 + 
%vec_m2 = pto.vlds %ub_in_m2[%offset_m2] : !pto.ptr -> !pto.vreg<64xf32> + %out_r_m2 = pto.vtrc %vec_m2, %mask_m2, "R" : !pto.vreg<64xf32>, !pto.mask -> !pto.vreg<64xf32> + %out_z_m2 = pto.vtrc %vec_m2, %mask_m2, "Z" : !pto.vreg<64xf32>, !pto.mask -> !pto.vreg<64xf32> + %out_f_m2 = pto.vtrc %vec_m2, %mask_m2, "F" : !pto.vreg<64xf32>, !pto.mask -> !pto.vreg<64xf32> + pto.vsts %out_r_m2, %ub_r_m2[%offset_m2], %mask_m2 : !pto.vreg<64xf32>, !pto.ptr, !pto.mask + pto.vsts %out_z_m2, %ub_z_m2[%offset_m2], %mask_m2 : !pto.vreg<64xf32>, !pto.ptr, !pto.mask + pto.vsts %out_f_m2, %ub_f_m2[%offset_m2], %mask_m2 : !pto.vreg<64xf32>, !pto.ptr, !pto.mask + scf.yield %next_remaining_m2 : i32 + } + } + + pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] + pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] + pto.mte_ub_gm %ub_r_m2, %arg7, %c128_i64_m2 + nburst(%c32_i64_m2, %c128_i64_m2, %c128_i64_m2) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64 + pto.mte_ub_gm %ub_z_m2, %arg8, %c128_i64_m2 + nburst(%c32_i64_m2, %c128_i64_m2, %c128_i64_m2) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64 + pto.mte_ub_gm %ub_f_m2, %arg9, %c128_i64_m2 + nburst(%c32_i64_m2, %c128_i64_m2, %c128_i64_m2) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64 + pto.barrier #pto.pipe + return } } diff --git a/test/vpto/cases/micro-op/conversion/vtrc-f32-rounding/launch.cpp b/test/vpto/cases/micro-op/conversion/vtrc-f32-rounding/launch.cpp index 6e4f1a142..ae4b92425 100644 --- a/test/vpto/cases/micro-op/conversion/vtrc-f32-rounding/launch.cpp +++ b/test/vpto/cases/micro-op/conversion/vtrc-f32-rounding/launch.cpp @@ -5,22 +5,9 @@ // THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, // INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. // See LICENSE in the root of the software repository for the full text of the License. 
- -// --------------------------------------------------------------------------- -// PTOAS compatibility layer -// -// The upstream pto-isa headers reference some FP8/FP4 types and the -// __VEC_SCOPE__ marker that are not available on every AICore arch/toolchain -// combination (e.g. __NPU_ARCH__==2201). -// -// For our PTOAS-generated kernels we don't rely on these types today, but the -// headers still mention them in templates/static_asserts. Provide minimal -// fallbacks to keep compilation working on dav-c220. -// --------------------------------------------------------------------------- #ifndef __VEC_SCOPE__ #define __VEC_SCOPE__ #endif - #if defined(__CCE_AICORE__) && defined(__NPU_ARCH__) && (__NPU_ARCH__ == 2201) typedef struct { unsigned char v; } hifloat8_t; typedef struct { unsigned char v; } float8_e4m3_t; @@ -30,39 +17,38 @@ typedef struct { unsigned char v; } float4_e1m2x2_t; typedef struct { unsigned char v; } float4_e2m1x2_t; #endif #include - -// AICore printf support is gated behind `--cce-enable-print` on some -// toolchains. When enabled, include the CCE print header so `cce::printf` -// resolves in device compilation. #if defined(__CCE_AICORE__) && defined(PTOAS_ENABLE_CCE_PRINT) #include #endif - -// Some PTO-ISA types are only available in the __CCE_AICORE__ compilation -// path, but `bisheng -xcce` still performs a host-side parse pass. -// Provide minimal fallbacks only when the corresponding header wasn't -// pulled in by the selected arch implementation. 
#if !defined(__CCE_AICORE__) && !defined(TMRGSORT_HPP) -struct MrgSortExecutedNumList { - uint16_t mrgSortList0; - uint16_t mrgSortList1; - uint16_t mrgSortList2; - uint16_t mrgSortList3; -}; +struct MrgSortExecutedNumList { uint16_t mrgSortList0,mrgSortList1,mrgSortList2,mrgSortList3; }; #endif #ifndef __CPU_SIM #include "acl/acl.h" #endif -extern "C" __global__ [aicore] void vtrc_f32_rounding_kernel_2d(__gm__ float *v1, - __gm__ float *v2, - __gm__ float *v3, - __gm__ float *v4); +extern "C" __global__ [aicore] void vtrc_f32_rounding_deep_merged_kernel( + __gm__ float * arg0, + __gm__ float * arg1, + __gm__ float * arg2, + __gm__ float * arg3, + __gm__ float * arg4, + __gm__ float * arg5, + __gm__ float * arg6, + __gm__ float * arg7, + __gm__ float * arg8, + __gm__ float * arg9); -void LaunchVtrc_f32_rounding_kernel_2d(float *v1, float *v2, float *v3, - float *v4, void *stream) { - vtrc_f32_rounding_kernel_2d<<<1, nullptr, stream>>>((__gm__ float *)v1, - (__gm__ float *)v2, - (__gm__ float *)v3, - (__gm__ float *)v4); +void LaunchVtrcF32RoundingDeepMerged(float * p0, float * p1, float * p2, float * p3, void *stream) { + vtrc_f32_rounding_deep_merged_kernel<<<1, nullptr, stream>>>( + (__gm__ float *)p0, + (__gm__ float *)p0, + (__gm__ float *)p0, + (__gm__ float *)p0, + (__gm__ float *)p0, + (__gm__ float *)p0, + (__gm__ float *)p0, + (__gm__ float *)p1, + (__gm__ float *)p2, + (__gm__ float *)p3); } diff --git a/test/vpto/cases/micro-op/conversion/vtrc-f32-rounding/main.cpp b/test/vpto/cases/micro-op/conversion/vtrc-f32-rounding/main.cpp index b86de567c..0b1208c36 100644 --- a/test/vpto/cases/micro-op/conversion/vtrc-f32-rounding/main.cpp +++ b/test/vpto/cases/micro-op/conversion/vtrc-f32-rounding/main.cpp @@ -47,9 +47,8 @@ struct MrgSortExecutedNumList { } \ } while (0) -void LaunchVtrc_f32_rounding_kernel_2d(float *v1, float *v2, float *v3, - float *v4, void *stream); +void LaunchVtrcF32RoundingDeepMerged(float * p0, float * p1, float * p2, float * p3, void 
*stream); int main() { size_t elemCount_v1 = 1024; size_t fileSize_v1 = elemCount_v1 * sizeof(float); @@ -100,7 +99,13 @@ int main() { ACL_CHECK(aclrtMemcpy(v2Device, fileSize_v2, v2Host, fileSize_v2, ACL_MEMCPY_HOST_TO_DEVICE)); ACL_CHECK(aclrtMemcpy(v3Device, fileSize_v3, v3Host, fileSize_v3, ACL_MEMCPY_HOST_TO_DEVICE)); ACL_CHECK(aclrtMemcpy(v4Device, fileSize_v4, v4Host, fileSize_v4, ACL_MEMCPY_HOST_TO_DEVICE)); - LaunchVtrc_f32_rounding_kernel_2d(v1Device, v2Device, v3Device, v4Device, stream); + LaunchVtrcF32RoundingDeepMerged( + v1Device, + v2Device, + v3Device, + v4Device, + stream + ); ACL_CHECK(aclrtSynchronizeStream(stream)); ACL_CHECK(aclrtMemcpy(v2Host, fileSize_v2, v2Device, fileSize_v2, ACL_MEMCPY_DEVICE_TO_HOST)); diff --git a/test/vpto/cases/micro-op/conversion/vtrc-f32-special/compare.py b/test/vpto/cases/micro-op/conversion/vtrc-f32-special/compare.py deleted file mode 100644 index 38d1deb75..000000000 --- a/test/vpto/cases/micro-op/conversion/vtrc-f32-special/compare.py +++ /dev/null @@ -1,39 +0,0 @@ -#!/usr/bin/python3 -# Copyright (c) 2026 Huawei Technologies Co., Ltd. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. 
- - -import os -import sys -import numpy as np - - -def compare_bin(golden_path, output_path, dtype, eps): - if not os.path.exists(golden_path) or not os.path.exists(output_path): - return False - golden = np.fromfile(golden_path, dtype=dtype) - output = np.fromfile(output_path, dtype=dtype) - return golden.shape == output.shape and np.allclose( - golden, output, atol=eps, rtol=eps, equal_nan=True - ) - - -def main(): - strict = os.getenv("COMPARE_STRICT", "1") != "0" - ok = compare_bin("golden_v2.bin", "v2.bin", np.float32, 1e-4) - if not ok: - if strict: - print("[ERROR] compare failed") - sys.exit(2) - print("[WARN] compare failed (non-gating)") - return - print("[INFO] compare passed") - - -if __name__ == "__main__": - main() diff --git a/test/vpto/cases/micro-op/conversion/vtrc-f32-special/golden.py b/test/vpto/cases/micro-op/conversion/vtrc-f32-special/golden.py deleted file mode 100644 index f6251171d..000000000 --- a/test/vpto/cases/micro-op/conversion/vtrc-f32-special/golden.py +++ /dev/null @@ -1,50 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) 2026 Huawei Technologies Co., Ltd. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. 
- -# coding=utf-8 - -import argparse -from pathlib import Path - -import numpy as np - - -ROWS = 32 -COLS = 32 -SEED = 19 - - -def generate(output_dir: Path, seed: int) -> None: - del seed - specials = np.array( - [-np.inf, -7.5, -0.0, 0.0, 1.0, np.inf, np.nan, 3.5], - dtype=np.float32, - ) - v1 = np.resize(specials, ROWS * COLS).reshape(ROWS, COLS).astype(np.float32) - v2 = np.zeros((ROWS, COLS), dtype=np.float32) - golden_v2 = np.trunc(v1).astype(np.float32, copy=False) - - output_dir.mkdir(parents=True, exist_ok=True) - v1.reshape(-1).tofile(output_dir / "v1.bin") - v2.reshape(-1).tofile(output_dir / "v2.bin") - golden_v2.reshape(-1).tofile(output_dir / "golden_v2.bin") - - -def main() -> None: - parser = argparse.ArgumentParser( - description="Generate numpy-based inputs/golden for VPTO micro-op vtrc-f32-special validation." - ) - parser.add_argument("--output-dir", type=Path, default=Path(".")) - parser.add_argument("--seed", type=int, default=SEED) - args = parser.parse_args() - generate(args.output_dir, args.seed) - - -if __name__ == "__main__": - main() diff --git a/test/vpto/cases/micro-op/conversion/vtrc-f32-special/kernel.pto b/test/vpto/cases/micro-op/conversion/vtrc-f32-special/kernel.pto deleted file mode 100644 index 3d834d945..000000000 --- a/test/vpto/cases/micro-op/conversion/vtrc-f32-special/kernel.pto +++ /dev/null @@ -1,42 +0,0 @@ -module attributes {pto.target_arch = "a5", pto.kernel_kind = #pto.kernel_kind} { - func.func @vtrc_f32_special_kernel_2d(%arg0: !pto.ptr, %arg1: !pto.ptr) attributes {pto.kernel} { - %c0 = arith.constant 0 : index - %c64 = arith.constant 64 : index - %c1024 = arith.constant 1024 : index - %c0_i64 = arith.constant 0 : i64 - %c1_i64 = arith.constant 1 : i64 - %c32_i64 = arith.constant 32 : i64 - %c128_i64 = arith.constant 128 : i64 - %c4096_i64 = arith.constant 4096 : i64 - %c1024_i32 = arith.constant 1024 : i32 - - %ub_in = pto.castptr %c0_i64 : i64 -> !pto.ptr - %ub_out = pto.castptr %c4096_i64 : i64 -> !pto.ptr - - 
%false = arith.constant false - pto.mte_gm_ub %arg0, %ub_in, %c0_i64, %c128_i64 - nburst(%c32_i64, %c128_i64, %c128_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 - - pto.set_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] - pto.wait_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] - - pto.vecscope { - %_:1 = scf.for %offset = %c0 to %c1024 step %c64 iter_args(%remaining = %c1024_i32) -> (i32) { - %mask, %next_remaining = pto.plt_b32 %remaining : i32 -> !pto.mask, i32 - %vec = pto.vlds %ub_in[%offset] : !pto.ptr -> !pto.vreg<64xf32> - %out = pto.vtrc %vec, %mask, "Z" : !pto.vreg<64xf32>, !pto.mask -> !pto.vreg<64xf32> - pto.vsts %out, %ub_out[%offset], %mask : !pto.vreg<64xf32>, !pto.ptr, !pto.mask - scf.yield %next_remaining : i32 - } - } - - pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.mte_ub_gm %ub_out, %arg1, %c128_i64 - nburst(%c32_i64, %c128_i64, %c128_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64 - pto.barrier #pto.pipe - return - } -} diff --git a/test/vpto/cases/micro-op/conversion/vtrc-f32-special/launch.cpp b/test/vpto/cases/micro-op/conversion/vtrc-f32-special/launch.cpp deleted file mode 100644 index 4d1ad9527..000000000 --- a/test/vpto/cases/micro-op/conversion/vtrc-f32-special/launch.cpp +++ /dev/null @@ -1,45 +0,0 @@ -// Copyright (c) 2026 Huawei Technologies Co., Ltd. -// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). -// Please refer to the License for details. You may not use this file except in compliance with the License. -// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -// See LICENSE in the root of the software repository for the full text of the License. 
- -#ifndef __VEC_SCOPE__ -#define __VEC_SCOPE__ -#endif - -#if defined(__CCE_AICORE__) && defined(__NPU_ARCH__) && (__NPU_ARCH__ == 2201) -typedef struct { unsigned char v; } hifloat8_t; -typedef struct { unsigned char v; } float8_e4m3_t; -typedef struct { unsigned char v; } float8_e5m2_t; -typedef struct { unsigned char v; } float8_e8m0_t; -typedef struct { unsigned char v; } float4_e1m2x2_t; -typedef struct { unsigned char v; } float4_e2m1x2_t; -#endif -#include - -#if defined(__CCE_AICORE__) && defined(PTOAS_ENABLE_CCE_PRINT) -#include -#endif - -#if !defined(__CCE_AICORE__) && !defined(TMRGSORT_HPP) -struct MrgSortExecutedNumList { - uint16_t mrgSortList0; - uint16_t mrgSortList1; - uint16_t mrgSortList2; - uint16_t mrgSortList3; -}; -#endif -#ifndef __CPU_SIM -#include "acl/acl.h" -#endif - -extern "C" __global__ [aicore] void vtrc_f32_special_kernel_2d(__gm__ float *v1, - __gm__ float *v2); - -void LaunchVtrc_f32_special_kernel_2d(float *v1, float *v2, void *stream) { - vtrc_f32_special_kernel_2d<<<1, nullptr, stream>>>((__gm__ float *)v1, - (__gm__ float *)v2); -} diff --git a/test/vpto/cases/micro-op/conversion/vtrc-f32-special/main.cpp b/test/vpto/cases/micro-op/conversion/vtrc-f32-special/main.cpp deleted file mode 100644 index 40f3aa5ae..000000000 --- a/test/vpto/cases/micro-op/conversion/vtrc-f32-special/main.cpp +++ /dev/null @@ -1,83 +0,0 @@ -// Copyright (c) 2026 Huawei Technologies Co., Ltd. -// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). -// Please refer to the License for details. You may not use this file except in compliance with the License. -// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. 
-// See LICENSE in the root of the software repository for the full text of the License. - -/** -Copyright (c) 2025 Huawei Technologies Co., Ltd. -*/ - -#include "test_common.h" -#include "acl/acl.h" -#include -#include - -using namespace PtoTestCommon; - -#define ACL_CHECK(expr) \ - do { \ - const aclError _ret = (expr); \ - if (_ret != ACL_SUCCESS) { \ - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", #expr, \ - (int)_ret, __FILE__, __LINE__); \ - rc = 1; \ - goto cleanup; \ - } \ - } while (0) - -void LaunchVtrc_f32_special_kernel_2d(float *v1, float *v2, void *stream); - -int main() { - size_t elemCount_v1 = 1024; - size_t fileSize_v1 = elemCount_v1 * sizeof(float); - size_t elemCount_v2 = 1024; - size_t fileSize_v2 = elemCount_v2 * sizeof(float); - float *v1Host = nullptr; - float *v1Device = nullptr; - float *v2Host = nullptr; - float *v2Device = nullptr; - int rc = 0; - bool aclInited = false; - bool deviceSet = false; - int deviceId = 0; - aclrtStream stream = nullptr; - - ACL_CHECK(aclInit(nullptr)); - aclInited = true; - if (const char *envDevice = std::getenv("ACL_DEVICE_ID")) - deviceId = std::atoi(envDevice); - ACL_CHECK(aclrtSetDevice(deviceId)); - deviceSet = true; - ACL_CHECK(aclrtCreateStream(&stream)); - ACL_CHECK(aclrtMallocHost((void **)(&v1Host), fileSize_v1)); - ACL_CHECK(aclrtMallocHost((void **)(&v2Host), fileSize_v2)); - ACL_CHECK(aclrtMalloc((void **)&v1Device, fileSize_v1, ACL_MEM_MALLOC_HUGE_FIRST)); - ACL_CHECK(aclrtMalloc((void **)&v2Device, fileSize_v2, ACL_MEM_MALLOC_HUGE_FIRST)); - ReadFile("./v1.bin", fileSize_v1, v1Host, fileSize_v1); - ReadFile("./v2.bin", fileSize_v2, v2Host, fileSize_v2); - ACL_CHECK(aclrtMemcpy(v1Device, fileSize_v1, v1Host, fileSize_v1, - ACL_MEMCPY_HOST_TO_DEVICE)); - ACL_CHECK(aclrtMemcpy(v2Device, fileSize_v2, v2Host, fileSize_v2, - ACL_MEMCPY_HOST_TO_DEVICE)); - LaunchVtrc_f32_special_kernel_2d(v1Device, v2Device, stream); - ACL_CHECK(aclrtSynchronizeStream(stream)); - 
ACL_CHECK(aclrtMemcpy(v2Host, fileSize_v2, v2Device, fileSize_v2, - ACL_MEMCPY_DEVICE_TO_HOST)); - WriteFile("./v2.bin", v2Host, fileSize_v2); - -cleanup: - aclrtFree(v1Device); - aclrtFree(v2Device); - aclrtFreeHost(v1Host); - aclrtFreeHost(v2Host); - if (stream != nullptr) - aclrtDestroyStream(stream); - if (deviceSet) - aclrtResetDevice(deviceId); - if (aclInited) - aclFinalize(); - return rc; -} diff --git a/test/vpto/cases/micro-op/conversion/vtrc-rounding-boundary/compare.py b/test/vpto/cases/micro-op/conversion/vtrc-rounding-boundary/compare.py deleted file mode 100644 index 848571069..000000000 --- a/test/vpto/cases/micro-op/conversion/vtrc-rounding-boundary/compare.py +++ /dev/null @@ -1,206 +0,0 @@ -#!/usr/bin/python3 -# Copyright (c) 2026 Huawei Technologies Co., Ltd. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. 
- -# coding=utf-8 - -import os -import sys -import numpy as np - - -REPEAT_BYTES = 256 - - -def _ceil_div(x, y): - return (x + y - 1) // y - - -def _packed_pred_storage_bytes(logical_elems, src_elem_bytes): - logical_elems = int(logical_elems) - src_elem_bytes = int(src_elem_bytes) - if logical_elems <= 0: - raise ValueError(f"logical_elems must be > 0, got {logical_elems}") - if src_elem_bytes not in (1, 2, 4): - raise ValueError(f"unsupported packed predicate source size: {src_elem_bytes}") - - repeat_elems = REPEAT_BYTES // src_elem_bytes - if src_elem_bytes == 4: - repeat_times = _ceil_div(logical_elems, repeat_elems) + 1 - loop_count = repeat_times // 2 - return loop_count * 16 - - repeat_times = _ceil_div(logical_elems, repeat_elems) - return repeat_times * (repeat_elems // 8) - - -def compare_bin(golden_path, output_path, dtype, eps): - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - dtype_np = np.dtype(dtype) - golden = np.fromfile(golden_path, dtype=dtype_np) - output = np.fromfile(output_path, dtype=dtype_np) - if golden.shape != output.shape: - print(f"[ERROR] Shape mismatch: {golden_path} {golden.shape} vs {output_path} {output.shape}") - return False - if not np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True): - if golden.size: - if np.issubdtype(dtype_np, np.floating): - g = golden.astype(np.float64, copy=False) - o = output.astype(np.float64, copy=False) - elif np.issubdtype(dtype_np, np.integer) or np.issubdtype(dtype_np, np.unsignedinteger): - g = golden.astype(np.int64, copy=False) - o = output.astype(np.int64, copy=False) - else: - g = golden.astype(np.float64, copy=False) - o = output.astype(np.float64, copy=False) - abs_diff = np.abs(g - o) - idx = int(np.argmax(abs_diff)) - diff = float(abs_diff[idx]) - print( - f"[ERROR] Mismatch: {golden_path} vs {output_path}, max 
diff={diff} at idx={idx} " - f"(golden={g[idx]}, out={o[idx]}, dtype={dtype_np})" - ) - else: - print(f"[ERROR] Mismatch: {golden_path} vs {output_path}, empty buffers, dtype={dtype_np}") - return False - return True - - -def compare_bin_prefix(golden_path, output_path, dtype, eps, count): - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - try: - count = int(count) - except Exception: - print(f"[ERROR] Invalid prefix count: {count}") - return False - if count <= 0: - print(f"[ERROR] Invalid prefix count: {count}") - return False - - dtype_np = np.dtype(dtype) - golden = np.fromfile(golden_path, dtype=dtype_np, count=count) - output = np.fromfile(output_path, dtype=dtype_np, count=count) - - if golden.size != count or output.size != count: - print( - f"[ERROR] Prefix read too small: need={count} elems, " - f"golden={golden.size}, out={output.size}" - ) - return False - - if not np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True): - if golden.size: - if np.issubdtype(dtype_np, np.floating): - g = golden.astype(np.float64, copy=False) - o = output.astype(np.float64, copy=False) - elif np.issubdtype(dtype_np, np.integer) or np.issubdtype(dtype_np, np.unsignedinteger): - g = golden.astype(np.int64, copy=False) - o = output.astype(np.int64, copy=False) - else: - g = golden.astype(np.float64, copy=False) - o = output.astype(np.float64, copy=False) - abs_diff = np.abs(g - o) - idx = int(np.argmax(abs_diff)) - diff = float(abs_diff[idx]) - print( - f"[ERROR] Mismatch (prefix): {golden_path} vs {output_path}, max diff={diff} at idx={idx} " - f"(golden={g[idx]}, out={o[idx]}, dtype={dtype_np}, count={count})" - ) - else: - print(f"[ERROR] Mismatch (prefix): {golden_path} vs {output_path}, empty buffers, dtype={dtype_np}") - return False - return True - - -def compare_packed_pred_mask(golden_path, output_path, 
logical_elems, src_elem_bytes): - """ - Compare outputs of pto.tcmp / pto.tcmps. - - PTO-ISA stores packed predicate results as a linear PK byte stream via - `psts`, with the exact written prefix length determined by the typed - TCMP/TCMPS repeat schedule. Compare only that semantic prefix. - """ - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - try: - logical_elems = int(logical_elems) - src_elem_bytes = int(src_elem_bytes) - except Exception: - print( - "[ERROR] Invalid packed mask compare arguments: " - f"logical_elems={logical_elems} src_elem_bytes={src_elem_bytes}" - ) - return False - if logical_elems <= 0 or src_elem_bytes <= 0: - print( - "[ERROR] Invalid packed mask compare arguments: " - f"logical_elems={logical_elems} src_elem_bytes={src_elem_bytes}" - ) - return False - - golden = np.fromfile(golden_path, dtype=np.uint8) - output = np.fromfile(output_path, dtype=np.uint8) - try: - prefix_bytes = _packed_pred_storage_bytes(logical_elems, src_elem_bytes) - except ValueError as exc: - print(f"[ERROR] {exc}") - return False - - if golden.size < prefix_bytes or output.size < prefix_bytes: - print( - f"[ERROR] Packed mask buffer too small: need={prefix_bytes} bytes, " - f"golden={golden.size}, out={output.size}" - ) - return False - - golden_sel = golden[:prefix_bytes] - output_sel = output[:prefix_bytes] - - if not np.array_equal(golden_sel, output_sel): - diff = np.nonzero(golden_sel != output_sel)[0] - idx = int(diff[0]) if diff.size else 0 - print( - f"[ERROR] Mismatch (packed mask): {golden_path} vs {output_path}, first diff at idx={idx} " - f"(golden={int(golden_sel[idx])}, out={int(output_sel[idx])}, " - f"logical_elems={logical_elems}, src_elem_bytes={src_elem_bytes}, prefix_bytes={prefix_bytes})" - ) - return False - return True - - -def main(): - strict = os.getenv("COMPARE_STRICT", "1") != "0" 
- ok = True - ok = compare_bin("golden_v2.bin", "v2.bin", np.float32, 0.0001) and ok - ok = compare_bin("golden_v3.bin", "v3.bin", np.float32, 0.0001) and ok - ok = compare_bin("golden_v4.bin", "v4.bin", np.float32, 0.0001) and ok - if not ok: - if strict: - print("[ERROR] compare failed") - sys.exit(2) - print("[WARN] compare failed (non-gating)") - return - print("[INFO] compare passed") - - -if __name__ == "__main__": - main() diff --git a/test/vpto/cases/micro-op/conversion/vtrc-rounding-boundary/golden.py b/test/vpto/cases/micro-op/conversion/vtrc-rounding-boundary/golden.py deleted file mode 100644 index a39eaa122..000000000 --- a/test/vpto/cases/micro-op/conversion/vtrc-rounding-boundary/golden.py +++ /dev/null @@ -1,58 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) 2026 Huawei Technologies Co., Ltd. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. 
- -# coding=utf-8 - -import argparse -from pathlib import Path - -import numpy as np - - -ROWS = 32 -COLS = 32 -SEED = 19 - - -def generate(output_dir: Path, seed: int) -> None: - del seed - boundary = np.array( - [-3.5, -3.0, -2.5, -1.5, -0.5, 0.5, 1.5, 2.5, 3.5], - dtype=np.float32, - ) - v1 = np.resize(boundary, ROWS * COLS).reshape(ROWS, COLS).astype(np.float32) - v2 = np.zeros((ROWS, COLS), dtype=np.float32) - v3 = np.zeros((ROWS, COLS), dtype=np.float32) - v4 = np.zeros((ROWS, COLS), dtype=np.float32) - golden_v2 = np.rint(v1).astype(np.float32, copy=False) - golden_v3 = np.trunc(v1).astype(np.float32, copy=False) - golden_v4 = np.floor(v1).astype(np.float32, copy=False) - - output_dir.mkdir(parents=True, exist_ok=True) - v1.reshape(-1).tofile(output_dir / "v1.bin") - v2.reshape(-1).tofile(output_dir / "v2.bin") - v3.reshape(-1).tofile(output_dir / "v3.bin") - v4.reshape(-1).tofile(output_dir / "v4.bin") - golden_v2.reshape(-1).tofile(output_dir / "golden_v2.bin") - golden_v3.reshape(-1).tofile(output_dir / "golden_v3.bin") - golden_v4.reshape(-1).tofile(output_dir / "golden_v4.bin") - - -def main() -> None: - parser = argparse.ArgumentParser( - description="Generate numpy-based inputs/golden for VPTO micro-op vtrc-f32-rounding validation." 
- ) - parser.add_argument("--output-dir", type=Path, default=Path(".")) - parser.add_argument("--seed", type=int, default=SEED) - args = parser.parse_args() - generate(args.output_dir, args.seed) - - -if __name__ == "__main__": - main() diff --git a/test/vpto/cases/micro-op/conversion/vtrc-rounding-boundary/kernel.pto b/test/vpto/cases/micro-op/conversion/vtrc-rounding-boundary/kernel.pto deleted file mode 100644 index e1756ace6..000000000 --- a/test/vpto/cases/micro-op/conversion/vtrc-rounding-boundary/kernel.pto +++ /dev/null @@ -1,79 +0,0 @@ -// ============================================================================= -// abs_kernel_2d: Element-wise absolute value on a 32x32 f32 tile -// ============================================================================= -// This kernel computes abs(input) for a 32x32 float32 matrix (1024 elements). -// -// Memory Layout: -// - Input: arg0 -> GM (Global Memory) -// - Output: arg1 -> GM (Global Memory) -// - UB (Unified Buffer) at offset 0: input tile (4096 bytes = 32*32*4) -// - UB at offset 4096: output tile (4096 bytes = 32*32*4) -// -// Pipeline: -// 1. DMA: GM -> UB (MTE2 pipe) - copy input tile to UB -// 2. Sync: wait for MTE2 -> V pipe handoff -// 3. Compute: vabs on 64-element vectors (V pipe) - 16 iterations for 1024 elements -// 4. Sync: wait for V -> MTE3 pipe handoff -// 5. 
DMA: UB -> GM (MTE3 pipe) - copy result tile back to GM -// ============================================================================= - -module attributes {pto.target_arch = "a5", pto.kernel_kind = #pto.kernel_kind} { - func.func @vtrc_f32_rounding_kernel_2d(%arg0: !pto.ptr, - %arg1: !pto.ptr, - %arg2: !pto.ptr, - %arg3: !pto.ptr) attributes {pto.kernel} { - %c0 = arith.constant 0 : index - %c1 = arith.constant 1 : index - %c64 = arith.constant 64 : index - %c1024 = arith.constant 1024 : index - %c0_i64 = arith.constant 0 : i64 - %c1_i64 = arith.constant 1 : i64 - %c32_i64 = arith.constant 32 : i64 - %c128_i64 = arith.constant 128 : i64 - %c4096_i64 = arith.constant 4096 : i64 - %c1024_i32 = arith.constant 1024 : i32 - - %ub_in = pto.castptr %c0_i64 : i64 -> !pto.ptr - %ub_r = pto.castptr %c4096_i64 : i64 -> !pto.ptr - %c8192_i64 = arith.constant 8192 : i64 - %c12288_i64 = arith.constant 12288 : i64 - %ub_z = pto.castptr %c8192_i64 : i64 -> !pto.ptr - %ub_f = pto.castptr %c12288_i64 : i64 -> !pto.ptr - - %false = arith.constant false - pto.mte_gm_ub %arg0, %ub_in, %c0_i64, %c128_i64 - nburst(%c32_i64, %c128_i64, %c128_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 - - pto.set_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] - pto.wait_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] - - pto.vecscope { - %_:1 = scf.for %offset = %c0 to %c1024 step %c64 iter_args(%remaining = %c1024_i32) -> (i32) { - %mask, %next_remaining = pto.plt_b32 %remaining : i32 -> !pto.mask, i32 - %vec = pto.vlds %ub_in[%offset] : !pto.ptr -> !pto.vreg<64xf32> - %out_r = pto.vtrc %vec, %mask, "R" : !pto.vreg<64xf32>, !pto.mask -> !pto.vreg<64xf32> - %out_z = pto.vtrc %vec, %mask, "Z" : !pto.vreg<64xf32>, !pto.mask -> !pto.vreg<64xf32> - %out_f = pto.vtrc %vec, %mask, "F" : !pto.vreg<64xf32>, !pto.mask -> !pto.vreg<64xf32> - pto.vsts %out_r, %ub_r[%offset], %mask : !pto.vreg<64xf32>, !pto.ptr, !pto.mask - pto.vsts %out_z, %ub_z[%offset], %mask : !pto.vreg<64xf32>, !pto.ptr, !pto.mask - pto.vsts 
%out_f, %ub_f[%offset], %mask : !pto.vreg<64xf32>, !pto.ptr, !pto.mask - scf.yield %next_remaining : i32 - } - } - - pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.mte_ub_gm %ub_r, %arg1, %c128_i64 - nburst(%c32_i64, %c128_i64, %c128_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64 - pto.mte_ub_gm %ub_z, %arg2, %c128_i64 - nburst(%c32_i64, %c128_i64, %c128_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64 - pto.mte_ub_gm %ub_f, %arg3, %c128_i64 - nburst(%c32_i64, %c128_i64, %c128_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64 - pto.barrier #pto.pipe - return - } -} diff --git a/test/vpto/cases/micro-op/conversion/vtrc-rounding-boundary/launch.cpp b/test/vpto/cases/micro-op/conversion/vtrc-rounding-boundary/launch.cpp deleted file mode 100644 index 6e4f1a142..000000000 --- a/test/vpto/cases/micro-op/conversion/vtrc-rounding-boundary/launch.cpp +++ /dev/null @@ -1,68 +0,0 @@ -// Copyright (c) 2026 Huawei Technologies Co., Ltd. -// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). -// Please refer to the License for details. You may not use this file except in compliance with the License. -// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -// See LICENSE in the root of the software repository for the full text of the License. - -// --------------------------------------------------------------------------- -// PTOAS compatibility layer -// -// The upstream pto-isa headers reference some FP8/FP4 types and the -// __VEC_SCOPE__ marker that are not available on every AICore arch/toolchain -// combination (e.g. __NPU_ARCH__==2201). 
-// -// For our PTOAS-generated kernels we don't rely on these types today, but the -// headers still mention them in templates/static_asserts. Provide minimal -// fallbacks to keep compilation working on dav-c220. -// --------------------------------------------------------------------------- -#ifndef __VEC_SCOPE__ -#define __VEC_SCOPE__ -#endif - -#if defined(__CCE_AICORE__) && defined(__NPU_ARCH__) && (__NPU_ARCH__ == 2201) -typedef struct { unsigned char v; } hifloat8_t; -typedef struct { unsigned char v; } float8_e4m3_t; -typedef struct { unsigned char v; } float8_e5m2_t; -typedef struct { unsigned char v; } float8_e8m0_t; -typedef struct { unsigned char v; } float4_e1m2x2_t; -typedef struct { unsigned char v; } float4_e2m1x2_t; -#endif -#include - -// AICore printf support is gated behind `--cce-enable-print` on some -// toolchains. When enabled, include the CCE print header so `cce::printf` -// resolves in device compilation. -#if defined(__CCE_AICORE__) && defined(PTOAS_ENABLE_CCE_PRINT) -#include -#endif - -// Some PTO-ISA types are only available in the __CCE_AICORE__ compilation -// path, but `bisheng -xcce` still performs a host-side parse pass. -// Provide minimal fallbacks only when the corresponding header wasn't -// pulled in by the selected arch implementation. 
-#if !defined(__CCE_AICORE__) && !defined(TMRGSORT_HPP) -struct MrgSortExecutedNumList { - uint16_t mrgSortList0; - uint16_t mrgSortList1; - uint16_t mrgSortList2; - uint16_t mrgSortList3; -}; -#endif -#ifndef __CPU_SIM -#include "acl/acl.h" -#endif - -extern "C" __global__ [aicore] void vtrc_f32_rounding_kernel_2d(__gm__ float *v1, - __gm__ float *v2, - __gm__ float *v3, - __gm__ float *v4); - -void LaunchVtrc_f32_rounding_kernel_2d(float *v1, float *v2, float *v3, - float *v4, void *stream) { - vtrc_f32_rounding_kernel_2d<<<1, nullptr, stream>>>((__gm__ float *)v1, - (__gm__ float *)v2, - (__gm__ float *)v3, - (__gm__ float *)v4); -} diff --git a/test/vpto/cases/micro-op/conversion/vtrc-rounding-boundary/main.cpp b/test/vpto/cases/micro-op/conversion/vtrc-rounding-boundary/main.cpp deleted file mode 100644 index b86de567c..000000000 --- a/test/vpto/cases/micro-op/conversion/vtrc-rounding-boundary/main.cpp +++ /dev/null @@ -1,147 +0,0 @@ -// Copyright (c) 2026 Huawei Technologies Co., Ltd. -// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). -// Please refer to the License for details. You may not use this file except in compliance with the License. -// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -// See LICENSE in the root of the software repository for the full text of the License. - -/** -Copyright (c) 2025 Huawei Technologies Co., Ltd. -This program is free software, you can redistribute it and/or modify it under the terms and conditions of -CANN Open Software License Agreement Version 2.0 (the "License"). -Please refer to the License for details. You may not use this file except in compliance with the License. 
-THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -See LICENSE in the root of the software repository for the full text of the License. -*/ - -#include "test_common.h" -#include "acl/acl.h" -#include -#include -#include - -using namespace PtoTestCommon; - -#ifndef TMRGSORT_HPP -struct MrgSortExecutedNumList { - uint16_t mrgSortList0; - uint16_t mrgSortList1; - uint16_t mrgSortList2; - uint16_t mrgSortList3; -}; -#endif - -#define ACL_CHECK(expr) \ - do { \ - const aclError _ret = (expr); \ - if (_ret != ACL_SUCCESS) { \ - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", #expr, (int)_ret, __FILE__, __LINE__); \ - const char *_recent = aclGetRecentErrMsg(); \ - if (_recent != nullptr && _recent[0] != '\0') { \ - std::fprintf(stderr, "[ERROR] RecentErrMsg: %s\n", _recent); \ - } \ - rc = 1; \ - goto cleanup; \ - } \ - } while (0) - -void LaunchVtrc_f32_rounding_kernel_2d(float *v1, float *v2, float *v3, - float *v4, void *stream); - -int main() { - size_t elemCount_v1 = 1024; - size_t fileSize_v1 = elemCount_v1 * sizeof(float); - size_t elemCount_v2 = 1024; - size_t fileSize_v2 = elemCount_v2 * sizeof(float); - size_t elemCount_v3 = 1024; - size_t fileSize_v3 = elemCount_v3 * sizeof(float); - size_t elemCount_v4 = 1024; - size_t fileSize_v4 = elemCount_v4 * sizeof(float); - float *v1Host = nullptr; - float *v1Device = nullptr; - float *v2Host = nullptr; - float *v2Device = nullptr; - float *v3Host = nullptr; - float *v3Device = nullptr; - float *v4Host = nullptr; - float *v4Device = nullptr; - - int rc = 0; - bool aclInited = false; - bool deviceSet = false; - int deviceId = 0; - aclrtStream stream = nullptr; - - ACL_CHECK(aclInit(nullptr)); - aclInited = true; - if (const char *envDevice = std::getenv("ACL_DEVICE_ID")) { - deviceId = std::atoi(envDevice); - } - ACL_CHECK(aclrtSetDevice(deviceId)); - 
deviceSet = true; - ACL_CHECK(aclrtCreateStream(&stream)); - - ACL_CHECK(aclrtMallocHost((void **)(&v1Host), fileSize_v1)); - ACL_CHECK(aclrtMallocHost((void **)(&v2Host), fileSize_v2)); - ACL_CHECK(aclrtMallocHost((void **)(&v3Host), fileSize_v3)); - ACL_CHECK(aclrtMallocHost((void **)(&v4Host), fileSize_v4)); - ACL_CHECK(aclrtMalloc((void **)&v1Device, fileSize_v1, ACL_MEM_MALLOC_HUGE_FIRST)); - ACL_CHECK(aclrtMalloc((void **)&v2Device, fileSize_v2, ACL_MEM_MALLOC_HUGE_FIRST)); - ACL_CHECK(aclrtMalloc((void **)&v3Device, fileSize_v3, ACL_MEM_MALLOC_HUGE_FIRST)); - ACL_CHECK(aclrtMalloc((void **)&v4Device, fileSize_v4, ACL_MEM_MALLOC_HUGE_FIRST)); - - ReadFile("./v1.bin", fileSize_v1, v1Host, fileSize_v1); - ReadFile("./v2.bin", fileSize_v2, v2Host, fileSize_v2); - ReadFile("./v3.bin", fileSize_v3, v3Host, fileSize_v3); - ReadFile("./v4.bin", fileSize_v4, v4Host, fileSize_v4); - ACL_CHECK(aclrtMemcpy(v1Device, fileSize_v1, v1Host, fileSize_v1, ACL_MEMCPY_HOST_TO_DEVICE)); - ACL_CHECK(aclrtMemcpy(v2Device, fileSize_v2, v2Host, fileSize_v2, ACL_MEMCPY_HOST_TO_DEVICE)); - ACL_CHECK(aclrtMemcpy(v3Device, fileSize_v3, v3Host, fileSize_v3, ACL_MEMCPY_HOST_TO_DEVICE)); - ACL_CHECK(aclrtMemcpy(v4Device, fileSize_v4, v4Host, fileSize_v4, ACL_MEMCPY_HOST_TO_DEVICE)); - LaunchVtrc_f32_rounding_kernel_2d(v1Device, v2Device, v3Device, v4Device, stream); - - ACL_CHECK(aclrtSynchronizeStream(stream)); - ACL_CHECK(aclrtMemcpy(v2Host, fileSize_v2, v2Device, fileSize_v2, ACL_MEMCPY_DEVICE_TO_HOST)); - ACL_CHECK(aclrtMemcpy(v3Host, fileSize_v3, v3Device, fileSize_v3, ACL_MEMCPY_DEVICE_TO_HOST)); - ACL_CHECK(aclrtMemcpy(v4Host, fileSize_v4, v4Device, fileSize_v4, ACL_MEMCPY_DEVICE_TO_HOST)); - - WriteFile("./v2.bin", v2Host, fileSize_v2); - WriteFile("./v3.bin", v3Host, fileSize_v3); - WriteFile("./v4.bin", v4Host, fileSize_v4); - -cleanup: - aclrtFree(v1Device); - aclrtFree(v2Device); - aclrtFree(v3Device); - aclrtFree(v4Device); - aclrtFreeHost(v1Host); - aclrtFreeHost(v2Host); - 
aclrtFreeHost(v3Host); - aclrtFreeHost(v4Host); - if (stream != nullptr) { - const aclError _ret = aclrtDestroyStream(stream); - if (_ret != ACL_SUCCESS) { - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", - "aclrtDestroyStream(stream)", (int)_ret, __FILE__, __LINE__); - } - stream = nullptr; - } - if (deviceSet) { - const aclError _ret = aclrtResetDevice(deviceId); - if (_ret != ACL_SUCCESS) { - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", - "aclrtResetDevice(deviceId)", (int)_ret, __FILE__, __LINE__); - } - } - if (aclInited) { - const aclError _ret = aclFinalize(); - if (_ret != ACL_SUCCESS) { - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", - "aclFinalize()", (int)_ret, __FILE__, __LINE__); - } - } - - return rc; -} diff --git a/test/vpto/cases/micro-op/cube-matmul/cube-load-frac-layouts/kernel.pto b/test/vpto/cases/micro-op/cube-matmul/cube-load-frac-layouts/kernel.pto index 0a8307cb3..0a91e995f 100644 --- a/test/vpto/cases/micro-op/cube-matmul/cube-load-frac-layouts/kernel.pto +++ b/test/vpto/cases/micro-op/cube-matmul/cube-load-frac-layouts/kernel.pto @@ -16,94 +16,81 @@ // ----------------------------------------------------------------------------- module attributes {pto.target_arch = "a5", pto.kernel_kind = #pto.kernel_kind} { - func.func @cube_load_frac_nd2nz_kernel(%src_gm: !pto.ptr, - %id_gm: !pto.ptr, - %out_gm: !pto.ptr) attributes {pto.kernel} { - %c0_i64 = arith.constant 0 : i64 - %c1_i64 = arith.constant 1 : i64 - %c32_i64 = arith.constant 32 : i64 - %c40_i64 = arith.constant 40 : i64 - %c48_i64 = arith.constant 48 : i64 - %c50_i64 = arith.constant 50 : i64 - %c60_i64 = arith.constant 60 : i64 - %c64_i64 = arith.constant 64 : i64 - %c100_i64 = arith.constant 100 : i64 - %c120_i64 = arith.constant 120 : i64 - %c2400_i64 = arith.constant 2400 : i64 - %c46080_i64 = arith.constant 46080 : i64 - %c65536_i64 = arith.constant 65536 : i64 - %false = arith.constant false - - %mat_src = pto.castptr %c0_i64 : i64 -> !pto.ptr - 
%mat_id = pto.castptr %c65536_i64 : i64 -> !pto.ptr - %l0a = pto.castptr %c0_i64 : i64 -> !pto.ptr - %l0b = pto.castptr %c0_i64 : i64 -> !pto.ptr - %l0c = pto.castptr %c0_i64 : i64 -> !pto.ptr - - pto.mte_gm_l1_frac %id_gm, %mat_src, nd2nz, - shape(%c40_i64, %c50_i64), - src_layout(%c100_i64), - dst_group(%c1_i64, %c1_i64, %c48_i64, %c0_i64), - ctrl(%c0_i64, %false) - : !pto.ptr, !pto.ptr, nd2nz, - shape i64, i64, src_layout(i64), - dst_group i64, i64, i64, i64, ctrl i64, i1 - pto.mte_gm_l1_frac %src_gm, %mat_id, nd2nz, - shape(%c50_i64, %c60_i64), - src_layout(%c120_i64), - dst_group(%c1_i64, %c1_i64, %c64_i64, %c0_i64), - ctrl(%c0_i64, %false) - : !pto.ptr, !pto.ptr, nd2nz, + func.func @cube_load_frac_layouts_deep_merged_kernel(%arg0: !pto.ptr, %arg1: !pto.ptr, %arg2: !pto.ptr, %arg3: !pto.ptr, %arg4: !pto.ptr, %arg5: !pto.ptr, %arg6: !pto.ptr, %arg7: !pto.ptr, %arg8: !pto.ptr, %arg9: !pto.ptr, %arg10: !pto.ptr, %arg11: !pto.ptr, %arg12: !pto.ptr, %arg13: !pto.ptr, %arg14: !pto.ptr, %arg15: !pto.ptr, %arg16: !pto.ptr, %arg17: !pto.ptr) attributes {pto.kernel} { + %__deep_merge_guard = arith.constant false + // inactive merged from cube_load_frac_dn2nz_kernel + scf.if %__deep_merge_guard { + + %c0_i64_m0 = arith.constant 0 : i64 + %c1_i64_m0 = arith.constant 1 : i64 + %c2_i64_m0 = arith.constant 2 : i64 + %c8_i64_m0 = arith.constant 8 : i64 + %c16_i64_m0 = arith.constant 16 : i64 + %c512_i64_m0 = arith.constant 512 : i64 + %false_m0 = arith.constant false + + %mat_id_m0 = pto.castptr %c0_i64_m0 : i64 -> !pto.ptr + %mat_src_m0 = pto.castptr %c512_i64_m0 : i64 -> !pto.ptr + %l0a_m0 = pto.castptr %c0_i64_m0 : i64 -> !pto.ptr + %l0b_m0 = pto.castptr %c0_i64_m0 : i64 -> !pto.ptr + %l0c_m0 = pto.castptr %c0_i64_m0 : i64 -> !pto.ptr + + pto.copy_gm_to_cbuf %arg0, %mat_src_m0, %c16_i64_m0, %c16_i64_m0, %c0_i64_m0, %c0_i64_m0 + : !pto.ptr, !pto.ptr, i64, i64, i64, i64 + pto.mte_gm_l1_frac %arg1, %mat_id_m0, dn2nz, + shape(%c8_i64_m0, %c16_i64_m0), + src_layout(%c2_i64_m0), 
+ dst_group(%c1_i64_m0, %c1_i64_m0, %c8_i64_m0, %c0_i64_m0), + ctrl(%c0_i64_m0, %false_m0) + : !pto.ptr, !pto.ptr, dn2nz, shape i64, i64, src_layout(i64), dst_group i64, i64, i64, i64, ctrl i64, i1 pto.set_flag["PIPE_MTE2", "PIPE_MTE1", "EVENT_ID0"] pto.wait_flag["PIPE_MTE2", "PIPE_MTE1", "EVENT_ID0"] - pto.mte_l1_l0a %mat_src, %l0a, %c40_i64, %c50_i64, %c0_i64, %c0_i64 + pto.mte_l1_l0a %mat_src_m0, %l0a_m0, %c16_i64_m0, %c16_i64_m0, %c0_i64_m0, %c0_i64_m0 : !pto.ptr, !pto.ptr, i64, i64, i64, i64 - pto.mte_l1_l0b %mat_id, %l0b, %c50_i64, %c60_i64, %c0_i64, %c0_i64 {transpose = true} + pto.mte_l1_l0b %mat_id_m0, %l0b_m0, %c16_i64_m0, %c16_i64_m0, %c0_i64_m0, %c0_i64_m0 {transpose = true} : !pto.ptr, !pto.ptr, i64, i64, i64, i64 pto.set_flag["PIPE_MTE1", "PIPE_M", "EVENT_ID0"] pto.wait_flag["PIPE_MTE1", "PIPE_M", "EVENT_ID0"] - pto.mad %l0a, %l0b, %l0c, %c40_i64, %c60_i64, %c50_i64 + pto.mad %l0a_m0, %l0b_m0, %l0c_m0, %c16_i64_m0, %c16_i64_m0, %c16_i64_m0 : !pto.ptr, !pto.ptr, !pto.ptr, i64, i64, i64 pto.set_flag["PIPE_M", "PIPE_FIX", "EVENT_ID1"] pto.wait_flag["PIPE_M", "PIPE_FIX", "EVENT_ID1"] - pto.mte_l0c_gm %l0c, %out_gm, %c40_i64, %c60_i64, %c48_i64, %c60_i64, %c0_i64, %c0_i64, - nz2nd, - loop3(%c1_i64, %c46080_i64, %c2400_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64, i64, i64, i64, i64 + pto.mte_l0c_gm %l0c_m0, %arg2, %c16_i64_m0, %c16_i64_m0, %c16_i64_m0, %c16_i64_m0, %c0_i64_m0, %c0_i64_m0, + nz2nd + : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64, i64 pto.barrier #pto.pipe - return - } - func.func @cube_load_frac_dn2nz_kernel(%id_gm: !pto.ptr, - %src_gm: !pto.ptr, - %out_gm: !pto.ptr) attributes {pto.kernel} { - %c0_i64 = arith.constant 0 : i64 - %c1_i64 = arith.constant 1 : i64 - %c2_i64 = arith.constant 2 : i64 - %c8_i64 = arith.constant 8 : i64 - %c16_i64 = arith.constant 16 : i64 - %c512_i64 = arith.constant 512 : i64 - %false = arith.constant false - - %mat_id = pto.castptr %c0_i64 : i64 -> !pto.ptr - %mat_src = pto.castptr %c512_i64 : i64 -> 
!pto.ptr - %l0a = pto.castptr %c0_i64 : i64 -> !pto.ptr - %l0b = pto.castptr %c0_i64 : i64 -> !pto.ptr - %l0c = pto.castptr %c0_i64 : i64 -> !pto.ptr - - pto.copy_gm_to_cbuf %id_gm, %mat_src, %c16_i64, %c16_i64, %c0_i64, %c0_i64 + } + // inactive merged from cube_load_frac_nchw_nc1hwc0_kernel + scf.if %__deep_merge_guard { + + %c0_i64_m1 = arith.constant 0 : i64 + %c1_i64_m1 = arith.constant 1 : i64 + %c10_i64_m1 = arith.constant 10 : i64 + %c16_i64_m1 = arith.constant 16 : i64 + %c32_i64_m1 = arith.constant 32 : i64 + %c512_i64_m1 = arith.constant 512 : i64 + %false_m1 = arith.constant false + + %mat_id_m1 = pto.castptr %c0_i64_m1 : i64 -> !pto.ptr + %mat_src_m1 = pto.castptr %c512_i64_m1 : i64 -> !pto.ptr + %l0a_m1 = pto.castptr %c0_i64_m1 : i64 -> !pto.ptr + %l0b_m1 = pto.castptr %c0_i64_m1 : i64 -> !pto.ptr + %l0c_m1 = pto.castptr %c0_i64_m1 : i64 -> !pto.ptr + + pto.copy_gm_to_cbuf %arg3, %mat_src_m1, %c16_i64_m1, %c16_i64_m1, %c0_i64_m1, %c0_i64_m1 : !pto.ptr, !pto.ptr, i64, i64, i64, i64 - pto.mte_gm_l1_frac %src_gm, %mat_id, dn2nz, - shape(%c8_i64, %c16_i64), - src_layout(%c2_i64), - dst_group(%c1_i64, %c1_i64, %c8_i64, %c0_i64), - ctrl(%c0_i64, %false) + pto.mte_gm_l1_frac %arg4, %mat_id_m1, dn2nz, + shape(%c16_i64_m1, %c10_i64_m1), + src_layout(%c32_i64_m1), + dst_group(%c1_i64_m1, %c1_i64_m1, %c16_i64_m1, %c16_i64_m1), + ctrl(%c0_i64_m1, %false_m1) : !pto.ptr, !pto.ptr, dn2nz, shape i64, i64, src_layout(i64), dst_group i64, i64, i64, i64, ctrl i64, i1 @@ -111,227 +98,237 @@ module attributes {pto.target_arch = "a5", pto.kernel_kind = #pto.kernel_kind, !pto.ptr, i64, i64, i64, i64 - pto.mte_l1_l0b %mat_id, %l0b, %c16_i64, %c16_i64, %c0_i64, %c0_i64 {transpose = true} + pto.mte_l1_l0b %mat_id_m1, %l0b_m1, %c16_i64_m1, %c10_i64_m1, %c0_i64_m1, %c0_i64_m1 {transpose = true} : !pto.ptr, !pto.ptr, i64, i64, i64, i64 pto.set_flag["PIPE_MTE1", "PIPE_M", "EVENT_ID0"] pto.wait_flag["PIPE_MTE1", "PIPE_M", "EVENT_ID0"] - pto.mad %l0a, %l0b, %l0c, %c16_i64, %c16_i64, 
%c16_i64 + pto.mad %l0a_m1, %l0b_m1, %l0c_m1, %c16_i64_m1, %c16_i64_m1, %c16_i64_m1 : !pto.ptr, !pto.ptr, !pto.ptr, i64, i64, i64 pto.set_flag["PIPE_M", "PIPE_FIX", "EVENT_ID1"] pto.wait_flag["PIPE_M", "PIPE_FIX", "EVENT_ID1"] - pto.mte_l0c_gm %l0c, %out_gm, %c16_i64, %c16_i64, %c16_i64, %c16_i64, %c0_i64, %c0_i64, + pto.mte_l0c_gm %l0c_m1, %arg5, %c16_i64_m1, %c16_i64_m1, %c16_i64_m1, %c16_i64_m1, %c0_i64_m1, %c0_i64_m1, nz2nd : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64, i64 pto.barrier #pto.pipe - return - } - func.func @cube_load_frac_nchw_nc1hwc0_kernel(%id_gm: !pto.ptr, - %src_gm: !pto.ptr, - %out_gm: !pto.ptr) attributes {pto.kernel} { - %c0_i64 = arith.constant 0 : i64 - %c1_i64 = arith.constant 1 : i64 - %c10_i64 = arith.constant 10 : i64 - %c16_i64 = arith.constant 16 : i64 - %c32_i64 = arith.constant 32 : i64 - %c512_i64 = arith.constant 512 : i64 - %false = arith.constant false - - %mat_id = pto.castptr %c0_i64 : i64 -> !pto.ptr - %mat_src = pto.castptr %c512_i64 : i64 -> !pto.ptr - %l0a = pto.castptr %c0_i64 : i64 -> !pto.ptr - %l0b = pto.castptr %c0_i64 : i64 -> !pto.ptr - %l0c = pto.castptr %c0_i64 : i64 -> !pto.ptr - - pto.copy_gm_to_cbuf %id_gm, %mat_src, %c16_i64, %c16_i64, %c0_i64, %c0_i64 + } + // inactive merged from cube_load_frac_nchw_fz4d_kernel + scf.if %__deep_merge_guard { + + %c0_i64_m2 = arith.constant 0 : i64 + %c1_i64_m2 = arith.constant 1 : i64 + %c2_i64_m2 = arith.constant 2 : i64 + %c5_i64_m2 = arith.constant 5 : i64 + %c16_i64_m2 = arith.constant 16 : i64 + %c32_i64_m2 = arith.constant 32 : i64 + %c80_i64_m2 = arith.constant 80 : i64 + %c160_i64_m2 = arith.constant 160 : i64 + %c512_i64_m2 = arith.constant 512 : i64 + %false_m2 = arith.constant false + + %mat_id_m2 = pto.castptr %c0_i64_m2 : i64 -> !pto.ptr + %mat_src_m2 = pto.castptr %c512_i64_m2 : i64 -> !pto.ptr + %l0a_m2 = pto.castptr %c0_i64_m2 : i64 -> !pto.ptr + %l0b_m2 = pto.castptr %c0_i64_m2 : i64 -> !pto.ptr + %l0c_m2 = pto.castptr %c0_i64_m2 : i64 -> !pto.ptr + + 
pto.copy_gm_to_cbuf %arg6, %mat_src_m2, %c16_i64_m2, %c16_i64_m2, %c0_i64_m2, %c0_i64_m2 : !pto.ptr, !pto.ptr, i64, i64, i64, i64 - pto.mte_gm_l1_frac %src_gm, %mat_id, dn2nz, - shape(%c16_i64, %c10_i64), - src_layout(%c32_i64), - dst_group(%c1_i64, %c1_i64, %c16_i64, %c16_i64), - ctrl(%c0_i64, %false) + pto.mte_gm_l1_frac %arg7, %mat_id_m2, dn2nz, + shape(%c16_i64_m2, %c5_i64_m2), + src_layout(%c32_i64_m2, %c160_i64_m2), + dst_group(%c1_i64_m2, %c16_i64_m2, %c80_i64_m2, %c1_i64_m2), + ctrl(%c0_i64_m2, %false_m2) : !pto.ptr, !pto.ptr, dn2nz, - shape i64, i64, src_layout(i64), + shape i64, i64, src_layout(i64, i64), dst_group i64, i64, i64, i64, ctrl i64, i1 pto.set_flag["PIPE_MTE2", "PIPE_MTE1", "EVENT_ID0"] pto.wait_flag["PIPE_MTE2", "PIPE_MTE1", "EVENT_ID0"] - pto.mte_l1_l0a %mat_src, %l0a, %c16_i64, %c16_i64, %c0_i64, %c0_i64 + pto.mte_l1_l0a %mat_src_m2, %l0a_m2, %c16_i64_m2, %c16_i64_m2, %c0_i64_m2, %c0_i64_m2 : !pto.ptr, !pto.ptr, i64, i64, i64, i64 - pto.mte_l1_l0b %mat_id, %l0b, %c16_i64, %c10_i64, %c0_i64, %c0_i64 {transpose = true} + pto.mte_l1_l0b %mat_id_m2, %l0b_m2, %c16_i64_m2, %c16_i64_m2, %c0_i64_m2, %c0_i64_m2 {transpose = true} : !pto.ptr, !pto.ptr, i64, i64, i64, i64 pto.set_flag["PIPE_MTE1", "PIPE_M", "EVENT_ID0"] pto.wait_flag["PIPE_MTE1", "PIPE_M", "EVENT_ID0"] - pto.mad %l0a, %l0b, %l0c, %c16_i64, %c16_i64, %c16_i64 + pto.mad %l0a_m2, %l0b_m2, %l0c_m2, %c16_i64_m2, %c16_i64_m2, %c16_i64_m2 : !pto.ptr, !pto.ptr, !pto.ptr, i64, i64, i64 pto.set_flag["PIPE_M", "PIPE_FIX", "EVENT_ID1"] pto.wait_flag["PIPE_M", "PIPE_FIX", "EVENT_ID1"] - pto.mte_l0c_gm %l0c, %out_gm, %c16_i64, %c16_i64, %c16_i64, %c16_i64, %c0_i64, %c0_i64, + pto.mte_l0c_gm %l0c_m2, %arg8, %c16_i64_m2, %c16_i64_m2, %c16_i64_m2, %c16_i64_m2, %c0_i64_m2, %c0_i64_m2, nz2nd : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64, i64 pto.barrier #pto.pipe - return - } - func.func @cube_load_frac_nchw_fz4d_kernel(%id_gm: !pto.ptr, - %src_gm: !pto.ptr, - %out_gm: !pto.ptr) attributes {pto.kernel} 
{ - %c0_i64 = arith.constant 0 : i64 - %c1_i64 = arith.constant 1 : i64 - %c2_i64 = arith.constant 2 : i64 - %c5_i64 = arith.constant 5 : i64 - %c16_i64 = arith.constant 16 : i64 - %c32_i64 = arith.constant 32 : i64 - %c80_i64 = arith.constant 80 : i64 - %c160_i64 = arith.constant 160 : i64 - %c512_i64 = arith.constant 512 : i64 - %false = arith.constant false - - %mat_id = pto.castptr %c0_i64 : i64 -> !pto.ptr - %mat_src = pto.castptr %c512_i64 : i64 -> !pto.ptr - %l0a = pto.castptr %c0_i64 : i64 -> !pto.ptr - %l0b = pto.castptr %c0_i64 : i64 -> !pto.ptr - %l0c = pto.castptr %c0_i64 : i64 -> !pto.ptr - - pto.copy_gm_to_cbuf %id_gm, %mat_src, %c16_i64, %c16_i64, %c0_i64, %c0_i64 + } + // inactive merged from cube_load_frac_ncdhw_ndc1hwc0_kernel + scf.if %__deep_merge_guard { + + %c0_i64_m3 = arith.constant 0 : i64 + %c1_i64_m3 = arith.constant 1 : i64 + %c7_i64_m3 = arith.constant 7 : i64 + %c16_i64_m3 = arith.constant 16 : i64 + %c32_i64_m3 = arith.constant 32 : i64 + %c512_i64_m3 = arith.constant 512 : i64 + %false_m3 = arith.constant false + + %mat_id_m3 = pto.castptr %c0_i64_m3 : i64 -> !pto.ptr + %mat_src_m3 = pto.castptr %c512_i64_m3 : i64 -> !pto.ptr + %l0a_m3 = pto.castptr %c0_i64_m3 : i64 -> !pto.ptr + %l0b_m3 = pto.castptr %c0_i64_m3 : i64 -> !pto.ptr + %l0c_m3 = pto.castptr %c0_i64_m3 : i64 -> !pto.ptr + + pto.copy_gm_to_cbuf %arg9, %mat_src_m3, %c16_i64_m3, %c16_i64_m3, %c0_i64_m3, %c0_i64_m3 : !pto.ptr, !pto.ptr, i64, i64, i64, i64 - pto.mte_gm_l1_frac %src_gm, %mat_id, dn2nz, - shape(%c16_i64, %c5_i64), - src_layout(%c32_i64, %c160_i64), - dst_group(%c1_i64, %c16_i64, %c80_i64, %c1_i64), - ctrl(%c0_i64, %false) + pto.mte_gm_l1_frac %arg10, %mat_id_m3, dn2nz, + shape(%c16_i64_m3, %c7_i64_m3), + src_layout(%c32_i64_m3), + dst_group(%c1_i64_m3, %c1_i64_m3, %c16_i64_m3, %c16_i64_m3), + ctrl(%c0_i64_m3, %false_m3) : !pto.ptr, !pto.ptr, dn2nz, - shape i64, i64, src_layout(i64, i64), + shape i64, i64, src_layout(i64), dst_group i64, i64, i64, i64, ctrl i64, 
i1 pto.set_flag["PIPE_MTE2", "PIPE_MTE1", "EVENT_ID0"] pto.wait_flag["PIPE_MTE2", "PIPE_MTE1", "EVENT_ID0"] - pto.mte_l1_l0a %mat_src, %l0a, %c16_i64, %c16_i64, %c0_i64, %c0_i64 + pto.mte_l1_l0a %mat_src_m3, %l0a_m3, %c16_i64_m3, %c16_i64_m3, %c0_i64_m3, %c0_i64_m3 : !pto.ptr, !pto.ptr, i64, i64, i64, i64 - pto.mte_l1_l0b %mat_id, %l0b, %c16_i64, %c16_i64, %c0_i64, %c0_i64 {transpose = true} + pto.mte_l1_l0b %mat_id_m3, %l0b_m3, %c16_i64_m3, %c7_i64_m3, %c0_i64_m3, %c0_i64_m3 {transpose = true} : !pto.ptr, !pto.ptr, i64, i64, i64, i64 pto.set_flag["PIPE_MTE1", "PIPE_M", "EVENT_ID0"] pto.wait_flag["PIPE_MTE1", "PIPE_M", "EVENT_ID0"] - pto.mad %l0a, %l0b, %l0c, %c16_i64, %c16_i64, %c16_i64 + pto.mad %l0a_m3, %l0b_m3, %l0c_m3, %c16_i64_m3, %c16_i64_m3, %c16_i64_m3 : !pto.ptr, !pto.ptr, !pto.ptr, i64, i64, i64 pto.set_flag["PIPE_M", "PIPE_FIX", "EVENT_ID1"] pto.wait_flag["PIPE_M", "PIPE_FIX", "EVENT_ID1"] - pto.mte_l0c_gm %l0c, %out_gm, %c16_i64, %c16_i64, %c16_i64, %c16_i64, %c0_i64, %c0_i64, + pto.mte_l0c_gm %l0c_m3, %arg11, %c16_i64_m3, %c16_i64_m3, %c16_i64_m3, %c16_i64_m3, %c0_i64_m3, %c0_i64_m3, nz2nd : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64, i64 pto.barrier #pto.pipe - return - } - func.func @cube_load_frac_ncdhw_ndc1hwc0_kernel(%id_gm: !pto.ptr, - %src_gm: !pto.ptr, - %out_gm: !pto.ptr) attributes {pto.kernel} { - %c0_i64 = arith.constant 0 : i64 - %c1_i64 = arith.constant 1 : i64 - %c7_i64 = arith.constant 7 : i64 - %c16_i64 = arith.constant 16 : i64 - %c32_i64 = arith.constant 32 : i64 - %c512_i64 = arith.constant 512 : i64 - %false = arith.constant false - - %mat_id = pto.castptr %c0_i64 : i64 -> !pto.ptr - %mat_src = pto.castptr %c512_i64 : i64 -> !pto.ptr - %l0a = pto.castptr %c0_i64 : i64 -> !pto.ptr - %l0b = pto.castptr %c0_i64 : i64 -> !pto.ptr - %l0c = pto.castptr %c0_i64 : i64 -> !pto.ptr - - pto.copy_gm_to_cbuf %id_gm, %mat_src, %c16_i64, %c16_i64, %c0_i64, %c0_i64 + } + // inactive merged from cube_load_frac_ncdhw_fz3d_kernel + scf.if 
%__deep_merge_guard { + + %c0_i64_m4 = arith.constant 0 : i64 + %c1_i64_m4 = arith.constant 1 : i64 + %c2_i64_m4 = arith.constant 2 : i64 + %c3_i64_m4 = arith.constant 3 : i64 + %c16_i64_m4 = arith.constant 16 : i64 + %c32_i64_m4 = arith.constant 32 : i64 + %c512_i64_m4 = arith.constant 512 : i64 + %c68720525568_i64_m4 = arith.constant 68720525568 : i64 + %false_m4 = arith.constant false + + %mat_id_m4 = pto.castptr %c0_i64_m4 : i64 -> !pto.ptr + %mat_src_m4 = pto.castptr %c512_i64_m4 : i64 -> !pto.ptr + %l0a_m4 = pto.castptr %c0_i64_m4 : i64 -> !pto.ptr + %l0b_m4 = pto.castptr %c0_i64_m4 : i64 -> !pto.ptr + %l0c_m4 = pto.castptr %c0_i64_m4 : i64 -> !pto.ptr + + pto.copy_gm_to_cbuf %arg12, %mat_src_m4, %c16_i64_m4, %c16_i64_m4, %c0_i64_m4, %c0_i64_m4 : !pto.ptr, !pto.ptr, i64, i64, i64, i64 - pto.mte_gm_l1_frac %src_gm, %mat_id, dn2nz, - shape(%c16_i64, %c7_i64), - src_layout(%c32_i64), - dst_group(%c1_i64, %c1_i64, %c16_i64, %c16_i64), - ctrl(%c0_i64, %false) + pto.mte_gm_l1_frac %arg13, %mat_id_m4, dn2nz, + shape(%c1_i64_m4, %c16_i64_m4), + src_layout(%c2_i64_m4, %c32_i64_m4), + dst_group(%c3_i64_m4, %c16_i64_m4, %c16_i64_m4, %c1_i64_m4), + ctrl(%c0_i64_m4, %false_m4) : !pto.ptr, !pto.ptr, dn2nz, - shape i64, i64, src_layout(i64), + shape i64, i64, src_layout(i64, i64), dst_group i64, i64, i64, i64, ctrl i64, i1 pto.set_flag["PIPE_MTE2", "PIPE_MTE1", "EVENT_ID0"] pto.wait_flag["PIPE_MTE2", "PIPE_MTE1", "EVENT_ID0"] - pto.mte_l1_l0a %mat_src, %l0a, %c16_i64, %c16_i64, %c0_i64, %c0_i64 + pto.mte_l1_l0a %mat_src_m4, %l0a_m4, %c16_i64_m4, %c16_i64_m4, %c0_i64_m4, %c0_i64_m4 : !pto.ptr, !pto.ptr, i64, i64, i64, i64 - pto.mte_l1_l0b %mat_id, %l0b, %c16_i64, %c7_i64, %c0_i64, %c0_i64 {transpose = true} + pto.mte_l1_l0b %mat_id_m4, %l0b_m4, %c16_i64_m4, %c16_i64_m4, %c0_i64_m4, %c0_i64_m4 {transpose = true} : !pto.ptr, !pto.ptr, i64, i64, i64, i64 pto.set_flag["PIPE_MTE1", "PIPE_M", "EVENT_ID0"] pto.wait_flag["PIPE_MTE1", "PIPE_M", "EVENT_ID0"] - pto.mad %l0a, %l0b, 
%l0c, %c16_i64, %c16_i64, %c16_i64 + pto.mad %l0a_m4, %l0b_m4, %l0c_m4, %c16_i64_m4, %c16_i64_m4, %c16_i64_m4 : !pto.ptr, !pto.ptr, !pto.ptr, i64, i64, i64 pto.set_flag["PIPE_M", "PIPE_FIX", "EVENT_ID1"] pto.wait_flag["PIPE_M", "PIPE_FIX", "EVENT_ID1"] - pto.mte_l0c_gm %l0c, %out_gm, %c16_i64, %c16_i64, %c16_i64, %c16_i64, %c0_i64, %c0_i64, + pto.mte_l0c_gm %l0c_m4, %arg14, %c16_i64_m4, %c16_i64_m4, %c16_i64_m4, %c16_i64_m4, %c0_i64_m4, %c0_i64_m4, nz2nd : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64, i64 pto.barrier #pto.pipe - return - } - func.func @cube_load_frac_ncdhw_fz3d_kernel(%id_gm: !pto.ptr, - %src_gm: !pto.ptr, - %out_gm: !pto.ptr) attributes {pto.kernel} { - %c0_i64 = arith.constant 0 : i64 - %c1_i64 = arith.constant 1 : i64 - %c2_i64 = arith.constant 2 : i64 - %c3_i64 = arith.constant 3 : i64 - %c16_i64 = arith.constant 16 : i64 - %c32_i64 = arith.constant 32 : i64 - %c512_i64 = arith.constant 512 : i64 - %c68720525568_i64 = arith.constant 68720525568 : i64 - %false = arith.constant false - - %mat_id = pto.castptr %c0_i64 : i64 -> !pto.ptr - %mat_src = pto.castptr %c512_i64 : i64 -> !pto.ptr - %l0a = pto.castptr %c0_i64 : i64 -> !pto.ptr - %l0b = pto.castptr %c0_i64 : i64 -> !pto.ptr - %l0c = pto.castptr %c0_i64 : i64 -> !pto.ptr - - pto.copy_gm_to_cbuf %id_gm, %mat_src, %c16_i64, %c16_i64, %c0_i64, %c0_i64 - : !pto.ptr, !pto.ptr, i64, i64, i64, i64 - pto.mte_gm_l1_frac %src_gm, %mat_id, dn2nz, - shape(%c1_i64, %c16_i64), - src_layout(%c2_i64, %c32_i64), - dst_group(%c3_i64, %c16_i64, %c16_i64, %c1_i64), - ctrl(%c0_i64, %false) - : !pto.ptr, !pto.ptr, dn2nz, - shape i64, i64, src_layout(i64, i64), + } + // active merged from cube_load_frac_nd2nz_kernel + + %c0_i64_m5 = arith.constant 0 : i64 + %c1_i64_m5 = arith.constant 1 : i64 + %c32_i64_m5 = arith.constant 32 : i64 + %c40_i64_m5 = arith.constant 40 : i64 + %c48_i64_m5 = arith.constant 48 : i64 + %c50_i64_m5 = arith.constant 50 : i64 + %c60_i64_m5 = arith.constant 60 : i64 + %c64_i64_m5 = 
arith.constant 64 : i64 + %c100_i64_m5 = arith.constant 100 : i64 + %c120_i64_m5 = arith.constant 120 : i64 + %c2400_i64_m5 = arith.constant 2400 : i64 + %c46080_i64_m5 = arith.constant 46080 : i64 + %c65536_i64_m5 = arith.constant 65536 : i64 + %false_m5 = arith.constant false + + %mat_src_m5 = pto.castptr %c0_i64_m5 : i64 -> !pto.ptr + %mat_id_m5 = pto.castptr %c65536_i64_m5 : i64 -> !pto.ptr + %l0a_m5 = pto.castptr %c0_i64_m5 : i64 -> !pto.ptr + %l0b_m5 = pto.castptr %c0_i64_m5 : i64 -> !pto.ptr + %l0c_m5 = pto.castptr %c0_i64_m5 : i64 -> !pto.ptr + + pto.mte_gm_l1_frac %arg16, %mat_src_m5, nd2nz, + shape(%c40_i64_m5, %c50_i64_m5), + src_layout(%c100_i64_m5), + dst_group(%c1_i64_m5, %c1_i64_m5, %c48_i64_m5, %c0_i64_m5), + ctrl(%c0_i64_m5, %false_m5) + : !pto.ptr, !pto.ptr, nd2nz, + shape i64, i64, src_layout(i64), + dst_group i64, i64, i64, i64, ctrl i64, i1 + pto.mte_gm_l1_frac %arg15, %mat_id_m5, nd2nz, + shape(%c50_i64_m5, %c60_i64_m5), + src_layout(%c120_i64_m5), + dst_group(%c1_i64_m5, %c1_i64_m5, %c64_i64_m5, %c0_i64_m5), + ctrl(%c0_i64_m5, %false_m5) + : !pto.ptr, !pto.ptr, nd2nz, + shape i64, i64, src_layout(i64), dst_group i64, i64, i64, i64, ctrl i64, i1 pto.set_flag["PIPE_MTE2", "PIPE_MTE1", "EVENT_ID0"] pto.wait_flag["PIPE_MTE2", "PIPE_MTE1", "EVENT_ID0"] - pto.mte_l1_l0a %mat_src, %l0a, %c16_i64, %c16_i64, %c0_i64, %c0_i64 + pto.mte_l1_l0a %mat_src_m5, %l0a_m5, %c40_i64_m5, %c50_i64_m5, %c0_i64_m5, %c0_i64_m5 : !pto.ptr, !pto.ptr, i64, i64, i64, i64 - pto.mte_l1_l0b %mat_id, %l0b, %c16_i64, %c16_i64, %c0_i64, %c0_i64 {transpose = true} + pto.mte_l1_l0b %mat_id_m5, %l0b_m5, %c50_i64_m5, %c60_i64_m5, %c0_i64_m5, %c0_i64_m5 {transpose = true} : !pto.ptr, !pto.ptr, i64, i64, i64, i64 pto.set_flag["PIPE_MTE1", "PIPE_M", "EVENT_ID0"] pto.wait_flag["PIPE_MTE1", "PIPE_M", "EVENT_ID0"] - pto.mad %l0a, %l0b, %l0c, %c16_i64, %c16_i64, %c16_i64 + pto.mad %l0a_m5, %l0b_m5, %l0c_m5, %c40_i64_m5, %c60_i64_m5, %c50_i64_m5 : !pto.ptr, !pto.ptr, !pto.ptr, i64, i64, 
i64 pto.set_flag["PIPE_M", "PIPE_FIX", "EVENT_ID1"] pto.wait_flag["PIPE_M", "PIPE_FIX", "EVENT_ID1"] - pto.mte_l0c_gm %l0c, %out_gm, %c16_i64, %c16_i64, %c16_i64, %c16_i64, %c0_i64, %c0_i64, - nz2nd - : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64, i64 + pto.mte_l0c_gm %l0c_m5, %arg17, %c40_i64_m5, %c60_i64_m5, %c48_i64_m5, %c60_i64_m5, %c0_i64_m5, %c0_i64_m5, + nz2nd, + loop3(%c1_i64_m5, %c46080_i64_m5, %c2400_i64_m5) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64, i64, i64, i64, i64 pto.barrier #pto.pipe + return } } diff --git a/test/vpto/cases/micro-op/cube-matmul/cube-load-frac-layouts/launch.cpp b/test/vpto/cases/micro-op/cube-matmul/cube-load-frac-layouts/launch.cpp index 5af5bfd7a..82062e879 100644 --- a/test/vpto/cases/micro-op/cube-matmul/cube-load-frac-layouts/launch.cpp +++ b/test/vpto/cases/micro-op/cube-matmul/cube-load-frac-layouts/launch.cpp @@ -36,43 +36,44 @@ struct MrgSortExecutedNumList { #include "acl/acl.h" #endif -extern "C" __global__ [aicore] void cube_load_frac_nd2nz_kernel(__gm__ __fp16 *src, __gm__ __fp16 *id, - __gm__ float *out); -extern "C" __global__ [aicore] void cube_load_frac_dn2nz_kernel(__gm__ __fp16 *id, __gm__ __fp16 *src, - __gm__ float *out); -extern "C" __global__ [aicore] void cube_load_frac_nchw_nc1hwc0_kernel(__gm__ __fp16 *id, __gm__ __fp16 *src, - __gm__ float *out); -extern "C" __global__ [aicore] void cube_load_frac_nchw_fz4d_kernel(__gm__ __fp16 *id, __gm__ __fp16 *src, - __gm__ float *out); -extern "C" __global__ [aicore] void cube_load_frac_ncdhw_ndc1hwc0_kernel(__gm__ __fp16 *id, __gm__ __fp16 *src, - __gm__ float *out); -extern "C" __global__ [aicore] void cube_load_frac_ncdhw_fz3d_kernel(__gm__ __fp16 *id, __gm__ __fp16 *src, - __gm__ float *out); +extern "C" __global__ [aicore] void cube_load_frac_layouts_deep_merged_kernel( + __gm__ half * arg0, + __gm__ half * arg1, + __gm__ float * arg2, + __gm__ half * arg3, + __gm__ half * arg4, + __gm__ float * arg5, + __gm__ half * arg6, + __gm__ half * arg7, + __gm__ 
float * arg8, + __gm__ half * arg9, + __gm__ half * arg10, + __gm__ float * arg11, + __gm__ half * arg12, + __gm__ half * arg13, + __gm__ float * arg14, + __gm__ half * arg15, + __gm__ half * arg16, + __gm__ float * arg17); -void LaunchCube_load_frac_nd2nz_kernel(__fp16 *src, __fp16 *id, float *out, void *stream) { - cube_load_frac_nd2nz_kernel<<<1, nullptr, stream>>>((__gm__ __fp16 *)src, (__gm__ __fp16 *)id, (__gm__ float *)out); -} - -void LaunchCube_load_frac_dn2nz_kernel(__fp16 *id, __fp16 *src, float *out, void *stream) { - cube_load_frac_dn2nz_kernel<<<1, nullptr, stream>>>((__gm__ __fp16 *)id, (__gm__ __fp16 *)src, (__gm__ float *)out); -} - -void LaunchCube_load_frac_nchw_nc1hwc0_kernel(__fp16 *id, __fp16 *src, float *out, void *stream) { - cube_load_frac_nchw_nc1hwc0_kernel<<<1, nullptr, stream>>>((__gm__ __fp16 *)id, (__gm__ __fp16 *)src, - (__gm__ float *)out); -} - -void LaunchCube_load_frac_nchw_fz4d_kernel(__fp16 *id, __fp16 *src, float *out, void *stream) { - cube_load_frac_nchw_fz4d_kernel<<<1, nullptr, stream>>>((__gm__ __fp16 *)id, (__gm__ __fp16 *)src, - (__gm__ float *)out); -} - -void LaunchCube_load_frac_ncdhw_ndc1hwc0_kernel(__fp16 *id, __fp16 *src, float *out, void *stream) { - cube_load_frac_ncdhw_ndc1hwc0_kernel<<<1, nullptr, stream>>>((__gm__ __fp16 *)id, (__gm__ __fp16 *)src, - (__gm__ float *)out); -} - -void LaunchCube_load_frac_ncdhw_fz3d_kernel(__fp16 *id, __fp16 *src, float *out, void *stream) { - cube_load_frac_ncdhw_fz3d_kernel<<<1, nullptr, stream>>>((__gm__ __fp16 *)id, (__gm__ __fp16 *)src, - (__gm__ float *)out); +void LaunchCubeLoadFracLayoutsDeepMerged(__fp16 * p0, __fp16 * p1, float * p2, void *stream) { + cube_load_frac_layouts_deep_merged_kernel<<<1, nullptr, stream>>>( + (__gm__ half *)p0, + (__gm__ half *)p0, + (__gm__ float *)p0, + (__gm__ half *)p0, + (__gm__ half *)p0, + (__gm__ float *)p0, + (__gm__ half *)p0, + (__gm__ half *)p0, + (__gm__ float *)p0, + (__gm__ half *)p0, + (__gm__ half *)p0, + (__gm__ float *)p0, 
+ (__gm__ half *)p0, + (__gm__ half *)p0, + (__gm__ float *)p0, + (__gm__ half *)p0, + (__gm__ half *)p1, + (__gm__ float *)p2); } diff --git a/test/vpto/cases/micro-op/cube-matmul/cube-load-frac-layouts/main.cpp b/test/vpto/cases/micro-op/cube-matmul/cube-load-frac-layouts/main.cpp index 301c4d053..bc73a2587 100644 --- a/test/vpto/cases/micro-op/cube-matmul/cube-load-frac-layouts/main.cpp +++ b/test/vpto/cases/micro-op/cube-matmul/cube-load-frac-layouts/main.cpp @@ -47,7 +47,6 @@ struct MrgSortExecutedNumList { } \ } while (0) -void LaunchCube_load_frac_nd2nz_kernel(__fp16 *src, __fp16 *id, float *out, void *stream); static bool readExact(const char *path, void *dst, size_t size) { size_t inputSize = size; @@ -58,6 +57,7 @@ static bool writeExact(const char *path, void *src, size_t size) { return WriteFile(path, src, size); } +void LaunchCubeLoadFracLayoutsDeepMerged(__fp16 * p0, __fp16 * p1, float * p2, void *stream); int main() { constexpr size_t kNd2NzCase1LhsElem = 40 * 50; constexpr size_t kNd2NzCase1RhsElem = 50 * 60; @@ -114,8 +114,7 @@ int main() { ACL_CHECK(aclrtMemcpy(outNd2nzCase1Device, kNd2NzCase1OutSize, outNd2nzCase1Host, kNd2NzCase1OutSize, ACL_MEMCPY_HOST_TO_DEVICE)); - LaunchCube_load_frac_nd2nz_kernel(nd2nzCase1RhsDevice, nd2nzCase1LhsDevice, - outNd2nzCase1Device, stream); + LaunchCubeLoadFracLayoutsDeepMerged(nd2nzCase1RhsDevice, nd2nzCase1LhsDevice, outNd2nzCase1Device, stream); ACL_CHECK(aclrtSynchronizeStream(stream)); ACL_CHECK(aclrtMemcpy(outNd2nzCase1Host, kNd2NzCase1OutSize, outNd2nzCase1Device, diff --git a/test/vpto/cases/micro-op/dsa-sfu/vci-f16/compare.py b/test/vpto/cases/micro-op/dsa-sfu/vci-f16/compare.py deleted file mode 100755 index 8c2628b88..000000000 --- a/test/vpto/cases/micro-op/dsa-sfu/vci-f16/compare.py +++ /dev/null @@ -1,42 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) 2026 Huawei Technologies Co., Ltd. 
-# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. - -# case: micro-op/dsa-sfu/vci -# family: dsa-sfu / conversion -# target_ops: pto.vci -# scenarios: index-generation -# NOTE: bulk-generated coverage skeleton. - -import os -import sys -import numpy as np - - -def compare_bin(golden_path, output_path, dtype, eps): - if not os.path.exists(golden_path) or not os.path.exists(output_path): - return False - golden = np.fromfile(golden_path, dtype=dtype) - output = np.fromfile(output_path, dtype=dtype) - return golden.shape == output.shape and np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True) - - -def main(): - strict = os.getenv("COMPARE_STRICT", "1") != "0" - ok = compare_bin("golden_v2.bin", "v2.bin", np.float16, 0.001) - if not ok: - if strict: - print("[ERROR] compare failed") - sys.exit(2) - print("[WARN] compare failed (non-gating)") - return - print("[INFO] compare passed") - - -if __name__ == "__main__": - main() diff --git a/test/vpto/cases/micro-op/dsa-sfu/vci-f16/golden.py b/test/vpto/cases/micro-op/dsa-sfu/vci-f16/golden.py deleted file mode 100755 index c19fcdb99..000000000 --- a/test/vpto/cases/micro-op/dsa-sfu/vci-f16/golden.py +++ /dev/null @@ -1,49 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) 2026 Huawei Technologies Co., Ltd. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). 
-# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. - -# case: micro-op/dsa-sfu/vci -# family: dsa-sfu / conversion -# target_ops: pto.vci -# scenarios: index-generation -# NOTE: bulk-generated coverage skeleton. -# coding=utf-8 - -import argparse -from pathlib import Path - -import numpy as np - - -ROWS = 1 -COLS = 128 -SEED = 19 - - -def generate(output_dir: Path, seed: int) -> None: - _ = seed - v1 = np.zeros((ROWS, COLS), dtype=np.float16) - v2 = np.zeros((ROWS, COLS), dtype=np.float16) - golden_v2 = np.arange(ROWS * COLS, dtype=np.float16).reshape(ROWS, COLS) - - output_dir.mkdir(parents=True, exist_ok=True) - v1.reshape(-1).tofile(output_dir / "v1.bin") - v2.reshape(-1).tofile(output_dir / "v2.bin") - golden_v2.reshape(-1).tofile(output_dir / "golden_v2.bin") - - -def main() -> None: - parser = argparse.ArgumentParser() - parser.add_argument("--output-dir", type=Path, default=Path(".")) - parser.add_argument("--seed", type=int, default=SEED) - args = parser.parse_args() - generate(args.output_dir, args.seed) - - -if __name__ == "__main__": - main() diff --git a/test/vpto/cases/micro-op/dsa-sfu/vci-f16/kernel.pto b/test/vpto/cases/micro-op/dsa-sfu/vci-f16/kernel.pto deleted file mode 100644 index 52410408f..000000000 --- a/test/vpto/cases/micro-op/dsa-sfu/vci-f16/kernel.pto +++ /dev/null @@ -1,26 +0,0 @@ -module attributes {pto.target_arch = "a5", pto.kernel_kind = #pto.kernel_kind} { - func.func @vci_kernel_2d(%arg0: !pto.ptr, - %arg1: !pto.ptr) attributes {pto.kernel} { - %c0 = arith.constant 0 : index - %c0_i64 = arith.constant 0 : i64 - %c2_i64 = arith.constant 2 : i64 - %c128_i64 = arith.constant 128 : 
i64 - %cst = arith.constant 0.0 : f16 - - %ub_out = pto.castptr %c0_i64 : i64 -> !pto.ptr - - pto.vecscope { - %mask = pto.pset_b16 "PAT_ALL" : !pto.mask - %indices = pto.vci %cst {order = "ASC"} : f16 -> !pto.vreg<128xf16> - pto.vsts %indices, %ub_out[%c0], %mask : !pto.vreg<128xf16>, !pto.ptr, !pto.mask - } - - pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.mte_ub_gm %ub_out, %arg1, %c128_i64 - nburst(%c2_i64, %c128_i64, %c128_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64 - pto.barrier #pto.pipe - return - } -} diff --git a/test/vpto/cases/micro-op/dsa-sfu/vci-f16/launch.cpp b/test/vpto/cases/micro-op/dsa-sfu/vci-f16/launch.cpp deleted file mode 100644 index 8647dab79..000000000 --- a/test/vpto/cases/micro-op/dsa-sfu/vci-f16/launch.cpp +++ /dev/null @@ -1,44 +0,0 @@ -// Copyright (c) 2026 Huawei Technologies Co., Ltd. -// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). -// Please refer to the License for details. You may not use this file except in compliance with the License. -// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -// See LICENSE in the root of the software repository for the full text of the License. 
- -#ifndef __VEC_SCOPE__ -#define __VEC_SCOPE__ -#endif - -#if defined(__CCE_AICORE__) && defined(__NPU_ARCH__) && (__NPU_ARCH__ == 2201) -typedef struct { unsigned char v; } hifloat8_t; -typedef struct { unsigned char v; } float8_e4m3_t; -typedef struct { unsigned char v; } float8_e5m2_t; -typedef struct { unsigned char v; } float8_e8m0_t; -typedef struct { unsigned char v; } float4_e1m2x2_t; -typedef struct { unsigned char v; } float4_e2m1x2_t; -#endif -#include -#if defined(__CCE_AICORE__) && defined(PTOAS_ENABLE_CCE_PRINT) -#include -#endif - -#if !defined(__CCE_AICORE__) && !defined(TMRGSORT_HPP) -struct MrgSortExecutedNumList { - uint16_t mrgSortList0; - uint16_t mrgSortList1; - uint16_t mrgSortList2; - uint16_t mrgSortList3; -}; -#endif -#ifndef __CPU_SIM -#include "acl/acl.h" -#endif - -extern "C" __global__ [aicore] void vci_kernel_2d(__gm__ half *v1, - __gm__ half *v2); - -void LaunchVci_kernel_2d(aclFloat16 *v1, aclFloat16 *v2, void *stream) { - vci_kernel_2d<<<1, nullptr, stream>>>((__gm__ half *)v1, - (__gm__ half *)v2); -} diff --git a/test/vpto/cases/micro-op/dsa-sfu/vci-f16/main.cpp b/test/vpto/cases/micro-op/dsa-sfu/vci-f16/main.cpp deleted file mode 100644 index b628b0747..000000000 --- a/test/vpto/cases/micro-op/dsa-sfu/vci-f16/main.cpp +++ /dev/null @@ -1,79 +0,0 @@ -// Copyright (c) 2026 Huawei Technologies Co., Ltd. -// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). -// Please refer to the License for details. You may not use this file except in compliance with the License. -// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -// See LICENSE in the root of the software repository for the full text of the License. 
- -#include "test_common.h" -#include "acl/acl.h" -#include -#include - -using namespace PtoTestCommon; - -#define ACL_CHECK(expr) \ - do { \ - const aclError _ret = (expr); \ - if (_ret != ACL_SUCCESS) { \ - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", #expr, \ - (int)_ret, __FILE__, __LINE__); \ - rc = 1; \ - goto cleanup; \ - } \ - } while (0) - -void LaunchVci_kernel_2d(aclFloat16 *v1, aclFloat16 *v2, void *stream); - -int main() { - size_t elemCount_v1 = 128; - size_t fileSize_v1 = elemCount_v1 * sizeof(aclFloat16); - size_t elemCount_v2 = 128; - size_t fileSize_v2 = elemCount_v2 * sizeof(aclFloat16); - aclFloat16 *v1Host = nullptr; - aclFloat16 *v1Device = nullptr; - aclFloat16 *v2Host = nullptr; - aclFloat16 *v2Device = nullptr; - int rc = 0; - bool aclInited = false; - bool deviceSet = false; - int deviceId = 0; - aclrtStream stream = nullptr; - - ACL_CHECK(aclInit(nullptr)); - aclInited = true; - if (const char *envDevice = std::getenv("ACL_DEVICE_ID")) - deviceId = std::atoi(envDevice); - ACL_CHECK(aclrtSetDevice(deviceId)); - deviceSet = true; - ACL_CHECK(aclrtCreateStream(&stream)); - ACL_CHECK(aclrtMallocHost((void **)(&v1Host), fileSize_v1)); - ACL_CHECK(aclrtMallocHost((void **)(&v2Host), fileSize_v2)); - ACL_CHECK(aclrtMalloc((void **)&v1Device, fileSize_v1, ACL_MEM_MALLOC_HUGE_FIRST)); - ACL_CHECK(aclrtMalloc((void **)&v2Device, fileSize_v2, ACL_MEM_MALLOC_HUGE_FIRST)); - ReadFile("./v1.bin", fileSize_v1, v1Host, fileSize_v1); - ReadFile("./v2.bin", fileSize_v2, v2Host, fileSize_v2); - ACL_CHECK(aclrtMemcpy(v1Device, fileSize_v1, v1Host, fileSize_v1, - ACL_MEMCPY_HOST_TO_DEVICE)); - ACL_CHECK(aclrtMemcpy(v2Device, fileSize_v2, v2Host, fileSize_v2, - ACL_MEMCPY_HOST_TO_DEVICE)); - LaunchVci_kernel_2d(v1Device, v2Device, stream); - ACL_CHECK(aclrtSynchronizeStream(stream)); - ACL_CHECK(aclrtMemcpy(v2Host, fileSize_v2, v2Device, fileSize_v2, - ACL_MEMCPY_DEVICE_TO_HOST)); - WriteFile("./v2.bin", v2Host, fileSize_v2); - -cleanup: - 
aclrtFree(v1Device); - aclrtFree(v2Device); - aclrtFreeHost(v1Host); - aclrtFreeHost(v2Host); - if (stream != nullptr) - aclrtDestroyStream(stream); - if (deviceSet) - aclrtResetDevice(deviceId); - if (aclInited) - aclFinalize(); - return rc; -} diff --git a/test/vpto/cases/micro-op/dsa-sfu/vci-si8/compare.py b/test/vpto/cases/micro-op/dsa-sfu/vci-si8/compare.py deleted file mode 100755 index 326fa7450..000000000 --- a/test/vpto/cases/micro-op/dsa-sfu/vci-si8/compare.py +++ /dev/null @@ -1,42 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) 2026 Huawei Technologies Co., Ltd. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. - -# case: micro-op/dsa-sfu/vci -# family: dsa-sfu / conversion -# target_ops: pto.vci -# scenarios: index-generation -# NOTE: bulk-generated coverage skeleton. 
- -import os -import sys -import numpy as np - - -def compare_bin(golden_path, output_path, dtype, eps): - if not os.path.exists(golden_path) or not os.path.exists(output_path): - return False - golden = np.fromfile(golden_path, dtype=dtype) - output = np.fromfile(output_path, dtype=dtype) - return golden.shape == output.shape and np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True) - - -def main(): - strict = os.getenv("COMPARE_STRICT", "1") != "0" - ok = compare_bin("golden_v2.bin", "v2.bin", np.int8, 0) - if not ok: - if strict: - print("[ERROR] compare failed") - sys.exit(2) - print("[WARN] compare failed (non-gating)") - return - print("[INFO] compare passed") - - -if __name__ == "__main__": - main() diff --git a/test/vpto/cases/micro-op/dsa-sfu/vci-si8/golden.py b/test/vpto/cases/micro-op/dsa-sfu/vci-si8/golden.py deleted file mode 100755 index b3482d94d..000000000 --- a/test/vpto/cases/micro-op/dsa-sfu/vci-si8/golden.py +++ /dev/null @@ -1,49 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) 2026 Huawei Technologies Co., Ltd. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. - -# case: micro-op/dsa-sfu/vci -# family: dsa-sfu / conversion -# target_ops: pto.vci -# scenarios: index-generation -# NOTE: bulk-generated coverage skeleton. 
-# coding=utf-8 - -import argparse -from pathlib import Path - -import numpy as np - - -ROWS = 32 -COLS = 32 -SEED = 19 - - -def generate(output_dir: Path, seed: int) -> None: - _ = seed - v1 = np.zeros((ROWS, COLS), dtype=np.int8) - v2 = np.zeros((ROWS, COLS), dtype=np.int8) - golden_v2 = np.arange(ROWS * COLS, dtype=np.int32).astype(np.int8).reshape(ROWS, COLS) - - output_dir.mkdir(parents=True, exist_ok=True) - v1.reshape(-1).tofile(output_dir / "v1.bin") - v2.reshape(-1).tofile(output_dir / "v2.bin") - golden_v2.reshape(-1).tofile(output_dir / "golden_v2.bin") - - -def main() -> None: - parser = argparse.ArgumentParser() - parser.add_argument("--output-dir", type=Path, default=Path(".")) - parser.add_argument("--seed", type=int, default=SEED) - args = parser.parse_args() - generate(args.output_dir, args.seed) - - -if __name__ == "__main__": - main() diff --git a/test/vpto/cases/micro-op/dsa-sfu/vci-si8/kernel.pto b/test/vpto/cases/micro-op/dsa-sfu/vci-si8/kernel.pto deleted file mode 100644 index 5782e7d51..000000000 --- a/test/vpto/cases/micro-op/dsa-sfu/vci-si8/kernel.pto +++ /dev/null @@ -1,32 +0,0 @@ -module attributes {pto.target_arch = "a5", pto.kernel_kind = #pto.kernel_kind} { - func.func @vci_kernel_2d(%arg0: !pto.ptr, - %arg1: !pto.ptr) attributes {pto.kernel} { - %c0 = arith.constant 0 : index - %c256 = arith.constant 256 : index - %c1024 = arith.constant 1024 : index - %c0_i64 = arith.constant 0 : i64 - %c8_i64 = arith.constant 8 : i64 - %c128_i64 = arith.constant 128 : i64 - %c1024_i32 = arith.constant 1024 : i32 - - %ub_out = pto.castptr %c0_i64 : i64 -> !pto.ptr - - pto.vecscope { - %_:1 = scf.for %offset = %c0 to %c1024 step %c256 iter_args(%remaining = %c1024_i32) -> (i32) { - %mask, %next_remaining = pto.plt_b8 %remaining : i32 -> !pto.mask, i32 - %base = arith.index_castui %offset : index to i8 - %indices = pto.vci %base {order = "ASC"} : i8 -> !pto.vreg<256xsi8> - pto.vsts %indices, %ub_out[%offset], %mask : !pto.vreg<256xsi8>, !pto.ptr, 
!pto.mask - scf.yield %next_remaining : i32 - } - } - - pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.mte_ub_gm %ub_out, %arg1, %c128_i64 - nburst(%c8_i64, %c128_i64, %c128_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64 - pto.barrier #pto.pipe - return - } -} diff --git a/test/vpto/cases/micro-op/dsa-sfu/vci-si8/launch.cpp b/test/vpto/cases/micro-op/dsa-sfu/vci-si8/launch.cpp deleted file mode 100644 index 0b3f33084..000000000 --- a/test/vpto/cases/micro-op/dsa-sfu/vci-si8/launch.cpp +++ /dev/null @@ -1,44 +0,0 @@ -// Copyright (c) 2026 Huawei Technologies Co., Ltd. -// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). -// Please refer to the License for details. You may not use this file except in compliance with the License. -// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -// See LICENSE in the root of the software repository for the full text of the License. 
- -#ifndef __VEC_SCOPE__ -#define __VEC_SCOPE__ -#endif - -#if defined(__CCE_AICORE__) && defined(__NPU_ARCH__) && (__NPU_ARCH__ == 2201) -typedef struct { unsigned char v; } hifloat8_t; -typedef struct { unsigned char v; } float8_e4m3_t; -typedef struct { unsigned char v; } float8_e5m2_t; -typedef struct { unsigned char v; } float8_e8m0_t; -typedef struct { unsigned char v; } float4_e1m2x2_t; -typedef struct { unsigned char v; } float4_e2m1x2_t; -#endif -#include -#if defined(__CCE_AICORE__) && defined(PTOAS_ENABLE_CCE_PRINT) -#include -#endif - -#if !defined(__CCE_AICORE__) && !defined(TMRGSORT_HPP) -struct MrgSortExecutedNumList { - uint16_t mrgSortList0; - uint16_t mrgSortList1; - uint16_t mrgSortList2; - uint16_t mrgSortList3; -}; -#endif -#ifndef __CPU_SIM -#include "acl/acl.h" -#endif - -extern "C" __global__ [aicore] void vci_kernel_2d(__gm__ int8_t *v1, - __gm__ int8_t *v2); - -void LaunchVci_kernel_2d(int8_t *v1, int8_t *v2, void *stream) { - vci_kernel_2d<<<1, nullptr, stream>>>((__gm__ int8_t *)v1, - (__gm__ int8_t *)v2); -} diff --git a/test/vpto/cases/micro-op/dsa-sfu/vci-si8/main.cpp b/test/vpto/cases/micro-op/dsa-sfu/vci-si8/main.cpp deleted file mode 100644 index 204d6efa9..000000000 --- a/test/vpto/cases/micro-op/dsa-sfu/vci-si8/main.cpp +++ /dev/null @@ -1,80 +0,0 @@ -// Copyright (c) 2026 Huawei Technologies Co., Ltd. -// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). -// Please refer to the License for details. You may not use this file except in compliance with the License. -// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -// See LICENSE in the root of the software repository for the full text of the License. 
- -#include "test_common.h" -#include "acl/acl.h" -#include -#include -#include - -using namespace PtoTestCommon; - -#define ACL_CHECK(expr) \ - do { \ - const aclError _ret = (expr); \ - if (_ret != ACL_SUCCESS) { \ - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", #expr, \ - (int)_ret, __FILE__, __LINE__); \ - rc = 1; \ - goto cleanup; \ - } \ - } while (0) - -void LaunchVci_kernel_2d(int8_t *v1, int8_t *v2, void *stream); - -int main() { - size_t elemCount_v1 = 1024; - size_t fileSize_v1 = elemCount_v1 * sizeof(int8_t); - size_t elemCount_v2 = 1024; - size_t fileSize_v2 = elemCount_v2 * sizeof(int8_t); - int8_t *v1Host = nullptr; - int8_t *v1Device = nullptr; - int8_t *v2Host = nullptr; - int8_t *v2Device = nullptr; - int rc = 0; - bool aclInited = false; - bool deviceSet = false; - int deviceId = 0; - aclrtStream stream = nullptr; - - ACL_CHECK(aclInit(nullptr)); - aclInited = true; - if (const char *envDevice = std::getenv("ACL_DEVICE_ID")) - deviceId = std::atoi(envDevice); - ACL_CHECK(aclrtSetDevice(deviceId)); - deviceSet = true; - ACL_CHECK(aclrtCreateStream(&stream)); - ACL_CHECK(aclrtMallocHost((void **)(&v1Host), fileSize_v1)); - ACL_CHECK(aclrtMallocHost((void **)(&v2Host), fileSize_v2)); - ACL_CHECK(aclrtMalloc((void **)&v1Device, fileSize_v1, ACL_MEM_MALLOC_HUGE_FIRST)); - ACL_CHECK(aclrtMalloc((void **)&v2Device, fileSize_v2, ACL_MEM_MALLOC_HUGE_FIRST)); - ReadFile("./v1.bin", fileSize_v1, v1Host, fileSize_v1); - ReadFile("./v2.bin", fileSize_v2, v2Host, fileSize_v2); - ACL_CHECK(aclrtMemcpy(v1Device, fileSize_v1, v1Host, fileSize_v1, - ACL_MEMCPY_HOST_TO_DEVICE)); - ACL_CHECK(aclrtMemcpy(v2Device, fileSize_v2, v2Host, fileSize_v2, - ACL_MEMCPY_HOST_TO_DEVICE)); - LaunchVci_kernel_2d(v1Device, v2Device, stream); - ACL_CHECK(aclrtSynchronizeStream(stream)); - ACL_CHECK(aclrtMemcpy(v2Host, fileSize_v2, v2Device, fileSize_v2, - ACL_MEMCPY_DEVICE_TO_HOST)); - WriteFile("./v2.bin", v2Host, fileSize_v2); - -cleanup: - aclrtFree(v1Device); - 
aclrtFree(v2Device); - aclrtFreeHost(v1Host); - aclrtFreeHost(v2Host); - if (stream != nullptr) - aclrtDestroyStream(stream); - if (deviceSet) - aclrtResetDevice(deviceId); - if (aclInited) - aclFinalize(); - return rc; -} diff --git a/test/vpto/cases/micro-op/dsa-sfu/vci/kernel.pto b/test/vpto/cases/micro-op/dsa-sfu/vci/kernel.pto index 3c4c15889..bc41e4701 100644 --- a/test/vpto/cases/micro-op/dsa-sfu/vci/kernel.pto +++ b/test/vpto/cases/micro-op/dsa-sfu/vci/kernel.pto @@ -1,32 +1,91 @@ module attributes {pto.target_arch = "a5", pto.kernel_kind = #pto.kernel_kind} { - func.func @vci_kernel_2d(%arg0: !pto.ptr, - %arg1: !pto.ptr) attributes {pto.kernel} { - %c0 = arith.constant 0 : index - %c64 = arith.constant 64 : index - %c1024 = arith.constant 1024 : index - %c0_i64 = arith.constant 0 : i64 - %c32_i64 = arith.constant 32 : i64 - %c128_i64 = arith.constant 128 : i64 - %c1024_i32 = arith.constant 1024 : i32 - - %ub_out = pto.castptr %c0_i64 : i64 -> !pto.ptr + func.func @vci_deep_merged_kernel(%arg0: !pto.ptr, %arg1: !pto.ptr, %arg2: !pto.ptr, %arg3: !pto.ptr, %arg4: !pto.ptr, %arg5: !pto.ptr) attributes {pto.kernel} { + %__deep_merge_guard = arith.constant false + // inactive merged from vci_kernel_2d_vci_f16 + scf.if %__deep_merge_guard { + + %c0_m0 = arith.constant 0 : index + %c0_i64_m0 = arith.constant 0 : i64 + %c2_i64_m0 = arith.constant 2 : i64 + %c128_i64_m0 = arith.constant 128 : i64 + %cst_m0 = arith.constant 0.0 : f16 + + %ub_out_m0 = pto.castptr %c0_i64_m0 : i64 -> !pto.ptr + + pto.vecscope { + %mask_m0 = pto.pset_b16 "PAT_ALL" : !pto.mask + %indices_m0 = pto.vci %cst_m0 {order = "ASC"} : f16 -> !pto.vreg<128xf16> + pto.vsts %indices_m0, %ub_out_m0[%c0_m0], %mask_m0 : !pto.vreg<128xf16>, !pto.ptr, !pto.mask + } + + pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] + pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] + pto.mte_ub_gm %ub_out_m0, %arg1, %c128_i64_m0 + nburst(%c2_i64_m0, %c128_i64_m0, %c128_i64_m0) + : !pto.ptr, !pto.ptr, i64, i64, i64, 
i64 + pto.barrier #pto.pipe + + } + // inactive merged from vci_kernel_2d_vci_si8 + scf.if %__deep_merge_guard { + + %c0_m1 = arith.constant 0 : index + %c256_m1 = arith.constant 256 : index + %c1024_m1 = arith.constant 1024 : index + %c0_i64_m1 = arith.constant 0 : i64 + %c8_i64_m1 = arith.constant 8 : i64 + %c128_i64_m1 = arith.constant 128 : i64 + %c1024_i32_m1 = arith.constant 1024 : i32 + + %ub_out_m1 = pto.castptr %c0_i64_m1 : i64 -> !pto.ptr pto.vecscope { - %_:1 = scf.for %offset = %c0 to %c1024 step %c64 iter_args(%remaining = %c1024_i32) -> (i32) { - %mask, %next_remaining = pto.plt_b32 %remaining : i32 -> !pto.mask, i32 - %base = arith.index_castui %offset : index to i32 - %indices = pto.vci %base {order = "ASC"} : i32 -> !pto.vreg<64xsi32> - pto.vsts %indices, %ub_out[%offset], %mask : !pto.vreg<64xsi32>, !pto.ptr, !pto.mask - scf.yield %next_remaining : i32 + %__m1:1 = scf.for %offset_m1 = %c0_m1 to %c1024_m1 step %c256_m1 iter_args(%remaining_m1 = %c1024_i32_m1) -> (i32) { + %mask_m1, %next_remaining_m1 = pto.plt_b8 %remaining_m1 : i32 -> !pto.mask, i32 + %base_m1 = arith.index_castui %offset_m1 : index to i8 + %indices_m1 = pto.vci %base_m1 {order = "ASC"} : i8 -> !pto.vreg<256xsi8> + pto.vsts %indices_m1, %ub_out_m1[%offset_m1], %mask_m1 : !pto.vreg<256xsi8>, !pto.ptr, !pto.mask + scf.yield %next_remaining_m1 : i32 } } pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.mte_ub_gm %ub_out, %arg1, %c128_i64 - nburst(%c32_i64, %c128_i64, %c128_i64) + pto.mte_ub_gm %ub_out_m1, %arg3, %c128_i64_m1 + nburst(%c8_i64_m1, %c128_i64_m1, %c128_i64_m1) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64 + pto.barrier #pto.pipe + + } + // active merged from vci_kernel_2d + + %c0_m2 = arith.constant 0 : index + %c64_m2 = arith.constant 64 : index + %c1024_m2 = arith.constant 1024 : index + %c0_i64_m2 = arith.constant 0 : i64 + %c32_i64_m2 = arith.constant 32 : i64 + %c128_i64_m2 = arith.constant 128 : i64 + 
%c1024_i32_m2 = arith.constant 1024 : i32 + + %ub_out_m2 = pto.castptr %c0_i64_m2 : i64 -> !pto.ptr + + pto.vecscope { + %__m2:1 = scf.for %offset_m2 = %c0_m2 to %c1024_m2 step %c64_m2 iter_args(%remaining_m2 = %c1024_i32_m2) -> (i32) { + %mask_m2, %next_remaining_m2 = pto.plt_b32 %remaining_m2 : i32 -> !pto.mask, i32 + %base_m2 = arith.index_castui %offset_m2 : index to i32 + %indices_m2 = pto.vci %base_m2 {order = "ASC"} : i32 -> !pto.vreg<64xsi32> + pto.vsts %indices_m2, %ub_out_m2[%offset_m2], %mask_m2 : !pto.vreg<64xsi32>, !pto.ptr, !pto.mask + scf.yield %next_remaining_m2 : i32 + } + } + + pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] + pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] + pto.mte_ub_gm %ub_out_m2, %arg5, %c128_i64_m2 + nburst(%c32_i64_m2, %c128_i64_m2, %c128_i64_m2) : !pto.ptr, !pto.ptr, i64, i64, i64, i64 pto.barrier #pto.pipe + return } } diff --git a/test/vpto/cases/micro-op/dsa-sfu/vci/launch.cpp b/test/vpto/cases/micro-op/dsa-sfu/vci/launch.cpp index 0ce203973..976568c69 100644 --- a/test/vpto/cases/micro-op/dsa-sfu/vci/launch.cpp +++ b/test/vpto/cases/micro-op/dsa-sfu/vci/launch.cpp @@ -5,11 +5,9 @@ // THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, // INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. // See LICENSE in the root of the software repository for the full text of the License. 
- #ifndef __VEC_SCOPE__ #define __VEC_SCOPE__ #endif - #if defined(__CCE_AICORE__) && defined(__NPU_ARCH__) && (__NPU_ARCH__ == 2201) typedef struct { unsigned char v; } hifloat8_t; typedef struct { unsigned char v; } float8_e4m3_t; @@ -22,23 +20,27 @@ typedef struct { unsigned char v; } float4_e2m1x2_t; #if defined(__CCE_AICORE__) && defined(PTOAS_ENABLE_CCE_PRINT) #include #endif - #if !defined(__CCE_AICORE__) && !defined(TMRGSORT_HPP) -struct MrgSortExecutedNumList { - uint16_t mrgSortList0; - uint16_t mrgSortList1; - uint16_t mrgSortList2; - uint16_t mrgSortList3; -}; +struct MrgSortExecutedNumList { uint16_t mrgSortList0,mrgSortList1,mrgSortList2,mrgSortList3; }; #endif #ifndef __CPU_SIM #include "acl/acl.h" #endif -extern "C" __global__ [aicore] void vci_kernel_2d(__gm__ int32_t *v1, - __gm__ int32_t *v2); +extern "C" __global__ [aicore] void vci_deep_merged_kernel( + __gm__ half * arg0, + __gm__ half * arg1, + __gm__ int8_t * arg2, + __gm__ int8_t * arg3, + __gm__ int32_t * arg4, + __gm__ int32_t * arg5); -void LaunchVci_kernel_2d(int32_t *v1, int32_t *v2, void *stream) { - vci_kernel_2d<<<1, nullptr, stream>>>((__gm__ int32_t *)v1, - (__gm__ int32_t *)v2); +void LaunchVciDeepMerged(int32_t * p0, int32_t * p1, void *stream) { + vci_deep_merged_kernel<<<1, nullptr, stream>>>( + (__gm__ half *)p0, + (__gm__ half *)p0, + (__gm__ int8_t *)p0, + (__gm__ int8_t *)p0, + (__gm__ int32_t *)p0, + (__gm__ int32_t *)p1); } diff --git a/test/vpto/cases/micro-op/dsa-sfu/vci/main.cpp b/test/vpto/cases/micro-op/dsa-sfu/vci/main.cpp index 0baf928bd..cb78fac95 100644 --- a/test/vpto/cases/micro-op/dsa-sfu/vci/main.cpp +++ b/test/vpto/cases/micro-op/dsa-sfu/vci/main.cpp @@ -25,8 +25,8 @@ using namespace PtoTestCommon; } \ } while (0) -void LaunchVci_kernel_2d(int32_t *v1, int32_t *v2, void *stream); +void LaunchVciDeepMerged(int32_t * p0, int32_t * p1, void *stream); int main() { size_t elemCount_v1 = 1024; size_t fileSize_v1 = elemCount_v1 * sizeof(int32_t); @@ -59,7 +59,7 @@ 
int main() { ACL_MEMCPY_HOST_TO_DEVICE)); ACL_CHECK(aclrtMemcpy(v2Device, fileSize_v2, v2Host, fileSize_v2, ACL_MEMCPY_HOST_TO_DEVICE)); - LaunchVci_kernel_2d(v1Device, v2Device, stream); + LaunchVciDeepMerged(v1Device, v2Device, stream); ACL_CHECK(aclrtSynchronizeStream(stream)); ACL_CHECK(aclrtMemcpy(v2Host, fileSize_v2, v2Device, fileSize_v2, ACL_MEMCPY_DEVICE_TO_HOST)); diff --git a/test/vpto/cases/micro-op/dsa-sfu/vexpdiff-boundary/compare.py b/test/vpto/cases/micro-op/dsa-sfu/vexpdiff-boundary/compare.py deleted file mode 100755 index 5353e5df9..000000000 --- a/test/vpto/cases/micro-op/dsa-sfu/vexpdiff-boundary/compare.py +++ /dev/null @@ -1,42 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) 2026 Huawei Technologies Co., Ltd. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. - -# case: micro-op/dsa-sfu/vexpdif-boundary -# family: dsa-sfu -# target_ops: pto.vexpdif -# scenarios: core-f32, fused-expdiff, exceptional-values, floating-overflow-underflow -# NOTE: bulk-generated coverage skeleton. 
- -import os -import sys -import numpy as np - - -def compare_bin(golden_path, output_path, dtype, eps): - if not os.path.exists(golden_path) or not os.path.exists(output_path): - return False - golden = np.fromfile(golden_path, dtype=dtype) - output = np.fromfile(output_path, dtype=dtype) - return golden.shape == output.shape and np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True) - - -def main(): - strict = os.getenv("COMPARE_STRICT", "1") != "0" - ok = compare_bin("golden_v3.bin", "v3.bin", np.float32, 1e-4) - if not ok: - if strict: - print("[ERROR] compare failed") - sys.exit(2) - print("[WARN] compare failed (non-gating)") - return - print("[INFO] compare passed") - - -if __name__ == "__main__": - main() diff --git a/test/vpto/cases/micro-op/dsa-sfu/vexpdiff-boundary/golden.py b/test/vpto/cases/micro-op/dsa-sfu/vexpdiff-boundary/golden.py deleted file mode 100755 index b4b417320..000000000 --- a/test/vpto/cases/micro-op/dsa-sfu/vexpdiff-boundary/golden.py +++ /dev/null @@ -1,65 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) 2026 Huawei Technologies Co., Ltd. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. - -# case: micro-op/dsa-sfu/vexpdif-boundary -# family: dsa-sfu -# target_ops: pto.vexpdif -# scenarios: core-f32, fused-expdiff, exceptional-values, floating-overflow-underflow -# NOTE: bulk-generated coverage skeleton. 
-# coding=utf-8 - -import argparse -from pathlib import Path - -import numpy as np - - -ROWS = 32 -COLS = 32 -SEED = 19 -def generate(output_dir: Path, seed: int) -> None: - del seed - src_pattern = np.array( - [ - 0.0, 88.0, -120.0, np.nan, np.inf, -np.inf, 1.0, -1.0, - 90.0, -90.0, 50.0, -50.0, 3.0, -3.0, 10.0, -10.0, - ], - dtype=np.float32, - ) - max_pattern = np.array( - [ - 0.0, 0.0, 0.0, 1.0, np.inf, -np.inf, -1.0, 1.0, - 0.0, 0.0, 100.0, -100.0, 3.0, -3.0, 20.0, -20.0, - ], - dtype=np.float32, - ) - flat_src = np.resize(src_pattern, ROWS * COLS).astype(np.float32, copy=False) - flat_max = np.resize(max_pattern, ROWS * COLS).astype(np.float32, copy=False) - v1 = flat_src.reshape(ROWS, COLS) - v2 = flat_max.reshape(ROWS, COLS) - v3 = np.zeros((ROWS, COLS), dtype=np.float32) - golden_v3 = np.exp(flat_src - flat_max).astype(np.float32, copy=False).reshape(ROWS, COLS) - - output_dir.mkdir(parents=True, exist_ok=True) - v1.reshape(-1).tofile(output_dir / "v1.bin") - v2.reshape(-1).tofile(output_dir / "v2.bin") - v3.reshape(-1).tofile(output_dir / "v3.bin") - golden_v3.reshape(-1).tofile(output_dir / "golden_v3.bin") - - -def main() -> None: - parser = argparse.ArgumentParser() - parser.add_argument("--output-dir", type=Path, default=Path(".")) - parser.add_argument("--seed", type=int, default=SEED) - args = parser.parse_args() - generate(args.output_dir, args.seed) - - -if __name__ == "__main__": - main() diff --git a/test/vpto/cases/micro-op/dsa-sfu/vexpdiff-boundary/kernel.pto b/test/vpto/cases/micro-op/dsa-sfu/vexpdiff-boundary/kernel.pto deleted file mode 100644 index ab3ed968a..000000000 --- a/test/vpto/cases/micro-op/dsa-sfu/vexpdiff-boundary/kernel.pto +++ /dev/null @@ -1,56 +0,0 @@ -// ----------------------------------------------------------------------------- -// case: micro-op/dsa-sfu/vexpdif-boundary -// family: dsa-sfu -// target_ops: pto.vexpdif -// scenarios: core-f32, fused-expdiff, exceptional-values, floating-overflow-underflow -// NOTE: 
bulk-generated coverage skeleton. Parser/verifier/lowering failure is -// still a valid test conclusion in the current coverage-first phase. -// ----------------------------------------------------------------------------- -module attributes {pto.target_arch = "a5", pto.kernel_kind = #pto.kernel_kind} { - func.func @vexpdif_boundary_kernel_2d(%arg0: !pto.ptr, - %arg1: !pto.ptr, - %arg2: !pto.ptr) attributes {pto.kernel} { - %c0 = arith.constant 0 : index - %c64 = arith.constant 64 : index - %c1024 = arith.constant 1024 : index - %c0_i64 = arith.constant 0 : i64 - %c1_i64 = arith.constant 1 : i64 - %c32_i64 = arith.constant 32 : i64 - %c128_i64 = arith.constant 128 : i64 - %c4096_i64 = arith.constant 4096 : i64 - %c8192_i64 = arith.constant 8192 : i64 - - %ub_in = pto.castptr %c0_i64 : i64 -> !pto.ptr - %ub_max = pto.castptr %c4096_i64 : i64 -> !pto.ptr - %ub_out = pto.castptr %c8192_i64 : i64 -> !pto.ptr - - %false = arith.constant false - pto.mte_gm_ub %arg0, %ub_in, %c0_i64, %c128_i64 - nburst(%c32_i64, %c128_i64, %c128_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 - pto.mte_gm_ub %arg1, %ub_max, %c0_i64, %c128_i64 - nburst(%c32_i64, %c128_i64, %c128_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 - - pto.set_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] - pto.wait_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] - - pto.vecscope { - %mask = pto.pset_b32 "PAT_ALL" : !pto.mask - scf.for %offset = %c0 to %c1024 step %c64 { - %vec = pto.vlds %ub_in[%offset] : !pto.ptr -> !pto.vreg<64xf32> - %max = pto.vlds %ub_max[%offset] : !pto.ptr -> !pto.vreg<64xf32> - %sum = pto.vexpdif %vec, %max, %mask, "ODD" : !pto.vreg<64xf32>, !pto.vreg<64xf32>, !pto.mask -> !pto.vreg<64xf32> - pto.vsts %sum, %ub_out[%offset], %mask : !pto.vreg<64xf32>, !pto.ptr, !pto.mask - } - } - - pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.mte_ub_gm %ub_out, %arg2, %c128_i64 - nburst(%c32_i64, %c128_i64, %c128_i64) - : !pto.ptr, !pto.ptr, 
i64, i64, i64, i64 - pto.barrier #pto.pipe - return - } -} diff --git a/test/vpto/cases/micro-op/dsa-sfu/vexpdiff-boundary/launch.cpp b/test/vpto/cases/micro-op/dsa-sfu/vexpdiff-boundary/launch.cpp deleted file mode 100644 index e2f5057e6..000000000 --- a/test/vpto/cases/micro-op/dsa-sfu/vexpdiff-boundary/launch.cpp +++ /dev/null @@ -1,54 +0,0 @@ -// Copyright (c) 2026 Huawei Technologies Co., Ltd. -// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). -// Please refer to the License for details. You may not use this file except in compliance with the License. -// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -// See LICENSE in the root of the software repository for the full text of the License. - -// ----------------------------------------------------------------------------- -// case: micro-op/dsa-sfu/vexpdif-boundary -// family: dsa-sfu -// target_ops: pto.vexpdif -// scenarios: core-f32, fused-expdiff, exceptional-values, floating-overflow-underflow -// NOTE: bulk-generated coverage skeleton. Parser/verifier/lowering failure is -// still a valid test conclusion in the current coverage-first phase. 
-// ----------------------------------------------------------------------------- -#ifndef __VEC_SCOPE__ -#define __VEC_SCOPE__ -#endif - -#if defined(__CCE_AICORE__) && defined(__NPU_ARCH__) && (__NPU_ARCH__ == 2201) -typedef struct { unsigned char v; } hifloat8_t; -typedef struct { unsigned char v; } float8_e4m3_t; -typedef struct { unsigned char v; } float8_e5m2_t; -typedef struct { unsigned char v; } float8_e8m0_t; -typedef struct { unsigned char v; } float4_e1m2x2_t; -typedef struct { unsigned char v; } float4_e2m1x2_t; -#endif -#include -#if defined(__CCE_AICORE__) && defined(PTOAS_ENABLE_CCE_PRINT) -#include -#endif - -#if !defined(__CCE_AICORE__) && !defined(TMRGSORT_HPP) -struct MrgSortExecutedNumList { - uint16_t mrgSortList0; - uint16_t mrgSortList1; - uint16_t mrgSortList2; - uint16_t mrgSortList3; -}; -#endif -#ifndef __CPU_SIM -#include "acl/acl.h" -#endif - -extern "C" __global__ [aicore] void vexpdif_boundary_kernel_2d(__gm__ float *v1, - __gm__ float *v2, - __gm__ float *v3); - -void LaunchVexpdiff_boundary_kernel_2d(float *v1, float *v2, float *v3, void *stream) { - vexpdif_boundary_kernel_2d<<<1, nullptr, stream>>>((__gm__ float *)v1, - (__gm__ float *)v2, - (__gm__ float *)v3); -} diff --git a/test/vpto/cases/micro-op/dsa-sfu/vexpdiff-boundary/main.cpp b/test/vpto/cases/micro-op/dsa-sfu/vexpdiff-boundary/main.cpp deleted file mode 100644 index 3f29604cb..000000000 --- a/test/vpto/cases/micro-op/dsa-sfu/vexpdiff-boundary/main.cpp +++ /dev/null @@ -1,102 +0,0 @@ -// Copyright (c) 2026 Huawei Technologies Co., Ltd. -// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). -// Please refer to the License for details. You may not use this file except in compliance with the License. 
-// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -// See LICENSE in the root of the software repository for the full text of the License. - -// ----------------------------------------------------------------------------- -// case: micro-op/dsa-sfu/vexpdif-boundary -// family: dsa-sfu -// target_ops: pto.vexpdif -// scenarios: core-f32, fused-expdiff, exceptional-values, floating-overflow-underflow -// NOTE: bulk-generated coverage skeleton. Parser/verifier/lowering failure is -// still a valid test conclusion in the current coverage-first phase. -// ----------------------------------------------------------------------------- -/** -Copyright (c) 2025 Huawei Technologies Co., Ltd. -*/ - -#include "test_common.h" -#include "acl/acl.h" -#include -#include - -using namespace PtoTestCommon; - -#define ACL_CHECK(expr) \ - do { \ - const aclError _ret = (expr); \ - if (_ret != ACL_SUCCESS) { \ - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", #expr, \ - (int)_ret, __FILE__, __LINE__); \ - rc = 1; \ - goto cleanup; \ - } \ - } while (0) - -void LaunchVexpdiff_boundary_kernel_2d(float *v1, float *v2, float *v3, void *stream); - -int main() { - size_t elemCount_v1 = 1024; - size_t fileSize_v1 = elemCount_v1 * sizeof(float); - size_t elemCount_v2 = 1024; - size_t fileSize_v2 = elemCount_v2 * sizeof(float); - size_t elemCount_v3 = 1024; - size_t fileSize_v3 = elemCount_v3 * sizeof(float); - float *v1Host = nullptr; - float *v1Device = nullptr; - float *v2Host = nullptr; - float *v2Device = nullptr; - float *v3Host = nullptr; - float *v3Device = nullptr; - int rc = 0; - bool aclInited = false; - bool deviceSet = false; - int deviceId = 0; - aclrtStream stream = nullptr; - - ACL_CHECK(aclInit(nullptr)); - aclInited = true; - if (const char *envDevice = std::getenv("ACL_DEVICE_ID")) - deviceId = 
std::atoi(envDevice); - ACL_CHECK(aclrtSetDevice(deviceId)); - deviceSet = true; - ACL_CHECK(aclrtCreateStream(&stream)); - ACL_CHECK(aclrtMallocHost((void **)(&v1Host), fileSize_v1)); - ACL_CHECK(aclrtMallocHost((void **)(&v2Host), fileSize_v2)); - ACL_CHECK(aclrtMallocHost((void **)(&v3Host), fileSize_v3)); - ACL_CHECK(aclrtMalloc((void **)&v1Device, fileSize_v1, ACL_MEM_MALLOC_HUGE_FIRST)); - ACL_CHECK(aclrtMalloc((void **)&v2Device, fileSize_v2, ACL_MEM_MALLOC_HUGE_FIRST)); - ACL_CHECK(aclrtMalloc((void **)&v3Device, fileSize_v3, ACL_MEM_MALLOC_HUGE_FIRST)); - ReadFile("./v1.bin", fileSize_v1, v1Host, fileSize_v1); - ReadFile("./v2.bin", fileSize_v2, v2Host, fileSize_v2); - ReadFile("./v3.bin", fileSize_v3, v3Host, fileSize_v3); - ACL_CHECK(aclrtMemcpy(v1Device, fileSize_v1, v1Host, fileSize_v1, - ACL_MEMCPY_HOST_TO_DEVICE)); - ACL_CHECK(aclrtMemcpy(v2Device, fileSize_v2, v2Host, fileSize_v2, - ACL_MEMCPY_HOST_TO_DEVICE)); - ACL_CHECK(aclrtMemcpy(v3Device, fileSize_v3, v3Host, fileSize_v3, - ACL_MEMCPY_HOST_TO_DEVICE)); - LaunchVexpdiff_boundary_kernel_2d(v1Device, v2Device, v3Device, stream); - ACL_CHECK(aclrtSynchronizeStream(stream)); - ACL_CHECK(aclrtMemcpy(v3Host, fileSize_v3, v3Device, fileSize_v3, - ACL_MEMCPY_DEVICE_TO_HOST)); - WriteFile("./v3.bin", v3Host, fileSize_v3); - -cleanup: - aclrtFree(v1Device); - aclrtFree(v2Device); - aclrtFree(v3Device); - aclrtFreeHost(v1Host); - aclrtFreeHost(v2Host); - aclrtFreeHost(v3Host); - if (stream != nullptr) - aclrtDestroyStream(stream); - if (deviceSet) - aclrtResetDevice(deviceId); - if (aclInited) - aclFinalize(); - return rc; -} diff --git a/test/vpto/cases/micro-op/dsa-sfu/vexpdiff-f16-part/compare.py b/test/vpto/cases/micro-op/dsa-sfu/vexpdiff-f16-part/compare.py deleted file mode 100644 index 8ca6af6cf..000000000 --- a/test/vpto/cases/micro-op/dsa-sfu/vexpdiff-f16-part/compare.py +++ /dev/null @@ -1,44 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) 2026 Huawei Technologies Co., Ltd. 
-# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. - -# case: micro-op/dsa-sfu/vexpdif-f16-part -# family: dsa-sfu -# target_ops: pto.vexpdif -# scenarios: core-f16, fused-expdiff, part-even-odd - -import os -import sys - -import numpy as np - - -def compare_bin(golden_path, output_path, dtype, eps): - if not os.path.exists(golden_path) or not os.path.exists(output_path): - return False - golden = np.fromfile(golden_path, dtype=dtype) - output = np.fromfile(output_path, dtype=dtype) - return golden.shape == output.shape and np.allclose( - golden, output, atol=eps, rtol=eps, equal_nan=True - ) - - -def main(): - strict = os.getenv("COMPARE_STRICT", "1") != "0" - ok = compare_bin("golden_v3.bin", "v3.bin", np.float32, 1e-4) - if not ok: - if strict: - print("[ERROR] compare failed") - sys.exit(2) - print("[WARN] compare failed (non-gating)") - return - print("[INFO] compare passed") - - -if __name__ == "__main__": - main() diff --git a/test/vpto/cases/micro-op/dsa-sfu/vexpdiff-f16-part/golden.py b/test/vpto/cases/micro-op/dsa-sfu/vexpdiff-f16-part/golden.py deleted file mode 100644 index 1d493c5bb..000000000 --- a/test/vpto/cases/micro-op/dsa-sfu/vexpdiff-f16-part/golden.py +++ /dev/null @@ -1,61 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) 2026 Huawei Technologies Co., Ltd. 
-# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. - -# case: micro-op/dsa-sfu/vexpdif-f16-part -# family: dsa-sfu -# target_ops: pto.vexpdif -# scenarios: core-f16, fused-expdiff, part-even-odd - -import argparse -from pathlib import Path - -import numpy as np - - -ROWS = 32 -COLS = 32 -SEED = 31 - - -def generate(output_dir: Path, seed: int) -> None: - rng = np.random.default_rng(seed) - v1 = rng.uniform(-4.0, 4.0, size=(ROWS, COLS)).astype(np.float16) - v2 = rng.uniform(-2.0, 2.0, size=(ROWS, COLS)).astype(np.float16) - v3 = np.zeros((ROWS, COLS), dtype=np.float32) - - flat1 = v1.reshape(-1) - flat2 = v2.reshape(-1) - golden = np.empty((ROWS * COLS,), dtype=np.float32) - for base in range(0, ROWS * COLS, 128): - chunk1 = flat1[base : base + 128].astype(np.float32) - chunk2 = flat2[base : base + 128].astype(np.float32) - golden[base : base + 64] = np.exp(chunk1[0::2] - chunk2[0::2]).astype( - np.float32 - ) - golden[base + 64 : base + 128] = np.exp( - chunk1[1::2] - chunk2[1::2] - ).astype(np.float32) - - output_dir.mkdir(parents=True, exist_ok=True) - flat1.tofile(output_dir / "v1.bin") - flat2.tofile(output_dir / "v2.bin") - v3.reshape(-1).tofile(output_dir / "v3.bin") - golden.tofile(output_dir / "golden_v3.bin") - - -def main() -> None: - parser = argparse.ArgumentParser() - parser.add_argument("--output-dir", type=Path, default=Path(".")) - parser.add_argument("--seed", type=int, default=SEED) - args = parser.parse_args() - 
generate(args.output_dir, args.seed) - - -if __name__ == "__main__": - main() diff --git a/test/vpto/cases/micro-op/dsa-sfu/vexpdiff-f16-part/kernel.pto b/test/vpto/cases/micro-op/dsa-sfu/vexpdiff-f16-part/kernel.pto deleted file mode 100644 index 363956318..000000000 --- a/test/vpto/cases/micro-op/dsa-sfu/vexpdiff-f16-part/kernel.pto +++ /dev/null @@ -1,64 +0,0 @@ -// ----------------------------------------------------------------------------- -// case: micro-op/dsa-sfu/vexpdif-f16-part -// family: dsa-sfu -// target_ops: pto.vexpdif -// scenarios: core-f16, fused-expdiff, part-even-odd -// NOTE: validates that ODD/EVEN selects odd/even lanes from f16 inputs. -// ----------------------------------------------------------------------------- -module attributes {pto.target_arch = "a5", pto.kernel_kind = #pto.kernel_kind} { - func.func @vexpdif_f16_part_kernel_2d(%arg0: !pto.ptr, - %arg1: !pto.ptr, - %arg2: !pto.ptr) attributes {pto.kernel} { - %c0 = arith.constant 0 : index - %c64 = arith.constant 64 : index - %c128 = arith.constant 128 : index - %c1024 = arith.constant 1024 : index - %c0_i64 = arith.constant 0 : i64 - %c1_i64 = arith.constant 1 : i64 - %c32_i64 = arith.constant 32 : i64 - %c64_i64 = arith.constant 64 : i64 - %c128_i64 = arith.constant 128 : i64 - %c2048_i64 = arith.constant 2048 : i64 - %c4096_i64 = arith.constant 4096 : i64 - %c1024_i32 = arith.constant 1024 : i32 - - %ub_in = pto.castptr %c0_i64 : i64 -> !pto.ptr - %ub_max = pto.castptr %c2048_i64 : i64 -> !pto.ptr - %ub_out = pto.castptr %c4096_i64 : i64 -> !pto.ptr - - %false = arith.constant false - pto.mte_gm_ub %arg0, %ub_in, %c0_i64, %c64_i64 - nburst(%c32_i64, %c64_i64, %c64_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 - pto.mte_gm_ub %arg1, %ub_max, %c0_i64, %c64_i64 - nburst(%c32_i64, %c64_i64, %c64_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 - - pto.set_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] - pto.wait_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] - - pto.vecscope { - 
%full_mask = pto.pset_b16 "PAT_ALL" : !pto.mask - %_:1 = scf.for %offset = %c0 to %c1024 step %c128 iter_args(%remaining = %c1024_i32) -> (i32) { - %input = pto.vlds %ub_in[%offset] : !pto.ptr -> !pto.vreg<128xf16> - %max = pto.vlds %ub_max[%offset] : !pto.ptr -> !pto.vreg<128xf16> - %even_mask, %remaining_after_even = pto.plt_b32 %remaining : i32 -> !pto.mask, i32 - %odd_mask, %next_remaining = pto.plt_b32 %remaining_after_even : i32 -> !pto.mask, i32 - %even = pto.vexpdif %input, %max, %full_mask, "EVEN" : !pto.vreg<128xf16>, !pto.vreg<128xf16>, !pto.mask -> !pto.vreg<64xf32> - %odd = pto.vexpdif %input, %max, %full_mask, "ODD" : !pto.vreg<128xf16>, !pto.vreg<128xf16>, !pto.mask -> !pto.vreg<64xf32> - %odd_offset = arith.addi %offset, %c64 : index - pto.vsts %even, %ub_out[%offset], %even_mask : !pto.vreg<64xf32>, !pto.ptr, !pto.mask - pto.vsts %odd, %ub_out[%odd_offset], %odd_mask : !pto.vreg<64xf32>, !pto.ptr, !pto.mask - scf.yield %next_remaining : i32 - } - } - - pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.mte_ub_gm %ub_out, %arg2, %c128_i64 - nburst(%c32_i64, %c128_i64, %c128_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64 - pto.barrier #pto.pipe - return - } -} diff --git a/test/vpto/cases/micro-op/dsa-sfu/vexpdiff-f16-part/launch.cpp b/test/vpto/cases/micro-op/dsa-sfu/vexpdiff-f16-part/launch.cpp deleted file mode 100644 index 78f8bef63..000000000 --- a/test/vpto/cases/micro-op/dsa-sfu/vexpdiff-f16-part/launch.cpp +++ /dev/null @@ -1,53 +0,0 @@ -// Copyright (c) 2026 Huawei Technologies Co., Ltd. -// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). -// Please refer to the License for details. You may not use this file except in compliance with the License. 
-// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -// See LICENSE in the root of the software repository for the full text of the License. - -// ----------------------------------------------------------------------------- -// case: micro-op/dsa-sfu/vexpdif-f16-part -// family: dsa-sfu -// target_ops: pto.vexpdif -// scenarios: core-f16, fused-expdiff, part-even-odd -// ----------------------------------------------------------------------------- -#ifndef __VEC_SCOPE__ -#define __VEC_SCOPE__ -#endif - -#if defined(__CCE_AICORE__) && defined(__NPU_ARCH__) && (__NPU_ARCH__ == 2201) -typedef struct { unsigned char v; } hifloat8_t; -typedef struct { unsigned char v; } float8_e4m3_t; -typedef struct { unsigned char v; } float8_e5m2_t; -typedef struct { unsigned char v; } float8_e8m0_t; -typedef struct { unsigned char v; } float4_e1m2x2_t; -typedef struct { unsigned char v; } float4_e2m1x2_t; -#endif -#include -#if defined(__CCE_AICORE__) && defined(PTOAS_ENABLE_CCE_PRINT) -#include -#endif - -#if !defined(__CCE_AICORE__) && !defined(TMRGSORT_HPP) -struct MrgSortExecutedNumList { - uint16_t mrgSortList0; - uint16_t mrgSortList1; - uint16_t mrgSortList2; - uint16_t mrgSortList3; -}; -#endif -#ifndef __CPU_SIM -#include "acl/acl.h" -#endif - -extern "C" __global__ [aicore] void vexpdif_f16_part_kernel_2d(__gm__ half *v1, - __gm__ half *v2, - __gm__ float *v3); - -void LaunchVexpdiff_f16_part_kernel_2d(uint16_t *v1, uint16_t *v2, float *v3, - void *stream) { - vexpdif_f16_part_kernel_2d<<<1, nullptr, stream>>>((__gm__ half *)v1, - (__gm__ half *)v2, - (__gm__ float *)v3); -} diff --git a/test/vpto/cases/micro-op/dsa-sfu/vexpdiff-f16-part/main.cpp b/test/vpto/cases/micro-op/dsa-sfu/vexpdiff-f16-part/main.cpp deleted file mode 100644 index 58b1f6c5d..000000000 --- 
a/test/vpto/cases/micro-op/dsa-sfu/vexpdiff-f16-part/main.cpp +++ /dev/null @@ -1,101 +0,0 @@ -// Copyright (c) 2026 Huawei Technologies Co., Ltd. -// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). -// Please refer to the License for details. You may not use this file except in compliance with the License. -// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -// See LICENSE in the root of the software repository for the full text of the License. - -// ----------------------------------------------------------------------------- -// case: micro-op/dsa-sfu/vexpdif-f16-part -// family: dsa-sfu -// target_ops: pto.vexpdif -// scenarios: core-f16, fused-expdiff, part-even-odd -// ----------------------------------------------------------------------------- -/** -Copyright (c) 2025 Huawei Technologies Co., Ltd. 
-*/ - -#include "test_common.h" -#include "acl/acl.h" -#include -#include - -using namespace PtoTestCommon; - -#define ACL_CHECK(expr) \ - do { \ - const aclError _ret = (expr); \ - if (_ret != ACL_SUCCESS) { \ - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", #expr, \ - (int)_ret, __FILE__, __LINE__); \ - rc = 1; \ - goto cleanup; \ - } \ - } while (0) - -void LaunchVexpdiff_f16_part_kernel_2d(uint16_t *v1, uint16_t *v2, float *v3, - void *stream); - -int main() { - size_t elemCount_v1 = 1024; - size_t fileSize_v1 = elemCount_v1 * sizeof(uint16_t); - size_t elemCount_v2 = 1024; - size_t fileSize_v2 = elemCount_v2 * sizeof(uint16_t); - size_t elemCount_v3 = 1024; - size_t fileSize_v3 = elemCount_v3 * sizeof(float); - uint16_t *v1Host = nullptr; - uint16_t *v1Device = nullptr; - uint16_t *v2Host = nullptr; - uint16_t *v2Device = nullptr; - float *v3Host = nullptr; - float *v3Device = nullptr; - int rc = 0; - bool aclInited = false; - bool deviceSet = false; - int deviceId = 0; - aclrtStream stream = nullptr; - - ACL_CHECK(aclInit(nullptr)); - aclInited = true; - if (const char *envDevice = std::getenv("ACL_DEVICE_ID")) - deviceId = std::atoi(envDevice); - ACL_CHECK(aclrtSetDevice(deviceId)); - deviceSet = true; - ACL_CHECK(aclrtCreateStream(&stream)); - ACL_CHECK(aclrtMallocHost((void **)(&v1Host), fileSize_v1)); - ACL_CHECK(aclrtMallocHost((void **)(&v2Host), fileSize_v2)); - ACL_CHECK(aclrtMallocHost((void **)(&v3Host), fileSize_v3)); - ACL_CHECK(aclrtMalloc((void **)&v1Device, fileSize_v1, ACL_MEM_MALLOC_HUGE_FIRST)); - ACL_CHECK(aclrtMalloc((void **)&v2Device, fileSize_v2, ACL_MEM_MALLOC_HUGE_FIRST)); - ACL_CHECK(aclrtMalloc((void **)&v3Device, fileSize_v3, ACL_MEM_MALLOC_HUGE_FIRST)); - ReadFile("./v1.bin", fileSize_v1, v1Host, fileSize_v1); - ReadFile("./v2.bin", fileSize_v2, v2Host, fileSize_v2); - ReadFile("./v3.bin", fileSize_v3, v3Host, fileSize_v3); - ACL_CHECK(aclrtMemcpy(v1Device, fileSize_v1, v1Host, fileSize_v1, - ACL_MEMCPY_HOST_TO_DEVICE)); 
- ACL_CHECK(aclrtMemcpy(v2Device, fileSize_v2, v2Host, fileSize_v2, - ACL_MEMCPY_HOST_TO_DEVICE)); - ACL_CHECK(aclrtMemcpy(v3Device, fileSize_v3, v3Host, fileSize_v3, - ACL_MEMCPY_HOST_TO_DEVICE)); - LaunchVexpdiff_f16_part_kernel_2d(v1Device, v2Device, v3Device, stream); - ACL_CHECK(aclrtSynchronizeStream(stream)); - ACL_CHECK(aclrtMemcpy(v3Host, fileSize_v3, v3Device, fileSize_v3, - ACL_MEMCPY_DEVICE_TO_HOST)); - WriteFile("./v3.bin", v3Host, fileSize_v3); - -cleanup: - aclrtFree(v1Device); - aclrtFree(v2Device); - aclrtFree(v3Device); - aclrtFreeHost(v1Host); - aclrtFreeHost(v2Host); - aclrtFreeHost(v3Host); - if (stream != nullptr) - aclrtDestroyStream(stream); - if (deviceSet) - aclrtResetDevice(deviceId); - if (aclInited) - aclFinalize(); - return rc; -} diff --git a/test/vpto/cases/micro-op/dsa-sfu/vexpdiff-f32/kernel.pto b/test/vpto/cases/micro-op/dsa-sfu/vexpdiff-f32/kernel.pto index d3913956c..076ada963 100644 --- a/test/vpto/cases/micro-op/dsa-sfu/vexpdiff-f32/kernel.pto +++ b/test/vpto/cases/micro-op/dsa-sfu/vexpdiff-f32/kernel.pto @@ -1,51 +1,147 @@ -// ----------------------------------------------------------------------------- -// case: micro-op/dsa-sfu/vexpdif-f32 -// family: dsa-sfu -// target_ops: pto.vexpdif -// scenarios: core-f32, fused-expdiff -// NOTE: bulk-generated coverage skeleton. Parser/verifier/lowering failure is -// still a valid test conclusion in the current coverage-first phase. 
-// ----------------------------------------------------------------------------- module attributes {pto.target_arch = "a5", pto.kernel_kind = #pto.kernel_kind} { - func.func @vexpdif_kernel_2d(%arg0: !pto.ptr, - %arg1: !pto.ptr) attributes {pto.kernel} { - %c0 = arith.constant 0 : index - %c64 = arith.constant 64 : index - %c1024 = arith.constant 1024 : index - %c0_i64 = arith.constant 0 : i64 - %c1_i64 = arith.constant 1 : i64 - %c32_i64 = arith.constant 32 : i64 - %c128_i64 = arith.constant 128 : i64 - %c4096_i64 = arith.constant 4096 : i64 - %c1024_i32 = arith.constant 1024 : i32 - - %ub_in = pto.castptr %c0_i64 : i64 -> !pto.ptr - %ub_out = pto.castptr %c4096_i64 : i64 -> !pto.ptr - - %false = arith.constant false - pto.mte_gm_ub %arg0, %ub_in, %c0_i64, %c128_i64 - nburst(%c32_i64, %c128_i64, %c128_i64) + func.func @vexpdiff_f32_deep_merged_kernel(%arg0: !pto.ptr, %arg1: !pto.ptr, %arg2: !pto.ptr, %arg3: !pto.ptr, %arg4: !pto.ptr, %arg5: !pto.ptr, %arg6: !pto.ptr, %arg7: !pto.ptr) attributes {pto.kernel} { + %__deep_merge_guard = arith.constant false + // inactive merged from vexpdif_f16_part_kernel_2d + scf.if %__deep_merge_guard { + + %c0_m0 = arith.constant 0 : index + %c64_m0 = arith.constant 64 : index + %c128_m0 = arith.constant 128 : index + %c1024_m0 = arith.constant 1024 : index + %c0_i64_m0 = arith.constant 0 : i64 + %c1_i64_m0 = arith.constant 1 : i64 + %c32_i64_m0 = arith.constant 32 : i64 + %c64_i64_m0 = arith.constant 64 : i64 + %c128_i64_m0 = arith.constant 128 : i64 + %c2048_i64_m0 = arith.constant 2048 : i64 + %c4096_i64_m0 = arith.constant 4096 : i64 + %c1024_i32_m0 = arith.constant 1024 : i32 + + %ub_in_m0 = pto.castptr %c0_i64_m0 : i64 -> !pto.ptr + %ub_max_m0 = pto.castptr %c2048_i64_m0 : i64 -> !pto.ptr + %ub_out_m0 = pto.castptr %c4096_i64_m0 : i64 -> !pto.ptr + + %false_m0 = arith.constant false + pto.mte_gm_ub %arg0, %ub_in_m0, %c0_i64_m0, %c64_i64_m0 + nburst(%c32_i64_m0, %c64_i64_m0, %c64_i64_m0) + : !pto.ptr, !pto.ptr, i64, i64, 
i64, i64, i64 + pto.mte_gm_ub %arg1, %ub_max_m0, %c0_i64_m0, %c64_i64_m0 + nburst(%c32_i64_m0, %c64_i64_m0, %c64_i64_m0) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 + + pto.set_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] + pto.wait_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] + + pto.vecscope { + %full_mask_m0 = pto.pset_b16 "PAT_ALL" : !pto.mask + %__m0:1 = scf.for %offset_m0 = %c0_m0 to %c1024_m0 step %c128_m0 iter_args(%remaining_m0 = %c1024_i32_m0) -> (i32) { + %input_m0 = pto.vlds %ub_in_m0[%offset_m0] : !pto.ptr -> !pto.vreg<128xf16> + %max_m0 = pto.vlds %ub_max_m0[%offset_m0] : !pto.ptr -> !pto.vreg<128xf16> + %even_mask_m0, %remaining_after_even_m0 = pto.plt_b32 %remaining_m0 : i32 -> !pto.mask, i32 + %odd_mask_m0, %next_remaining_m0 = pto.plt_b32 %remaining_after_even_m0 : i32 -> !pto.mask, i32 + %even_m0 = pto.vexpdif %input_m0, %max_m0, %full_mask_m0, "EVEN" : !pto.vreg<128xf16>, !pto.vreg<128xf16>, !pto.mask -> !pto.vreg<64xf32> + %odd_m0 = pto.vexpdif %input_m0, %max_m0, %full_mask_m0, "ODD" : !pto.vreg<128xf16>, !pto.vreg<128xf16>, !pto.mask -> !pto.vreg<64xf32> + %odd_offset_m0 = arith.addi %offset_m0, %c64_m0 : index + pto.vsts %even_m0, %ub_out_m0[%offset_m0], %even_mask_m0 : !pto.vreg<64xf32>, !pto.ptr, !pto.mask + pto.vsts %odd_m0, %ub_out_m0[%odd_offset_m0], %odd_mask_m0 : !pto.vreg<64xf32>, !pto.ptr, !pto.mask + scf.yield %next_remaining_m0 : i32 + } + } + + pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] + pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] + pto.mte_ub_gm %ub_out_m0, %arg2, %c128_i64_m0 + nburst(%c32_i64_m0, %c128_i64_m0, %c128_i64_m0) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64 + pto.barrier #pto.pipe + + } + // inactive merged from vexpdif_boundary_kernel_2d + scf.if %__deep_merge_guard { + + %c0_m1 = arith.constant 0 : index + %c64_m1 = arith.constant 64 : index + %c1024_m1 = arith.constant 1024 : index + %c0_i64_m1 = arith.constant 0 : i64 + %c1_i64_m1 = arith.constant 1 : i64 + %c32_i64_m1 = arith.constant 32 : i64 + 
%c128_i64_m1 = arith.constant 128 : i64 + %c4096_i64_m1 = arith.constant 4096 : i64 + %c8192_i64_m1 = arith.constant 8192 : i64 + + %ub_in_m1 = pto.castptr %c0_i64_m1 : i64 -> !pto.ptr + %ub_max_m1 = pto.castptr %c4096_i64_m1 : i64 -> !pto.ptr + %ub_out_m1 = pto.castptr %c8192_i64_m1 : i64 -> !pto.ptr + + %false_m1 = arith.constant false + pto.mte_gm_ub %arg3, %ub_in_m1, %c0_i64_m1, %c128_i64_m1 + nburst(%c32_i64_m1, %c128_i64_m1, %c128_i64_m1) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 + pto.mte_gm_ub %arg4, %ub_max_m1, %c0_i64_m1, %c128_i64_m1 + nburst(%c32_i64_m1, %c128_i64_m1, %c128_i64_m1) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 + + pto.set_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] + pto.wait_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] + + pto.vecscope { + %mask_m1 = pto.pset_b32 "PAT_ALL" : !pto.mask + scf.for %offset_m1 = %c0_m1 to %c1024_m1 step %c64_m1 { + %vec_m1 = pto.vlds %ub_in_m1[%offset_m1] : !pto.ptr -> !pto.vreg<64xf32> + %max_m1 = pto.vlds %ub_max_m1[%offset_m1] : !pto.ptr -> !pto.vreg<64xf32> + %sum_m1 = pto.vexpdif %vec_m1, %max_m1, %mask_m1, "ODD" : !pto.vreg<64xf32>, !pto.vreg<64xf32>, !pto.mask -> !pto.vreg<64xf32> + pto.vsts %sum_m1, %ub_out_m1[%offset_m1], %mask_m1 : !pto.vreg<64xf32>, !pto.ptr, !pto.mask + } + } + + pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] + pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] + pto.mte_ub_gm %ub_out_m1, %arg5, %c128_i64_m1 + nburst(%c32_i64_m1, %c128_i64_m1, %c128_i64_m1) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64 + pto.barrier #pto.pipe + + } + // active merged from vexpdif_kernel_2d + + %c0_m2 = arith.constant 0 : index + %c64_m2 = arith.constant 64 : index + %c1024_m2 = arith.constant 1024 : index + %c0_i64_m2 = arith.constant 0 : i64 + %c1_i64_m2 = arith.constant 1 : i64 + %c32_i64_m2 = arith.constant 32 : i64 + %c128_i64_m2 = arith.constant 128 : i64 + %c4096_i64_m2 = arith.constant 4096 : i64 + %c1024_i32_m2 = arith.constant 1024 : i32 + + %ub_in_m2 = pto.castptr %c0_i64_m2 : i64 -> 
!pto.ptr + %ub_out_m2 = pto.castptr %c4096_i64_m2 : i64 -> !pto.ptr + + %false_m2 = arith.constant false + pto.mte_gm_ub %arg6, %ub_in_m2, %c0_i64_m2, %c128_i64_m2 + nburst(%c32_i64_m2, %c128_i64_m2, %c128_i64_m2) : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 pto.set_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] pto.wait_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] pto.vecscope { - %_:1 = scf.for %offset = %c0 to %c1024 step %c64 iter_args(%remaining = %c1024_i32) -> (i32) { - %mask, %next_remaining = pto.plt_b32 %remaining : i32 -> !pto.mask, i32 - %vec = pto.vlds %ub_in[%offset] : !pto.ptr -> !pto.vreg<64xf32> - %sum = pto.vexpdif %vec, %vec, %mask, "ODD" : !pto.vreg<64xf32>, !pto.vreg<64xf32>, !pto.mask -> !pto.vreg<64xf32> - pto.vsts %sum, %ub_out[%offset], %mask : !pto.vreg<64xf32>, !pto.ptr, !pto.mask - scf.yield %next_remaining : i32 + %__m2:1 = scf.for %offset_m2 = %c0_m2 to %c1024_m2 step %c64_m2 iter_args(%remaining_m2 = %c1024_i32_m2) -> (i32) { + %mask_m2, %next_remaining_m2 = pto.plt_b32 %remaining_m2 : i32 -> !pto.mask, i32 + %vec_m2 = pto.vlds %ub_in_m2[%offset_m2] : !pto.ptr -> !pto.vreg<64xf32> + %sum_m2 = pto.vexpdif %vec_m2, %vec_m2, %mask_m2, "ODD" : !pto.vreg<64xf32>, !pto.vreg<64xf32>, !pto.mask -> !pto.vreg<64xf32> + pto.vsts %sum_m2, %ub_out_m2[%offset_m2], %mask_m2 : !pto.vreg<64xf32>, !pto.ptr, !pto.mask + scf.yield %next_remaining_m2 : i32 } } pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.mte_ub_gm %ub_out, %arg1, %c128_i64 - nburst(%c32_i64, %c128_i64, %c128_i64) + pto.mte_ub_gm %ub_out_m2, %arg7, %c128_i64_m2 + nburst(%c32_i64_m2, %c128_i64_m2, %c128_i64_m2) : !pto.ptr, !pto.ptr, i64, i64, i64, i64 pto.barrier #pto.pipe + return } } diff --git a/test/vpto/cases/micro-op/dsa-sfu/vexpdiff-f32/launch.cpp b/test/vpto/cases/micro-op/dsa-sfu/vexpdiff-f32/launch.cpp index 00ada867d..0a11d5ce6 100644 --- a/test/vpto/cases/micro-op/dsa-sfu/vexpdiff-f32/launch.cpp +++ 
b/test/vpto/cases/micro-op/dsa-sfu/vexpdiff-f32/launch.cpp @@ -5,19 +5,9 @@ // THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, // INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. // See LICENSE in the root of the software repository for the full text of the License. - -// ----------------------------------------------------------------------------- -// case: micro-op/dsa-sfu/vexpdif-f32 -// family: dsa-sfu -// target_ops: pto.vexpdif -// scenarios: core-f32, fused-expdiff -// NOTE: bulk-generated coverage skeleton. Parser/verifier/lowering failure is -// still a valid test conclusion in the current coverage-first phase. -// ----------------------------------------------------------------------------- #ifndef __VEC_SCOPE__ #define __VEC_SCOPE__ #endif - #if defined(__CCE_AICORE__) && defined(__NPU_ARCH__) && (__NPU_ARCH__ == 2201) typedef struct { unsigned char v; } hifloat8_t; typedef struct { unsigned char v; } float8_e4m3_t; @@ -30,23 +20,31 @@ typedef struct { unsigned char v; } float4_e2m1x2_t; #if defined(__CCE_AICORE__) && defined(PTOAS_ENABLE_CCE_PRINT) #include #endif - #if !defined(__CCE_AICORE__) && !defined(TMRGSORT_HPP) -struct MrgSortExecutedNumList { - uint16_t mrgSortList0; - uint16_t mrgSortList1; - uint16_t mrgSortList2; - uint16_t mrgSortList3; -}; +struct MrgSortExecutedNumList { uint16_t mrgSortList0,mrgSortList1,mrgSortList2,mrgSortList3; }; #endif #ifndef __CPU_SIM #include "acl/acl.h" #endif -extern "C" __global__ [aicore] void vexpdif_kernel_2d(__gm__ float *v1, - __gm__ float *v2); +extern "C" __global__ [aicore] void vexpdiff_f32_deep_merged_kernel( + __gm__ half * arg0, + __gm__ half * arg1, + __gm__ float * arg2, + __gm__ float * arg3, + __gm__ float * arg4, + __gm__ float * arg5, + __gm__ float * arg6, + __gm__ float * arg7); -void LaunchVexpdiff_kernel_2d(float *v1, float *v2, void *stream) { - vexpdif_kernel_2d<<<1, nullptr, 
stream>>>((__gm__ float *)v1, - (__gm__ float *)v2); +void LaunchVexpdiffF32DeepMerged(float * p0, float * p1, void *stream) { + vexpdiff_f32_deep_merged_kernel<<<1, nullptr, stream>>>( + (__gm__ half *)p0, + (__gm__ half *)p0, + (__gm__ float *)p0, + (__gm__ float *)p0, + (__gm__ float *)p0, + (__gm__ float *)p0, + (__gm__ float *)p0, + (__gm__ float *)p1); } diff --git a/test/vpto/cases/micro-op/dsa-sfu/vexpdiff-f32/main.cpp b/test/vpto/cases/micro-op/dsa-sfu/vexpdiff-f32/main.cpp index 4afacde3a..5458674b5 100644 --- a/test/vpto/cases/micro-op/dsa-sfu/vexpdiff-f32/main.cpp +++ b/test/vpto/cases/micro-op/dsa-sfu/vexpdiff-f32/main.cpp @@ -36,8 +36,8 @@ using namespace PtoTestCommon; } \ } while (0) -void LaunchVexpdiff_kernel_2d(float *v1, float *v2, void *stream); +void LaunchVexpdiffF32DeepMerged(float * p0, float * p1, void *stream); int main() { size_t elemCount_v1 = 1024; size_t fileSize_v1 = elemCount_v1 * sizeof(float); @@ -70,7 +70,7 @@ int main() { ACL_MEMCPY_HOST_TO_DEVICE)); ACL_CHECK(aclrtMemcpy(v2Device, fileSize_v2, v2Host, fileSize_v2, ACL_MEMCPY_HOST_TO_DEVICE)); - LaunchVexpdiff_kernel_2d(v1Device, v2Device, stream); + LaunchVexpdiffF32DeepMerged(v1Device, v2Device, stream); ACL_CHECK(aclrtSynchronizeStream(stream)); ACL_CHECK(aclrtMemcpy(v2Host, fileSize_v2, v2Device, fileSize_v2, ACL_MEMCPY_DEVICE_TO_HOST)); diff --git a/test/vpto/cases/micro-op/dsa-sfu/vlrelu-f16/compare.py b/test/vpto/cases/micro-op/dsa-sfu/vlrelu-f16/compare.py deleted file mode 100755 index 4717fd3e8..000000000 --- a/test/vpto/cases/micro-op/dsa-sfu/vlrelu-f16/compare.py +++ /dev/null @@ -1,42 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) 2026 Huawei Technologies Co., Ltd. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. 
-# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. - -# case: micro-op/dsa-sfu/vlrelu-f16 -# family: dsa-sfu -# target_ops: pto.vlrelu -# scenarios: core-f16, full-mask, scalar-operand -# NOTE: bulk-generated coverage skeleton. - -import os -import sys -import numpy as np - - -def compare_bin(golden_path, output_path, dtype, eps): - if not os.path.exists(golden_path) or not os.path.exists(output_path): - return False - golden = np.fromfile(golden_path, dtype=dtype) - output = np.fromfile(output_path, dtype=dtype) - return golden.shape == output.shape and np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True) - - -def main(): - strict = os.getenv("COMPARE_STRICT", "1") != "0" - ok = compare_bin("golden_v2.bin", "v2.bin", np.float32, 1e-4) - if not ok: - if strict: - print("[ERROR] compare failed") - sys.exit(2) - print("[WARN] compare failed (non-gating)") - return - print("[INFO] compare passed") - - -if __name__ == "__main__": - main() diff --git a/test/vpto/cases/micro-op/dsa-sfu/vlrelu-f16/golden.py b/test/vpto/cases/micro-op/dsa-sfu/vlrelu-f16/golden.py deleted file mode 100755 index bc7c328b9..000000000 --- a/test/vpto/cases/micro-op/dsa-sfu/vlrelu-f16/golden.py +++ /dev/null @@ -1,50 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) 2026 Huawei Technologies Co., Ltd. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. 
-# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. - -# case: micro-op/dsa-sfu/vlrelu-f16 -# family: dsa-sfu -# target_ops: pto.vlrelu -# scenarios: core-f16, full-mask, scalar-operand -# NOTE: bulk-generated coverage skeleton. -# coding=utf-8 - -import argparse -from pathlib import Path - -import numpy as np - - -ROWS = 32 -COLS = 32 -SEED = 19 -ALPHA = np.float16(0.125) - - -def generate(output_dir: Path, seed: int) -> None: - rng = np.random.default_rng(seed) - v1 = rng.uniform(-8.0, 8.0, size=(ROWS, COLS)).astype(np.float16) - v2 = np.zeros((ROWS, COLS), dtype=np.float16) - golden_v2 = np.where(v1 >= 0.0, v1, v1 * ALPHA).astype(np.float16, copy=False) - - output_dir.mkdir(parents=True, exist_ok=True) - v1.reshape(-1).tofile(output_dir / "v1.bin") - v2.reshape(-1).tofile(output_dir / "v2.bin") - golden_v2.reshape(-1).tofile(output_dir / "golden_v2.bin") - - -def main() -> None: - parser = argparse.ArgumentParser() - parser.add_argument("--output-dir", type=Path, default=Path(".")) - parser.add_argument("--seed", type=int, default=SEED) - args = parser.parse_args() - generate(args.output_dir, args.seed) - - -if __name__ == "__main__": - main() diff --git a/test/vpto/cases/micro-op/dsa-sfu/vlrelu-f16/kernel.pto b/test/vpto/cases/micro-op/dsa-sfu/vlrelu-f16/kernel.pto deleted file mode 100644 index 249f947b0..000000000 --- a/test/vpto/cases/micro-op/dsa-sfu/vlrelu-f16/kernel.pto +++ /dev/null @@ -1,51 +0,0 @@ -// ----------------------------------------------------------------------------- -// case: micro-op/dsa-sfu/vlrelu-f16 -// family: dsa-sfu -// target_ops: pto.vlrelu -// scenarios: core-f16, full-mask, scalar-operand -// NOTE: bulk-generated coverage skeleton. 
Parser/verifier/lowering failure is -// still a valid test conclusion in the current coverage-first phase. -// ----------------------------------------------------------------------------- -module attributes {pto.target_arch = "a5", pto.kernel_kind = #pto.kernel_kind} { - func.func @vec_add_scalar_kernel_2d(%arg0: !pto.ptr, - %arg1: !pto.ptr) attributes {pto.kernel} { - %c0 = arith.constant 0 : index - %c1 = arith.constant 1 : index - %c128 = arith.constant 128 : index - %c1024 = arith.constant 1024 : index - %c0_i64 = arith.constant 0 : i64 - %c1_i64 = arith.constant 1 : i64 - %c32_i64 = arith.constant 32 : i64 - %c64_i64 = arith.constant 64 : i64 - %c2048_i64 = arith.constant 2048 : i64 - %cst = arith.constant 1.250000e-01 : f16 - - %ub_in = pto.castptr %c0_i64 : i64 -> !pto.ptr - %ub_out = pto.castptr %c2048_i64 : i64 -> !pto.ptr - - %false = arith.constant false - pto.mte_gm_ub %arg0, %ub_in, %c0_i64, %c64_i64 - nburst(%c32_i64, %c64_i64, %c64_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 - - pto.set_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] - pto.wait_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] - - pto.vecscope { - %mask = pto.pset_b16 "PAT_ALL" : !pto.mask - scf.for %offset = %c0 to %c1024 step %c128 { - %vec = pto.vlds %ub_in[%offset] : !pto.ptr -> !pto.vreg<128xf16> - %sum = pto.vlrelu %vec, %cst, %mask : !pto.vreg<128xf16>, f16, !pto.mask -> !pto.vreg<128xf16> - pto.vsts %sum, %ub_out[%offset], %mask : !pto.vreg<128xf16>, !pto.ptr, !pto.mask - } - } - - pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.mte_ub_gm %ub_out, %arg1, %c64_i64 - nburst(%c32_i64, %c64_i64, %c64_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64 - pto.barrier #pto.pipe - return - } -} diff --git a/test/vpto/cases/micro-op/dsa-sfu/vlrelu-f16/launch.cpp b/test/vpto/cases/micro-op/dsa-sfu/vlrelu-f16/launch.cpp deleted file mode 100644 index da89bb6f0..000000000 --- a/test/vpto/cases/micro-op/dsa-sfu/vlrelu-f16/launch.cpp +++ 
/dev/null @@ -1,52 +0,0 @@ -// Copyright (c) 2026 Huawei Technologies Co., Ltd. -// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). -// Please refer to the License for details. You may not use this file except in compliance with the License. -// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -// See LICENSE in the root of the software repository for the full text of the License. - -// ----------------------------------------------------------------------------- -// case: micro-op/dsa-sfu/vlrelu-f16 -// family: dsa-sfu -// target_ops: pto.vlrelu -// scenarios: core-f16, full-mask, scalar-operand -// NOTE: bulk-generated coverage skeleton. Parser/verifier/lowering failure is -// still a valid test conclusion in the current coverage-first phase. 
-// ----------------------------------------------------------------------------- -#ifndef __VEC_SCOPE__ -#define __VEC_SCOPE__ -#endif - -#if defined(__CCE_AICORE__) && defined(__NPU_ARCH__) && (__NPU_ARCH__ == 2201) -typedef struct { unsigned char v; } hifloat8_t; -typedef struct { unsigned char v; } float8_e4m3_t; -typedef struct { unsigned char v; } float8_e5m2_t; -typedef struct { unsigned char v; } float8_e8m0_t; -typedef struct { unsigned char v; } float4_e1m2x2_t; -typedef struct { unsigned char v; } float4_e2m1x2_t; -#endif -#include -#if defined(__CCE_AICORE__) && defined(PTOAS_ENABLE_CCE_PRINT) -#include -#endif - -#if !defined(__CCE_AICORE__) && !defined(TMRGSORT_HPP) -struct MrgSortExecutedNumList { - uint16_t mrgSortList0; - uint16_t mrgSortList1; - uint16_t mrgSortList2; - uint16_t mrgSortList3; -}; -#endif -#ifndef __CPU_SIM -#include "acl/acl.h" -#endif - -extern "C" __global__ [aicore] void vec_add_scalar_kernel_2d(__gm__ float *v1, - __gm__ float *v2); - -void LaunchVec_add_scalar_kernel_2d(float *v1, float *v2, void *stream) { - vec_add_scalar_kernel_2d<<<1, nullptr, stream>>>((__gm__ float *)v1, - (__gm__ float *)v2); -} diff --git a/test/vpto/cases/micro-op/dsa-sfu/vlrelu-f16/main.cpp b/test/vpto/cases/micro-op/dsa-sfu/vlrelu-f16/main.cpp deleted file mode 100644 index 73e868d99..000000000 --- a/test/vpto/cases/micro-op/dsa-sfu/vlrelu-f16/main.cpp +++ /dev/null @@ -1,91 +0,0 @@ -// Copyright (c) 2026 Huawei Technologies Co., Ltd. -// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). -// Please refer to the License for details. You may not use this file except in compliance with the License. -// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. 
-// See LICENSE in the root of the software repository for the full text of the License. - -// ----------------------------------------------------------------------------- -// case: micro-op/dsa-sfu/vlrelu-f16 -// family: dsa-sfu -// target_ops: pto.vlrelu -// scenarios: core-f16, full-mask, scalar-operand -// NOTE: bulk-generated coverage skeleton. Parser/verifier/lowering failure is -// still a valid test conclusion in the current coverage-first phase. -// ----------------------------------------------------------------------------- -/** -Copyright (c) 2025 Huawei Technologies Co., Ltd. -*/ - -#include "test_common.h" -#include "acl/acl.h" -#include -#include - -using namespace PtoTestCommon; - -#define ACL_CHECK(expr) \ - do { \ - const aclError _ret = (expr); \ - if (_ret != ACL_SUCCESS) { \ - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", #expr, \ - (int)_ret, __FILE__, __LINE__); \ - rc = 1; \ - goto cleanup; \ - } \ - } while (0) - -void LaunchVec_add_scalar_kernel_2d(float *v1, float *v2, void *stream); - -int main() { - size_t elemCount_v1 = 1024; - size_t fileSize_v1 = elemCount_v1 * sizeof(float); - size_t elemCount_v2 = 1024; - size_t fileSize_v2 = elemCount_v2 * sizeof(float); - float *v1Host = nullptr; - float *v1Device = nullptr; - float *v2Host = nullptr; - float *v2Device = nullptr; - int rc = 0; - bool aclInited = false; - bool deviceSet = false; - int deviceId = 0; - aclrtStream stream = nullptr; - - ACL_CHECK(aclInit(nullptr)); - aclInited = true; - if (const char *envDevice = std::getenv("ACL_DEVICE_ID")) - deviceId = std::atoi(envDevice); - ACL_CHECK(aclrtSetDevice(deviceId)); - deviceSet = true; - ACL_CHECK(aclrtCreateStream(&stream)); - ACL_CHECK(aclrtMallocHost((void **)(&v1Host), fileSize_v1)); - ACL_CHECK(aclrtMallocHost((void **)(&v2Host), fileSize_v2)); - ACL_CHECK(aclrtMalloc((void **)&v1Device, fileSize_v1, ACL_MEM_MALLOC_HUGE_FIRST)); - ACL_CHECK(aclrtMalloc((void **)&v2Device, fileSize_v2, ACL_MEM_MALLOC_HUGE_FIRST)); - 
ReadFile("./v1.bin", fileSize_v1, v1Host, fileSize_v1); - ReadFile("./v2.bin", fileSize_v2, v2Host, fileSize_v2); - ACL_CHECK(aclrtMemcpy(v1Device, fileSize_v1, v1Host, fileSize_v1, - ACL_MEMCPY_HOST_TO_DEVICE)); - ACL_CHECK(aclrtMemcpy(v2Device, fileSize_v2, v2Host, fileSize_v2, - ACL_MEMCPY_HOST_TO_DEVICE)); - LaunchVec_add_scalar_kernel_2d(v1Device, v2Device, stream); - ACL_CHECK(aclrtSynchronizeStream(stream)); - ACL_CHECK(aclrtMemcpy(v2Host, fileSize_v2, v2Device, fileSize_v2, - ACL_MEMCPY_DEVICE_TO_HOST)); - WriteFile("./v2.bin", v2Host, fileSize_v2); - -cleanup: - aclrtFree(v1Device); - aclrtFree(v2Device); - aclrtFreeHost(v1Host); - aclrtFreeHost(v2Host); - if (stream != nullptr) - aclrtDestroyStream(stream); - if (deviceSet) - aclrtResetDevice(deviceId); - if (aclInited) - aclFinalize(); - return rc; -} diff --git a/test/vpto/cases/micro-op/dsa-sfu/vlrelu-f32-exceptional/compare.py b/test/vpto/cases/micro-op/dsa-sfu/vlrelu-f32-exceptional/compare.py deleted file mode 100644 index 15b793fac..000000000 --- a/test/vpto/cases/micro-op/dsa-sfu/vlrelu-f32-exceptional/compare.py +++ /dev/null @@ -1,37 +0,0 @@ -#!/usr/bin/python3 -# Copyright (c) 2026 Huawei Technologies Co., Ltd. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. 
- - -import os -import sys -import numpy as np - - -def compare_bin(golden_path, output_path, dtype, eps): - if not os.path.exists(golden_path) or not os.path.exists(output_path): - return False - golden = np.fromfile(golden_path, dtype=dtype) - output = np.fromfile(output_path, dtype=dtype) - return golden.shape == output.shape and np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True) - - -def main(): - strict = os.getenv("COMPARE_STRICT", "1") != "0" - ok = compare_bin("golden_v2.bin", "v2.bin", np.float32, 1e-4) - if not ok: - if strict: - print("[ERROR] compare failed") - sys.exit(2) - print("[WARN] compare failed (non-gating)") - return - print("[INFO] compare passed") - - -if __name__ == "__main__": - main() diff --git a/test/vpto/cases/micro-op/dsa-sfu/vlrelu-f32-exceptional/golden.py b/test/vpto/cases/micro-op/dsa-sfu/vlrelu-f32-exceptional/golden.py deleted file mode 100644 index 938b69b9d..000000000 --- a/test/vpto/cases/micro-op/dsa-sfu/vlrelu-f32-exceptional/golden.py +++ /dev/null @@ -1,49 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) 2026 Huawei Technologies Co., Ltd. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. 
- -# coding=utf-8 - -import argparse -from pathlib import Path - -import numpy as np - - -ROWS = 32 -COLS = 32 -SEED = 19 -ALPHA = np.float32(0.125) - - -def generate(output_dir: Path, seed: int) -> None: - del seed - specials = np.array( - [-np.inf, -8.0, -1.0, -0.0, 0.0, 1.0, np.inf, np.nan], - dtype=np.float32, - ) - v1 = np.resize(specials, ROWS * COLS).reshape(ROWS, COLS).astype(np.float32) - v2 = np.zeros((ROWS, COLS), dtype=np.float32) - golden_v2 = np.where(v1 >= 0.0, v1, v1 * ALPHA).astype(np.float32, copy=False) - - output_dir.mkdir(parents=True, exist_ok=True) - v1.reshape(-1).tofile(output_dir / "v1.bin") - v2.reshape(-1).tofile(output_dir / "v2.bin") - golden_v2.reshape(-1).tofile(output_dir / "golden_v2.bin") - - -def main() -> None: - parser = argparse.ArgumentParser() - parser.add_argument("--output-dir", type=Path, default=Path(".")) - parser.add_argument("--seed", type=int, default=SEED) - args = parser.parse_args() - generate(args.output_dir, args.seed) - - -if __name__ == "__main__": - main() diff --git a/test/vpto/cases/micro-op/dsa-sfu/vlrelu-f32-exceptional/kernel.pto b/test/vpto/cases/micro-op/dsa-sfu/vlrelu-f32-exceptional/kernel.pto deleted file mode 100644 index 95005cf5a..000000000 --- a/test/vpto/cases/micro-op/dsa-sfu/vlrelu-f32-exceptional/kernel.pto +++ /dev/null @@ -1,45 +0,0 @@ -module attributes {pto.target_arch = "a5", pto.kernel_kind = #pto.kernel_kind} { - func.func @vec_add_scalar_kernel_2d(%arg0: !pto.ptr, - %arg1: !pto.ptr) attributes {pto.kernel} { - %c0 = arith.constant 0 : index - %c1 = arith.constant 1 : index - %c64 = arith.constant 64 : index - %c1024 = arith.constant 1024 : index - %c0_i64 = arith.constant 0 : i64 - %c1_i64 = arith.constant 1 : i64 - %c32_i64 = arith.constant 32 : i64 - %c128_i64 = arith.constant 128 : i64 - %c4096_i64 = arith.constant 4096 : i64 - %c1024_i32 = arith.constant 1024 : i32 - %cst = arith.constant 1.250000e-01 : f32 - - %ub_in = pto.castptr %c0_i64 : i64 -> !pto.ptr - %ub_out = 
pto.castptr %c4096_i64 : i64 -> !pto.ptr - - %false = arith.constant false - pto.mte_gm_ub %arg0, %ub_in, %c0_i64, %c128_i64 - nburst(%c32_i64, %c128_i64, %c128_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 - - pto.set_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] - pto.wait_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] - - pto.vecscope { - %_:1 = scf.for %offset = %c0 to %c1024 step %c64 iter_args(%remaining = %c1024_i32) -> (i32) { - %mask, %next_remaining = pto.plt_b32 %remaining : i32 -> !pto.mask, i32 - %vec = pto.vlds %ub_in[%offset] : !pto.ptr -> !pto.vreg<64xf32> - %sum = pto.vlrelu %vec, %cst, %mask : !pto.vreg<64xf32>, f32, !pto.mask -> !pto.vreg<64xf32> - pto.vsts %sum, %ub_out[%offset], %mask : !pto.vreg<64xf32>, !pto.ptr, !pto.mask - scf.yield %next_remaining : i32 - } - } - - pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.mte_ub_gm %ub_out, %arg1, %c128_i64 - nburst(%c32_i64, %c128_i64, %c128_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64 - pto.barrier #pto.pipe - return - } -} diff --git a/test/vpto/cases/micro-op/dsa-sfu/vlrelu-f32-exceptional/launch.cpp b/test/vpto/cases/micro-op/dsa-sfu/vlrelu-f32-exceptional/launch.cpp deleted file mode 100644 index 44c07c249..000000000 --- a/test/vpto/cases/micro-op/dsa-sfu/vlrelu-f32-exceptional/launch.cpp +++ /dev/null @@ -1,44 +0,0 @@ -// Copyright (c) 2026 Huawei Technologies Co., Ltd. -// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). -// Please refer to the License for details. You may not use this file except in compliance with the License. -// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. 
-// See LICENSE in the root of the software repository for the full text of the License. - -#ifndef __VEC_SCOPE__ -#define __VEC_SCOPE__ -#endif - -#if defined(__CCE_AICORE__) && defined(__NPU_ARCH__) && (__NPU_ARCH__ == 2201) -typedef struct { unsigned char v; } hifloat8_t; -typedef struct { unsigned char v; } float8_e4m3_t; -typedef struct { unsigned char v; } float8_e5m2_t; -typedef struct { unsigned char v; } float8_e8m0_t; -typedef struct { unsigned char v; } float4_e1m2x2_t; -typedef struct { unsigned char v; } float4_e2m1x2_t; -#endif -#include -#if defined(__CCE_AICORE__) && defined(PTOAS_ENABLE_CCE_PRINT) -#include -#endif - -#if !defined(__CCE_AICORE__) && !defined(TMRGSORT_HPP) -struct MrgSortExecutedNumList { - uint16_t mrgSortList0; - uint16_t mrgSortList1; - uint16_t mrgSortList2; - uint16_t mrgSortList3; -}; -#endif -#ifndef __CPU_SIM -#include "acl/acl.h" -#endif - -extern "C" __global__ [aicore] void vec_add_scalar_kernel_2d(__gm__ float *v1, - __gm__ float *v2); - -void LaunchVec_add_scalar_kernel_2d(float *v1, float *v2, void *stream) { - vec_add_scalar_kernel_2d<<<1, nullptr, stream>>>((__gm__ float *)v1, - (__gm__ float *)v2); -} diff --git a/test/vpto/cases/micro-op/dsa-sfu/vlrelu-f32-exceptional/main.cpp b/test/vpto/cases/micro-op/dsa-sfu/vlrelu-f32-exceptional/main.cpp deleted file mode 100644 index fcb42331f..000000000 --- a/test/vpto/cases/micro-op/dsa-sfu/vlrelu-f32-exceptional/main.cpp +++ /dev/null @@ -1,83 +0,0 @@ -// Copyright (c) 2026 Huawei Technologies Co., Ltd. -// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). -// Please refer to the License for details. You may not use this file except in compliance with the License. 
-// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -// See LICENSE in the root of the software repository for the full text of the License. - -/** -Copyright (c) 2025 Huawei Technologies Co., Ltd. -*/ - -#include "test_common.h" -#include "acl/acl.h" -#include -#include - -using namespace PtoTestCommon; - -#define ACL_CHECK(expr) \ - do { \ - const aclError _ret = (expr); \ - if (_ret != ACL_SUCCESS) { \ - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", #expr, \ - (int)_ret, __FILE__, __LINE__); \ - rc = 1; \ - goto cleanup; \ - } \ - } while (0) - -void LaunchVec_add_scalar_kernel_2d(float *v1, float *v2, void *stream); - -int main() { - size_t elemCount_v1 = 1024; - size_t fileSize_v1 = elemCount_v1 * sizeof(float); - size_t elemCount_v2 = 1024; - size_t fileSize_v2 = elemCount_v2 * sizeof(float); - float *v1Host = nullptr; - float *v1Device = nullptr; - float *v2Host = nullptr; - float *v2Device = nullptr; - int rc = 0; - bool aclInited = false; - bool deviceSet = false; - int deviceId = 0; - aclrtStream stream = nullptr; - - ACL_CHECK(aclInit(nullptr)); - aclInited = true; - if (const char *envDevice = std::getenv("ACL_DEVICE_ID")) - deviceId = std::atoi(envDevice); - ACL_CHECK(aclrtSetDevice(deviceId)); - deviceSet = true; - ACL_CHECK(aclrtCreateStream(&stream)); - ACL_CHECK(aclrtMallocHost((void **)(&v1Host), fileSize_v1)); - ACL_CHECK(aclrtMallocHost((void **)(&v2Host), fileSize_v2)); - ACL_CHECK(aclrtMalloc((void **)&v1Device, fileSize_v1, ACL_MEM_MALLOC_HUGE_FIRST)); - ACL_CHECK(aclrtMalloc((void **)&v2Device, fileSize_v2, ACL_MEM_MALLOC_HUGE_FIRST)); - ReadFile("./v1.bin", fileSize_v1, v1Host, fileSize_v1); - ReadFile("./v2.bin", fileSize_v2, v2Host, fileSize_v2); - ACL_CHECK(aclrtMemcpy(v1Device, fileSize_v1, v1Host, fileSize_v1, - ACL_MEMCPY_HOST_TO_DEVICE)); - 
ACL_CHECK(aclrtMemcpy(v2Device, fileSize_v2, v2Host, fileSize_v2, - ACL_MEMCPY_HOST_TO_DEVICE)); - LaunchVec_add_scalar_kernel_2d(v1Device, v2Device, stream); - ACL_CHECK(aclrtSynchronizeStream(stream)); - ACL_CHECK(aclrtMemcpy(v2Host, fileSize_v2, v2Device, fileSize_v2, - ACL_MEMCPY_DEVICE_TO_HOST)); - WriteFile("./v2.bin", v2Host, fileSize_v2); - -cleanup: - aclrtFree(v1Device); - aclrtFree(v2Device); - aclrtFreeHost(v1Host); - aclrtFreeHost(v2Host); - if (stream != nullptr) - aclrtDestroyStream(stream); - if (deviceSet) - aclrtResetDevice(deviceId); - if (aclInited) - aclFinalize(); - return rc; -} diff --git a/test/vpto/cases/micro-op/dsa-sfu/vlrelu-f32/kernel.pto b/test/vpto/cases/micro-op/dsa-sfu/vlrelu-f32/kernel.pto index 95005cf5a..a9e0b72c5 100644 --- a/test/vpto/cases/micro-op/dsa-sfu/vlrelu-f32/kernel.pto +++ b/test/vpto/cases/micro-op/dsa-sfu/vlrelu-f32/kernel.pto @@ -1,45 +1,177 @@ module attributes {pto.target_arch = "a5", pto.kernel_kind = #pto.kernel_kind} { - func.func @vec_add_scalar_kernel_2d(%arg0: !pto.ptr, - %arg1: !pto.ptr) attributes {pto.kernel} { - %c0 = arith.constant 0 : index - %c1 = arith.constant 1 : index - %c64 = arith.constant 64 : index - %c1024 = arith.constant 1024 : index - %c0_i64 = arith.constant 0 : i64 - %c1_i64 = arith.constant 1 : i64 - %c32_i64 = arith.constant 32 : i64 - %c128_i64 = arith.constant 128 : i64 - %c4096_i64 = arith.constant 4096 : i64 - %c1024_i32 = arith.constant 1024 : i32 - %cst = arith.constant 1.250000e-01 : f32 - - %ub_in = pto.castptr %c0_i64 : i64 -> !pto.ptr - %ub_out = pto.castptr %c4096_i64 : i64 -> !pto.ptr - - %false = arith.constant false - pto.mte_gm_ub %arg0, %ub_in, %c0_i64, %c128_i64 - nburst(%c32_i64, %c128_i64, %c128_i64) + func.func @vlrelu_f32_deep_merged_kernel(%arg0: !pto.ptr, %arg1: !pto.ptr, %arg2: !pto.ptr, %arg3: !pto.ptr, %arg4: !pto.ptr, %arg5: !pto.ptr, %arg6: !pto.ptr, %arg7: !pto.ptr) attributes {pto.kernel} { + %__deep_merge_guard = arith.constant false + // inactive 
merged from vec_add_scalar_kernel_2d_vlrelu_f32_exceptional + scf.if %__deep_merge_guard { + + %c0_m0 = arith.constant 0 : index + %c1_m0 = arith.constant 1 : index + %c64_m0 = arith.constant 64 : index + %c1024_m0 = arith.constant 1024 : index + %c0_i64_m0 = arith.constant 0 : i64 + %c1_i64_m0 = arith.constant 1 : i64 + %c32_i64_m0 = arith.constant 32 : i64 + %c128_i64_m0 = arith.constant 128 : i64 + %c4096_i64_m0 = arith.constant 4096 : i64 + %c1024_i32_m0 = arith.constant 1024 : i32 + %cst_m0 = arith.constant 1.250000e-01 : f32 + + %ub_in_m0 = pto.castptr %c0_i64_m0 : i64 -> !pto.ptr + %ub_out_m0 = pto.castptr %c4096_i64_m0 : i64 -> !pto.ptr + + %false_m0 = arith.constant false + pto.mte_gm_ub %arg0, %ub_in_m0, %c0_i64_m0, %c128_i64_m0 + nburst(%c32_i64_m0, %c128_i64_m0, %c128_i64_m0) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 + + pto.set_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] + pto.wait_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] + + pto.vecscope { + %__m0:1 = scf.for %offset_m0 = %c0_m0 to %c1024_m0 step %c64_m0 iter_args(%remaining_m0 = %c1024_i32_m0) -> (i32) { + %mask_m0, %next_remaining_m0 = pto.plt_b32 %remaining_m0 : i32 -> !pto.mask, i32 + %vec_m0 = pto.vlds %ub_in_m0[%offset_m0] : !pto.ptr -> !pto.vreg<64xf32> + %sum_m0 = pto.vlrelu %vec_m0, %cst_m0, %mask_m0 : !pto.vreg<64xf32>, f32, !pto.mask -> !pto.vreg<64xf32> + pto.vsts %sum_m0, %ub_out_m0[%offset_m0], %mask_m0 : !pto.vreg<64xf32>, !pto.ptr, !pto.mask + scf.yield %next_remaining_m0 : i32 + } + } + + pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] + pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] + pto.mte_ub_gm %ub_out_m0, %arg1, %c128_i64_m0 + nburst(%c32_i64_m0, %c128_i64_m0, %c128_i64_m0) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64 + pto.barrier #pto.pipe + + } + // inactive merged from vec_add_scalar_kernel_2d_vlrelu_f16 + scf.if %__deep_merge_guard { + + %c0_m1 = arith.constant 0 : index + %c1_m1 = arith.constant 1 : index + %c128_m1 = arith.constant 128 : index + %c1024_m1 = 
arith.constant 1024 : index + %c0_i64_m1 = arith.constant 0 : i64 + %c1_i64_m1 = arith.constant 1 : i64 + %c32_i64_m1 = arith.constant 32 : i64 + %c64_i64_m1 = arith.constant 64 : i64 + %c2048_i64_m1 = arith.constant 2048 : i64 + %cst_m1 = arith.constant 1.250000e-01 : f16 + + %ub_in_m1 = pto.castptr %c0_i64_m1 : i64 -> !pto.ptr + %ub_out_m1 = pto.castptr %c2048_i64_m1 : i64 -> !pto.ptr + + %false_m1 = arith.constant false + pto.mte_gm_ub %arg2, %ub_in_m1, %c0_i64_m1, %c64_i64_m1 + nburst(%c32_i64_m1, %c64_i64_m1, %c64_i64_m1) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 + + pto.set_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] + pto.wait_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] + + pto.vecscope { + %mask_m1 = pto.pset_b16 "PAT_ALL" : !pto.mask + scf.for %offset_m1 = %c0_m1 to %c1024_m1 step %c128_m1 { + %vec_m1 = pto.vlds %ub_in_m1[%offset_m1] : !pto.ptr -> !pto.vreg<128xf16> + %sum_m1 = pto.vlrelu %vec_m1, %cst_m1, %mask_m1 : !pto.vreg<128xf16>, f16, !pto.mask -> !pto.vreg<128xf16> + pto.vsts %sum_m1, %ub_out_m1[%offset_m1], %mask_m1 : !pto.vreg<128xf16>, !pto.ptr, !pto.mask + } + } + + pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] + pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] + pto.mte_ub_gm %ub_out_m1, %arg3, %c64_i64_m1 + nburst(%c32_i64_m1, %c64_i64_m1, %c64_i64_m1) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64 + pto.barrier #pto.pipe + + } + // inactive merged from vadds_tail_kernel_2d + scf.if %__deep_merge_guard { + + %c0_m2 = arith.constant 0 : index + %c64_m2 = arith.constant 64 : index + %c1024_m2 = arith.constant 1024 : index + %c0_i64_m2 = arith.constant 0 : i64 + %c1_i64_m2 = arith.constant 1 : i64 + %c32_i64_m2 = arith.constant 32 : i64 + %c128_i64_m2 = arith.constant 128 : i64 + %c4096_i64_m2 = arith.constant 4096 : i64 + %c1000_i32_m2 = arith.constant 1000 : i32 + %cst_m2 = arith.constant 1.250000e-01 : f32 + + %ub_in_m2 = pto.castptr %c0_i64_m2 : i64 -> !pto.ptr + %ub_out_m2 = pto.castptr %c4096_i64_m2 : i64 -> !pto.ptr + + %false_m2 = 
arith.constant false + pto.mte_gm_ub %arg4, %ub_in_m2, %c0_i64_m2, %c128_i64_m2 + nburst(%c32_i64_m2, %c128_i64_m2, %c128_i64_m2) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 + + pto.set_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] + pto.wait_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] + + pto.vecscope { + %__m2:1 = scf.for %offset_m2 = %c0_m2 to %c1024_m2 step %c64_m2 iter_args(%remaining_m2 = %c1000_i32_m2) -> (i32) { + %mask_m2, %next_remaining_m2 = pto.plt_b32 %remaining_m2 : i32 -> !pto.mask, i32 + %vec_m2 = pto.vlds %ub_in_m2[%offset_m2] : !pto.ptr -> !pto.vreg<64xf32> + %sum_m2 = pto.vlrelu %vec_m2, %cst_m2, %mask_m2 : !pto.vreg<64xf32>, f32, !pto.mask -> !pto.vreg<64xf32> + pto.vsts %sum_m2, %ub_out_m2[%offset_m2], %mask_m2 : !pto.vreg<64xf32>, !pto.ptr, !pto.mask + scf.yield %next_remaining_m2 : i32 + } + } + + pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] + pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] + pto.mte_ub_gm %ub_out_m2, %arg5, %c128_i64_m2 + nburst(%c32_i64_m2, %c128_i64_m2, %c128_i64_m2) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64 + pto.barrier #pto.pipe + + } + // active merged from vec_add_scalar_kernel_2d + + %c0_m3 = arith.constant 0 : index + %c1_m3 = arith.constant 1 : index + %c64_m3 = arith.constant 64 : index + %c1024_m3 = arith.constant 1024 : index + %c0_i64_m3 = arith.constant 0 : i64 + %c1_i64_m3 = arith.constant 1 : i64 + %c32_i64_m3 = arith.constant 32 : i64 + %c128_i64_m3 = arith.constant 128 : i64 + %c4096_i64_m3 = arith.constant 4096 : i64 + %c1024_i32_m3 = arith.constant 1024 : i32 + %cst_m3 = arith.constant 1.250000e-01 : f32 + + %ub_in_m3 = pto.castptr %c0_i64_m3 : i64 -> !pto.ptr + %ub_out_m3 = pto.castptr %c4096_i64_m3 : i64 -> !pto.ptr + + %false_m3 = arith.constant false + pto.mte_gm_ub %arg6, %ub_in_m3, %c0_i64_m3, %c128_i64_m3 + nburst(%c32_i64_m3, %c128_i64_m3, %c128_i64_m3) : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 pto.set_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] pto.wait_flag["PIPE_MTE2", "PIPE_V", 
"EVENT_ID0"] pto.vecscope { - %_:1 = scf.for %offset = %c0 to %c1024 step %c64 iter_args(%remaining = %c1024_i32) -> (i32) { - %mask, %next_remaining = pto.plt_b32 %remaining : i32 -> !pto.mask, i32 - %vec = pto.vlds %ub_in[%offset] : !pto.ptr -> !pto.vreg<64xf32> - %sum = pto.vlrelu %vec, %cst, %mask : !pto.vreg<64xf32>, f32, !pto.mask -> !pto.vreg<64xf32> - pto.vsts %sum, %ub_out[%offset], %mask : !pto.vreg<64xf32>, !pto.ptr, !pto.mask - scf.yield %next_remaining : i32 + %__m3:1 = scf.for %offset_m3 = %c0_m3 to %c1024_m3 step %c64_m3 iter_args(%remaining_m3 = %c1024_i32_m3) -> (i32) { + %mask_m3, %next_remaining_m3 = pto.plt_b32 %remaining_m3 : i32 -> !pto.mask, i32 + %vec_m3 = pto.vlds %ub_in_m3[%offset_m3] : !pto.ptr -> !pto.vreg<64xf32> + %sum_m3 = pto.vlrelu %vec_m3, %cst_m3, %mask_m3 : !pto.vreg<64xf32>, f32, !pto.mask -> !pto.vreg<64xf32> + pto.vsts %sum_m3, %ub_out_m3[%offset_m3], %mask_m3 : !pto.vreg<64xf32>, !pto.ptr, !pto.mask + scf.yield %next_remaining_m3 : i32 } } pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.mte_ub_gm %ub_out, %arg1, %c128_i64 - nburst(%c32_i64, %c128_i64, %c128_i64) + pto.mte_ub_gm %ub_out_m3, %arg7, %c128_i64_m3 + nburst(%c32_i64_m3, %c128_i64_m3, %c128_i64_m3) : !pto.ptr, !pto.ptr, i64, i64, i64, i64 pto.barrier #pto.pipe + return } } diff --git a/test/vpto/cases/micro-op/dsa-sfu/vlrelu-f32/launch.cpp b/test/vpto/cases/micro-op/dsa-sfu/vlrelu-f32/launch.cpp index 44c07c249..26fc03a1d 100644 --- a/test/vpto/cases/micro-op/dsa-sfu/vlrelu-f32/launch.cpp +++ b/test/vpto/cases/micro-op/dsa-sfu/vlrelu-f32/launch.cpp @@ -5,11 +5,9 @@ // THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, // INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. // See LICENSE in the root of the software repository for the full text of the License. 
- #ifndef __VEC_SCOPE__ #define __VEC_SCOPE__ #endif - #if defined(__CCE_AICORE__) && defined(__NPU_ARCH__) && (__NPU_ARCH__ == 2201) typedef struct { unsigned char v; } hifloat8_t; typedef struct { unsigned char v; } float8_e4m3_t; @@ -22,23 +20,31 @@ typedef struct { unsigned char v; } float4_e2m1x2_t; #if defined(__CCE_AICORE__) && defined(PTOAS_ENABLE_CCE_PRINT) #include #endif - #if !defined(__CCE_AICORE__) && !defined(TMRGSORT_HPP) -struct MrgSortExecutedNumList { - uint16_t mrgSortList0; - uint16_t mrgSortList1; - uint16_t mrgSortList2; - uint16_t mrgSortList3; -}; +struct MrgSortExecutedNumList { uint16_t mrgSortList0,mrgSortList1,mrgSortList2,mrgSortList3; }; #endif #ifndef __CPU_SIM #include "acl/acl.h" #endif -extern "C" __global__ [aicore] void vec_add_scalar_kernel_2d(__gm__ float *v1, - __gm__ float *v2); +extern "C" __global__ [aicore] void vlrelu_f32_deep_merged_kernel( + __gm__ float * arg0, + __gm__ float * arg1, + __gm__ half * arg2, + __gm__ half * arg3, + __gm__ float * arg4, + __gm__ float * arg5, + __gm__ float * arg6, + __gm__ float * arg7); -void LaunchVec_add_scalar_kernel_2d(float *v1, float *v2, void *stream) { - vec_add_scalar_kernel_2d<<<1, nullptr, stream>>>((__gm__ float *)v1, - (__gm__ float *)v2); +void LaunchVlreluF32DeepMerged(float * p0, float * p1, void *stream) { + vlrelu_f32_deep_merged_kernel<<<1, nullptr, stream>>>( + (__gm__ float *)p0, + (__gm__ float *)p0, + (__gm__ half *)p0, + (__gm__ half *)p0, + (__gm__ float *)p0, + (__gm__ float *)p0, + (__gm__ float *)p0, + (__gm__ float *)p1); } diff --git a/test/vpto/cases/micro-op/dsa-sfu/vlrelu-f32/main.cpp b/test/vpto/cases/micro-op/dsa-sfu/vlrelu-f32/main.cpp index fcb42331f..4ce2299ce 100644 --- a/test/vpto/cases/micro-op/dsa-sfu/vlrelu-f32/main.cpp +++ b/test/vpto/cases/micro-op/dsa-sfu/vlrelu-f32/main.cpp @@ -28,8 +28,8 @@ using namespace PtoTestCommon; } \ } while (0) -void LaunchVec_add_scalar_kernel_2d(float *v1, float *v2, void *stream); +void 
LaunchVlreluF32DeepMerged(float * p0, float * p1, void *stream); int main() { size_t elemCount_v1 = 1024; size_t fileSize_v1 = elemCount_v1 * sizeof(float); @@ -62,7 +62,7 @@ int main() { ACL_MEMCPY_HOST_TO_DEVICE)); ACL_CHECK(aclrtMemcpy(v2Device, fileSize_v2, v2Host, fileSize_v2, ACL_MEMCPY_HOST_TO_DEVICE)); - LaunchVec_add_scalar_kernel_2d(v1Device, v2Device, stream); + LaunchVlreluF32DeepMerged(v1Device, v2Device, stream); ACL_CHECK(aclrtSynchronizeStream(stream)); ACL_CHECK(aclrtMemcpy(v2Host, fileSize_v2, v2Device, fileSize_v2, ACL_MEMCPY_DEVICE_TO_HOST)); diff --git a/test/vpto/cases/micro-op/dsa-sfu/vlrelu-tail/compare.py b/test/vpto/cases/micro-op/dsa-sfu/vlrelu-tail/compare.py deleted file mode 100644 index c13d79273..000000000 --- a/test/vpto/cases/micro-op/dsa-sfu/vlrelu-tail/compare.py +++ /dev/null @@ -1,39 +0,0 @@ -#!/usr/bin/python3 -# Copyright (c) 2026 Huawei Technologies Co., Ltd. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. 
- - -import os -import sys -import numpy as np - - -def compare_bin_prefix(golden_path, output_path, dtype, eps, count): - if not os.path.exists(golden_path) or not os.path.exists(output_path): - return False - golden = np.fromfile(golden_path, dtype=dtype, count=count) - output = np.fromfile(output_path, dtype=dtype, count=count) - return golden.shape == output.shape and np.allclose( - golden, output, atol=eps, rtol=eps, equal_nan=True - ) - - -def main(): - strict = os.getenv("COMPARE_STRICT", "1") != "0" - ok = compare_bin_prefix("golden_v2.bin", "v2.bin", np.float32, 1e-4, 1000) - if not ok: - if strict: - print("[ERROR] compare failed") - sys.exit(2) - print("[WARN] compare failed (non-gating)") - return - print("[INFO] compare passed") - - -if __name__ == "__main__": - main() diff --git a/test/vpto/cases/micro-op/dsa-sfu/vlrelu-tail/golden.py b/test/vpto/cases/micro-op/dsa-sfu/vlrelu-tail/golden.py deleted file mode 100644 index 2544a92ff..000000000 --- a/test/vpto/cases/micro-op/dsa-sfu/vlrelu-tail/golden.py +++ /dev/null @@ -1,51 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) 2026 Huawei Technologies Co., Ltd. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. 
- -# coding=utf-8 - -import argparse -from pathlib import Path - -import numpy as np - - -ROWS = 32 -COLS = 32 -SEED = 19 -ALPHA = np.float32(0.125) -LOGICAL_ELEMS = 1000 -OUT_SENTINEL = np.float32(-123.25) - - -def generate(output_dir: Path, seed: int) -> None: - rng = np.random.default_rng(seed) - v1 = rng.uniform(-8.0, 8.0, size=(ROWS, COLS)).astype(np.float32) - v2 = np.full((ROWS, COLS), OUT_SENTINEL, dtype=np.float32) - golden_v2 = np.full((ROWS, COLS), OUT_SENTINEL, dtype=np.float32) - flat = v1.reshape(-1) - golden_v2.reshape(-1)[:LOGICAL_ELEMS] = np.where( - flat[:LOGICAL_ELEMS] >= 0.0, flat[:LOGICAL_ELEMS], flat[:LOGICAL_ELEMS] * ALPHA - ).astype(np.float32, copy=False) - - output_dir.mkdir(parents=True, exist_ok=True) - v1.reshape(-1).tofile(output_dir / "v1.bin") - v2.reshape(-1).tofile(output_dir / "v2.bin") - golden_v2.reshape(-1).tofile(output_dir / "golden_v2.bin") - - -def main() -> None: - parser = argparse.ArgumentParser() - parser.add_argument("--output-dir", type=Path, default=Path(".")) - parser.add_argument("--seed", type=int, default=SEED) - args = parser.parse_args() - generate(args.output_dir, args.seed) - - -if __name__ == "__main__": - main() diff --git a/test/vpto/cases/micro-op/dsa-sfu/vlrelu-tail/kernel.pto b/test/vpto/cases/micro-op/dsa-sfu/vlrelu-tail/kernel.pto deleted file mode 100644 index abecf2979..000000000 --- a/test/vpto/cases/micro-op/dsa-sfu/vlrelu-tail/kernel.pto +++ /dev/null @@ -1,44 +0,0 @@ -module attributes {pto.target_arch = "a5", pto.kernel_kind = #pto.kernel_kind} { - func.func @vadds_tail_kernel_2d(%arg0: !pto.ptr, - %arg1: !pto.ptr) attributes {pto.kernel} { - %c0 = arith.constant 0 : index - %c64 = arith.constant 64 : index - %c1024 = arith.constant 1024 : index - %c0_i64 = arith.constant 0 : i64 - %c1_i64 = arith.constant 1 : i64 - %c32_i64 = arith.constant 32 : i64 - %c128_i64 = arith.constant 128 : i64 - %c4096_i64 = arith.constant 4096 : i64 - %c1000_i32 = arith.constant 1000 : i32 - %cst = arith.constant 
1.250000e-01 : f32 - - %ub_in = pto.castptr %c0_i64 : i64 -> !pto.ptr - %ub_out = pto.castptr %c4096_i64 : i64 -> !pto.ptr - - %false = arith.constant false - pto.mte_gm_ub %arg0, %ub_in, %c0_i64, %c128_i64 - nburst(%c32_i64, %c128_i64, %c128_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 - - pto.set_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] - pto.wait_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] - - pto.vecscope { - %_:1 = scf.for %offset = %c0 to %c1024 step %c64 iter_args(%remaining = %c1000_i32) -> (i32) { - %mask, %next_remaining = pto.plt_b32 %remaining : i32 -> !pto.mask, i32 - %vec = pto.vlds %ub_in[%offset] : !pto.ptr -> !pto.vreg<64xf32> - %sum = pto.vlrelu %vec, %cst, %mask : !pto.vreg<64xf32>, f32, !pto.mask -> !pto.vreg<64xf32> - pto.vsts %sum, %ub_out[%offset], %mask : !pto.vreg<64xf32>, !pto.ptr, !pto.mask - scf.yield %next_remaining : i32 - } - } - - pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.mte_ub_gm %ub_out, %arg1, %c128_i64 - nburst(%c32_i64, %c128_i64, %c128_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64 - pto.barrier #pto.pipe - return - } -} diff --git a/test/vpto/cases/micro-op/dsa-sfu/vlrelu-tail/launch.cpp b/test/vpto/cases/micro-op/dsa-sfu/vlrelu-tail/launch.cpp deleted file mode 100644 index b4cd46470..000000000 --- a/test/vpto/cases/micro-op/dsa-sfu/vlrelu-tail/launch.cpp +++ /dev/null @@ -1,44 +0,0 @@ -// Copyright (c) 2026 Huawei Technologies Co., Ltd. -// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). -// Please refer to the License for details. You may not use this file except in compliance with the License. -// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. 
-// See LICENSE in the root of the software repository for the full text of the License. - -#ifndef __VEC_SCOPE__ -#define __VEC_SCOPE__ -#endif - -#if defined(__CCE_AICORE__) && defined(__NPU_ARCH__) && (__NPU_ARCH__ == 2201) -typedef struct { unsigned char v; } hifloat8_t; -typedef struct { unsigned char v; } float8_e4m3_t; -typedef struct { unsigned char v; } float8_e5m2_t; -typedef struct { unsigned char v; } float8_e8m0_t; -typedef struct { unsigned char v; } float4_e1m2x2_t; -typedef struct { unsigned char v; } float4_e2m1x2_t; -#endif -#include -#if defined(__CCE_AICORE__) && defined(PTOAS_ENABLE_CCE_PRINT) -#include -#endif - -#if !defined(__CCE_AICORE__) && !defined(TMRGSORT_HPP) -struct MrgSortExecutedNumList { - uint16_t mrgSortList0; - uint16_t mrgSortList1; - uint16_t mrgSortList2; - uint16_t mrgSortList3; -}; -#endif -#ifndef __CPU_SIM -#include "acl/acl.h" -#endif - -extern "C" __global__ [aicore] void vadds_tail_kernel_2d(__gm__ float *v1, - __gm__ float *v2); - -void LaunchVadds_tail_kernel_2d(float *v1, float *v2, void *stream) { - vadds_tail_kernel_2d<<<1, nullptr, stream>>>((__gm__ float *)v1, - (__gm__ float *)v2); -} diff --git a/test/vpto/cases/micro-op/dsa-sfu/vlrelu-tail/main.cpp b/test/vpto/cases/micro-op/dsa-sfu/vlrelu-tail/main.cpp deleted file mode 100644 index ab77e6b1a..000000000 --- a/test/vpto/cases/micro-op/dsa-sfu/vlrelu-tail/main.cpp +++ /dev/null @@ -1,83 +0,0 @@ -// Copyright (c) 2026 Huawei Technologies Co., Ltd. -// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). -// Please refer to the License for details. You may not use this file except in compliance with the License. -// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. 
-// See LICENSE in the root of the software repository for the full text of the License. - -/** -Copyright (c) 2025 Huawei Technologies Co., Ltd. -*/ - -#include "test_common.h" -#include "acl/acl.h" -#include -#include - -using namespace PtoTestCommon; - -#define ACL_CHECK(expr) \ - do { \ - const aclError _ret = (expr); \ - if (_ret != ACL_SUCCESS) { \ - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", #expr, \ - (int)_ret, __FILE__, __LINE__); \ - rc = 1; \ - goto cleanup; \ - } \ - } while (0) - -void LaunchVadds_tail_kernel_2d(float *v1, float *v2, void *stream); - -int main() { - size_t elemCount_v1 = 1024; - size_t fileSize_v1 = elemCount_v1 * sizeof(float); - size_t elemCount_v2 = 1024; - size_t fileSize_v2 = elemCount_v2 * sizeof(float); - float *v1Host = nullptr; - float *v1Device = nullptr; - float *v2Host = nullptr; - float *v2Device = nullptr; - int rc = 0; - bool aclInited = false; - bool deviceSet = false; - int deviceId = 0; - aclrtStream stream = nullptr; - - ACL_CHECK(aclInit(nullptr)); - aclInited = true; - if (const char *envDevice = std::getenv("ACL_DEVICE_ID")) - deviceId = std::atoi(envDevice); - ACL_CHECK(aclrtSetDevice(deviceId)); - deviceSet = true; - ACL_CHECK(aclrtCreateStream(&stream)); - ACL_CHECK(aclrtMallocHost((void **)(&v1Host), fileSize_v1)); - ACL_CHECK(aclrtMallocHost((void **)(&v2Host), fileSize_v2)); - ACL_CHECK(aclrtMalloc((void **)&v1Device, fileSize_v1, ACL_MEM_MALLOC_HUGE_FIRST)); - ACL_CHECK(aclrtMalloc((void **)&v2Device, fileSize_v2, ACL_MEM_MALLOC_HUGE_FIRST)); - ReadFile("./v1.bin", fileSize_v1, v1Host, fileSize_v1); - ReadFile("./v2.bin", fileSize_v2, v2Host, fileSize_v2); - ACL_CHECK(aclrtMemcpy(v1Device, fileSize_v1, v1Host, fileSize_v1, - ACL_MEMCPY_HOST_TO_DEVICE)); - ACL_CHECK(aclrtMemcpy(v2Device, fileSize_v2, v2Host, fileSize_v2, - ACL_MEMCPY_HOST_TO_DEVICE)); - LaunchVadds_tail_kernel_2d(v1Device, v2Device, stream); - ACL_CHECK(aclrtSynchronizeStream(stream)); - ACL_CHECK(aclrtMemcpy(v2Host, 
fileSize_v2, v2Device, fileSize_v2, - ACL_MEMCPY_DEVICE_TO_HOST)); - WriteFile("./v2.bin", v2Host, fileSize_v2); - -cleanup: - aclrtFree(v1Device); - aclrtFree(v2Device); - aclrtFreeHost(v1Host); - aclrtFreeHost(v2Host); - if (stream != nullptr) - aclrtDestroyStream(stream); - if (deviceSet) - aclrtResetDevice(deviceId); - if (aclInited) - aclFinalize(); - return rc; -} diff --git a/test/vpto/cases/micro-op/dsa-sfu/vmula-accumulator-boundary/compare.py b/test/vpto/cases/micro-op/dsa-sfu/vmula-accumulator-boundary/compare.py deleted file mode 100755 index e7e8af91d..000000000 --- a/test/vpto/cases/micro-op/dsa-sfu/vmula-accumulator-boundary/compare.py +++ /dev/null @@ -1,54 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) 2026 Huawei Technologies Co., Ltd. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. - -# case: micro-op/dsa-sfu/vmula-accumulator-boundary -# family: dsa-sfu -# target_ops: pto.vmula -# scenarios: core-f32, fused-op, accumulator -# NOTE: bulk-generated coverage skeleton. 
- -import os -import sys -import numpy as np - -ACTIVE_ELEMS = 65 - - -def compare_bin(golden_path, output_path, dtype, eps): - if not os.path.exists(golden_path) or not os.path.exists(output_path): - return False - golden = np.fromfile(golden_path, dtype=dtype) - output = np.fromfile(output_path, dtype=dtype) - return golden.shape == output.shape and np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True) - - -def compare_bin_prefix(golden_path, output_path, dtype, eps, count): - if not os.path.exists(golden_path) or not os.path.exists(output_path): - return False - golden = np.fromfile(golden_path, dtype=dtype, count=count) - output = np.fromfile(output_path, dtype=dtype, count=count) - return golden.size == count and output.size == count and np.allclose( - golden, output, atol=eps, rtol=eps, equal_nan=True - ) - - -def main(): - strict = os.getenv("COMPARE_STRICT", "1") != "0" - ok = compare_bin_prefix("golden_v2.bin", "v2.bin", np.float32, 1e-4, ACTIVE_ELEMS) - if not ok: - if strict: - print("[ERROR] compare failed") - sys.exit(2) - print("[WARN] compare failed (non-gating)") - return - print("[INFO] compare passed") - - -if __name__ == "__main__": - main() diff --git a/test/vpto/cases/micro-op/dsa-sfu/vmula-accumulator-boundary/golden.py b/test/vpto/cases/micro-op/dsa-sfu/vmula-accumulator-boundary/golden.py deleted file mode 100755 index 6c0d8c252..000000000 --- a/test/vpto/cases/micro-op/dsa-sfu/vmula-accumulator-boundary/golden.py +++ /dev/null @@ -1,48 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) 2026 Huawei Technologies Co., Ltd. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. 
-# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. - -# case: micro-op/dsa-sfu/vmula-accumulator-boundary -# family: dsa-sfu -# target_ops: pto.vmula -# scenarios: core-f32, fused-op, accumulator, boundary -# NOTE: bulk-generated coverage skeleton. -# coding=utf-8 - -import argparse -from pathlib import Path - -import numpy as np - - -ROWS = 32 -COLS = 32 -SEED = 19 -def generate(output_dir: Path, seed: int) -> None: - rng = np.random.default_rng(seed) - v1 = rng.uniform(-8.0, 8.0, size=(ROWS, COLS)).astype(np.float32) - v2 = np.zeros((ROWS, COLS), dtype=np.float32) - golden_v2 = (v1 + np.abs(v1) * np.abs(v1)).astype(np.float32, copy=False) - golden_v2.reshape(-1)[65:] = 0.0 - - output_dir.mkdir(parents=True, exist_ok=True) - v1.reshape(-1).tofile(output_dir / "v1.bin") - v2.reshape(-1).tofile(output_dir / "v2.bin") - golden_v2.reshape(-1).tofile(output_dir / "golden_v2.bin") - - -def main() -> None: - parser = argparse.ArgumentParser() - parser.add_argument("--output-dir", type=Path, default=Path(".")) - parser.add_argument("--seed", type=int, default=SEED) - args = parser.parse_args() - generate(args.output_dir, args.seed) - - -if __name__ == "__main__": - main() diff --git a/test/vpto/cases/micro-op/dsa-sfu/vmula-accumulator-boundary/kernel.pto b/test/vpto/cases/micro-op/dsa-sfu/vmula-accumulator-boundary/kernel.pto deleted file mode 100644 index 9b1ba1b38..000000000 --- a/test/vpto/cases/micro-op/dsa-sfu/vmula-accumulator-boundary/kernel.pto +++ /dev/null @@ -1,52 +0,0 @@ -// ----------------------------------------------------------------------------- -// case: micro-op/dsa-sfu/vmula-accumulator-boundary -// family: dsa-sfu -// target_ops: pto.vmula -// scenarios: core-f32, fused-op, accumulator, boundary -// 
----------------------------------------------------------------------------- -module attributes {pto.target_arch = "a5", pto.kernel_kind = #pto.kernel_kind} { - func.func @vec_add_scalar_kernel_2d(%arg0: !pto.ptr, - %arg1: !pto.ptr) attributes {pto.kernel} { - %c0 = arith.constant 0 : index - %c1 = arith.constant 1 : index - %c64 = arith.constant 64 : index - %c1024 = arith.constant 1024 : index - %c0_i64 = arith.constant 0 : i64 - %c1_i64 = arith.constant 1 : i64 - %c32_i64 = arith.constant 32 : i64 - %c128_i64 = arith.constant 128 : i64 - %c4096_i64 = arith.constant 4096 : i64 - %c65_i32 = arith.constant 65 : i32 - - %ub_in = pto.castptr %c0_i64 : i64 -> !pto.ptr - %ub_out = pto.castptr %c4096_i64 : i64 -> !pto.ptr - - %false = arith.constant false - pto.mte_gm_ub %arg0, %ub_in, %c0_i64, %c128_i64 - nburst(%c32_i64, %c128_i64, %c128_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 - - pto.set_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] - pto.wait_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] - - pto.vecscope { - %_:1 = scf.for %offset = %c0 to %c1024 step %c64 iter_args(%remaining = %c65_i32) -> (i32) { - %mask, %next_remaining = pto.plt_b32 %remaining : i32 -> !pto.mask, i32 - %acc = pto.vlds %ub_in[%offset] : !pto.ptr -> !pto.vreg<64xf32> - %lhs = pto.vabs %acc, %mask : !pto.vreg<64xf32>, !pto.mask -> !pto.vreg<64xf32> - %rhs = pto.vabs %lhs, %mask : !pto.vreg<64xf32>, !pto.mask -> !pto.vreg<64xf32> - %sum = pto.vmula %acc, %lhs, %rhs, %mask : !pto.vreg<64xf32>, !pto.vreg<64xf32>, !pto.vreg<64xf32>, !pto.mask -> !pto.vreg<64xf32> - pto.vsts %sum, %ub_out[%offset], %mask : !pto.vreg<64xf32>, !pto.ptr, !pto.mask - scf.yield %next_remaining : i32 - } - } - - pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.mte_ub_gm %ub_out, %arg1, %c128_i64 - nburst(%c32_i64, %c128_i64, %c128_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64 - pto.barrier #pto.pipe - return - } -} diff --git 
a/test/vpto/cases/micro-op/dsa-sfu/vmula-accumulator-boundary/launch.cpp b/test/vpto/cases/micro-op/dsa-sfu/vmula-accumulator-boundary/launch.cpp deleted file mode 100644 index 8dcf35197..000000000 --- a/test/vpto/cases/micro-op/dsa-sfu/vmula-accumulator-boundary/launch.cpp +++ /dev/null @@ -1,52 +0,0 @@ -// Copyright (c) 2026 Huawei Technologies Co., Ltd. -// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). -// Please refer to the License for details. You may not use this file except in compliance with the License. -// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -// See LICENSE in the root of the software repository for the full text of the License. - -// ----------------------------------------------------------------------------- -// case: micro-op/dsa-sfu/vmula-accumulator-boundary -// family: dsa-sfu -// target_ops: pto.vmula -// scenarios: core-f32, fused-op, accumulator -// NOTE: bulk-generated coverage skeleton. Parser/verifier/lowering failure is -// still a valid test conclusion in the current coverage-first phase. 
-// ----------------------------------------------------------------------------- -#ifndef __VEC_SCOPE__ -#define __VEC_SCOPE__ -#endif - -#if defined(__CCE_AICORE__) && defined(__NPU_ARCH__) && (__NPU_ARCH__ == 2201) -typedef struct { unsigned char v; } hifloat8_t; -typedef struct { unsigned char v; } float8_e4m3_t; -typedef struct { unsigned char v; } float8_e5m2_t; -typedef struct { unsigned char v; } float8_e8m0_t; -typedef struct { unsigned char v; } float4_e1m2x2_t; -typedef struct { unsigned char v; } float4_e2m1x2_t; -#endif -#include -#if defined(__CCE_AICORE__) && defined(PTOAS_ENABLE_CCE_PRINT) -#include -#endif - -#if !defined(__CCE_AICORE__) && !defined(TMRGSORT_HPP) -struct MrgSortExecutedNumList { - uint16_t mrgSortList0; - uint16_t mrgSortList1; - uint16_t mrgSortList2; - uint16_t mrgSortList3; -}; -#endif -#ifndef __CPU_SIM -#include "acl/acl.h" -#endif - -extern "C" __global__ [aicore] void vec_add_scalar_kernel_2d(__gm__ float *v1, - __gm__ float *v2); - -void LaunchVec_add_scalar_kernel_2d(float *v1, float *v2, void *stream) { - vec_add_scalar_kernel_2d<<<1, nullptr, stream>>>((__gm__ float *)v1, - (__gm__ float *)v2); -} diff --git a/test/vpto/cases/micro-op/dsa-sfu/vmula-accumulator-boundary/main.cpp b/test/vpto/cases/micro-op/dsa-sfu/vmula-accumulator-boundary/main.cpp deleted file mode 100644 index c9b1f36c3..000000000 --- a/test/vpto/cases/micro-op/dsa-sfu/vmula-accumulator-boundary/main.cpp +++ /dev/null @@ -1,91 +0,0 @@ -// Copyright (c) 2026 Huawei Technologies Co., Ltd. -// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). -// Please refer to the License for details. You may not use this file except in compliance with the License. 
-// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -// See LICENSE in the root of the software repository for the full text of the License. - -// ----------------------------------------------------------------------------- -// case: micro-op/dsa-sfu/vmula-accumulator-boundary -// family: dsa-sfu -// target_ops: pto.vmula -// scenarios: core-f32, fused-op, accumulator -// NOTE: bulk-generated coverage skeleton. Parser/verifier/lowering failure is -// still a valid test conclusion in the current coverage-first phase. -// ----------------------------------------------------------------------------- -/** -Copyright (c) 2025 Huawei Technologies Co., Ltd. -*/ - -#include "test_common.h" -#include "acl/acl.h" -#include -#include - -using namespace PtoTestCommon; - -#define ACL_CHECK(expr) \ - do { \ - const aclError _ret = (expr); \ - if (_ret != ACL_SUCCESS) { \ - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", #expr, \ - (int)_ret, __FILE__, __LINE__); \ - rc = 1; \ - goto cleanup; \ - } \ - } while (0) - -void LaunchVec_add_scalar_kernel_2d(float *v1, float *v2, void *stream); - -int main() { - size_t elemCount_v1 = 1024; - size_t fileSize_v1 = elemCount_v1 * sizeof(float); - size_t elemCount_v2 = 1024; - size_t fileSize_v2 = elemCount_v2 * sizeof(float); - float *v1Host = nullptr; - float *v1Device = nullptr; - float *v2Host = nullptr; - float *v2Device = nullptr; - int rc = 0; - bool aclInited = false; - bool deviceSet = false; - int deviceId = 0; - aclrtStream stream = nullptr; - - ACL_CHECK(aclInit(nullptr)); - aclInited = true; - if (const char *envDevice = std::getenv("ACL_DEVICE_ID")) - deviceId = std::atoi(envDevice); - ACL_CHECK(aclrtSetDevice(deviceId)); - deviceSet = true; - ACL_CHECK(aclrtCreateStream(&stream)); - ACL_CHECK(aclrtMallocHost((void **)(&v1Host), fileSize_v1)); - 
ACL_CHECK(aclrtMallocHost((void **)(&v2Host), fileSize_v2)); - ACL_CHECK(aclrtMalloc((void **)&v1Device, fileSize_v1, ACL_MEM_MALLOC_HUGE_FIRST)); - ACL_CHECK(aclrtMalloc((void **)&v2Device, fileSize_v2, ACL_MEM_MALLOC_HUGE_FIRST)); - ReadFile("./v1.bin", fileSize_v1, v1Host, fileSize_v1); - ReadFile("./v2.bin", fileSize_v2, v2Host, fileSize_v2); - ACL_CHECK(aclrtMemcpy(v1Device, fileSize_v1, v1Host, fileSize_v1, - ACL_MEMCPY_HOST_TO_DEVICE)); - ACL_CHECK(aclrtMemcpy(v2Device, fileSize_v2, v2Host, fileSize_v2, - ACL_MEMCPY_HOST_TO_DEVICE)); - LaunchVec_add_scalar_kernel_2d(v1Device, v2Device, stream); - ACL_CHECK(aclrtSynchronizeStream(stream)); - ACL_CHECK(aclrtMemcpy(v2Host, fileSize_v2, v2Device, fileSize_v2, - ACL_MEMCPY_DEVICE_TO_HOST)); - WriteFile("./v2.bin", v2Host, fileSize_v2); - -cleanup: - aclrtFree(v1Device); - aclrtFree(v2Device); - aclrtFreeHost(v1Host); - aclrtFreeHost(v2Host); - if (stream != nullptr) - aclrtDestroyStream(stream); - if (deviceSet) - aclrtResetDevice(deviceId); - if (aclInited) - aclFinalize(); - return rc; -} diff --git a/test/vpto/cases/micro-op/dsa-sfu/vmula/kernel.pto b/test/vpto/cases/micro-op/dsa-sfu/vmula/kernel.pto index 41708a815..5b9983ce9 100644 --- a/test/vpto/cases/micro-op/dsa-sfu/vmula/kernel.pto +++ b/test/vpto/cases/micro-op/dsa-sfu/vmula/kernel.pto @@ -1,52 +1,94 @@ -// ----------------------------------------------------------------------------- -// case: micro-op/dsa-sfu/vmula -// family: dsa-sfu -// target_ops: pto.vmula -// scenarios: core-f32, fused-op, accumulator -// ----------------------------------------------------------------------------- module attributes {pto.target_arch = "a5", pto.kernel_kind = #pto.kernel_kind} { - func.func @vec_add_scalar_kernel_2d(%arg0: !pto.ptr, - %arg1: !pto.ptr) attributes {pto.kernel} { - %c0 = arith.constant 0 : index - %c1 = arith.constant 1 : index - %c64 = arith.constant 64 : index - %c1024 = arith.constant 1024 : index - %c0_i64 = arith.constant 0 : i64 - %c1_i64 = 
arith.constant 1 : i64 - %c32_i64 = arith.constant 32 : i64 - %c128_i64 = arith.constant 128 : i64 - %c4096_i64 = arith.constant 4096 : i64 - %c1024_i32 = arith.constant 1024 : i32 - - %ub_in = pto.castptr %c0_i64 : i64 -> !pto.ptr - %ub_out = pto.castptr %c4096_i64 : i64 -> !pto.ptr - - %false = arith.constant false - pto.mte_gm_ub %arg0, %ub_in, %c0_i64, %c128_i64 - nburst(%c32_i64, %c128_i64, %c128_i64) + func.func @vmula_deep_merged_kernel(%arg0: !pto.ptr, %arg1: !pto.ptr, %arg2: !pto.ptr, %arg3: !pto.ptr) attributes {pto.kernel} { + %__deep_merge_guard = arith.constant false + // inactive merged from vec_add_scalar_kernel_2d_vmula_accumulator_boundary + scf.if %__deep_merge_guard { + + %c0_m0 = arith.constant 0 : index + %c1_m0 = arith.constant 1 : index + %c64_m0 = arith.constant 64 : index + %c1024_m0 = arith.constant 1024 : index + %c0_i64_m0 = arith.constant 0 : i64 + %c1_i64_m0 = arith.constant 1 : i64 + %c32_i64_m0 = arith.constant 32 : i64 + %c128_i64_m0 = arith.constant 128 : i64 + %c4096_i64_m0 = arith.constant 4096 : i64 + %c65_i32_m0 = arith.constant 65 : i32 + + %ub_in_m0 = pto.castptr %c0_i64_m0 : i64 -> !pto.ptr + %ub_out_m0 = pto.castptr %c4096_i64_m0 : i64 -> !pto.ptr + + %false_m0 = arith.constant false + pto.mte_gm_ub %arg0, %ub_in_m0, %c0_i64_m0, %c128_i64_m0 + nburst(%c32_i64_m0, %c128_i64_m0, %c128_i64_m0) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 + + pto.set_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] + pto.wait_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] + + pto.vecscope { + %__m0:1 = scf.for %offset_m0 = %c0_m0 to %c1024_m0 step %c64_m0 iter_args(%remaining_m0 = %c65_i32_m0) -> (i32) { + %mask_m0, %next_remaining_m0 = pto.plt_b32 %remaining_m0 : i32 -> !pto.mask, i32 + %acc_m0 = pto.vlds %ub_in_m0[%offset_m0] : !pto.ptr -> !pto.vreg<64xf32> + %lhs_m0 = pto.vabs %acc_m0, %mask_m0 : !pto.vreg<64xf32>, !pto.mask -> !pto.vreg<64xf32> + %rhs_m0 = pto.vabs %lhs_m0, %mask_m0 : !pto.vreg<64xf32>, !pto.mask -> !pto.vreg<64xf32> + %sum_m0 = 
pto.vmula %acc_m0, %lhs_m0, %rhs_m0, %mask_m0 : !pto.vreg<64xf32>, !pto.vreg<64xf32>, !pto.vreg<64xf32>, !pto.mask -> !pto.vreg<64xf32> + pto.vsts %sum_m0, %ub_out_m0[%offset_m0], %mask_m0 : !pto.vreg<64xf32>, !pto.ptr, !pto.mask + scf.yield %next_remaining_m0 : i32 + } + } + + pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] + pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] + pto.mte_ub_gm %ub_out_m0, %arg1, %c128_i64_m0 + nburst(%c32_i64_m0, %c128_i64_m0, %c128_i64_m0) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64 + pto.barrier #pto.pipe + + } + // active merged from vec_add_scalar_kernel_2d + + %c0_m1 = arith.constant 0 : index + %c1_m1 = arith.constant 1 : index + %c64_m1 = arith.constant 64 : index + %c1024_m1 = arith.constant 1024 : index + %c0_i64_m1 = arith.constant 0 : i64 + %c1_i64_m1 = arith.constant 1 : i64 + %c32_i64_m1 = arith.constant 32 : i64 + %c128_i64_m1 = arith.constant 128 : i64 + %c4096_i64_m1 = arith.constant 4096 : i64 + %c1024_i32_m1 = arith.constant 1024 : i32 + + %ub_in_m1 = pto.castptr %c0_i64_m1 : i64 -> !pto.ptr + %ub_out_m1 = pto.castptr %c4096_i64_m1 : i64 -> !pto.ptr + + %false_m1 = arith.constant false + pto.mte_gm_ub %arg2, %ub_in_m1, %c0_i64_m1, %c128_i64_m1 + nburst(%c32_i64_m1, %c128_i64_m1, %c128_i64_m1) : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 pto.set_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] pto.wait_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] pto.vecscope { - %_:1 = scf.for %offset = %c0 to %c1024 step %c64 iter_args(%remaining = %c1024_i32) -> (i32) { - %mask, %next_remaining = pto.plt_b32 %remaining : i32 -> !pto.mask, i32 - %acc = pto.vlds %ub_in[%offset] : !pto.ptr -> !pto.vreg<64xf32> - %lhs = pto.vabs %acc, %mask : !pto.vreg<64xf32>, !pto.mask -> !pto.vreg<64xf32> - %rhs = pto.vabs %lhs, %mask : !pto.vreg<64xf32>, !pto.mask -> !pto.vreg<64xf32> - %sum = pto.vmula %acc, %lhs, %rhs, %mask : !pto.vreg<64xf32>, !pto.vreg<64xf32>, !pto.vreg<64xf32>, !pto.mask -> !pto.vreg<64xf32> - pto.vsts %sum, %ub_out[%offset], %mask : 
!pto.vreg<64xf32>, !pto.ptr, !pto.mask - scf.yield %next_remaining : i32 + %__m1:1 = scf.for %offset_m1 = %c0_m1 to %c1024_m1 step %c64_m1 iter_args(%remaining_m1 = %c1024_i32_m1) -> (i32) { + %mask_m1, %next_remaining_m1 = pto.plt_b32 %remaining_m1 : i32 -> !pto.mask, i32 + %acc_m1 = pto.vlds %ub_in_m1[%offset_m1] : !pto.ptr -> !pto.vreg<64xf32> + %lhs_m1 = pto.vabs %acc_m1, %mask_m1 : !pto.vreg<64xf32>, !pto.mask -> !pto.vreg<64xf32> + %rhs_m1 = pto.vabs %lhs_m1, %mask_m1 : !pto.vreg<64xf32>, !pto.mask -> !pto.vreg<64xf32> + %sum_m1 = pto.vmula %acc_m1, %lhs_m1, %rhs_m1, %mask_m1 : !pto.vreg<64xf32>, !pto.vreg<64xf32>, !pto.vreg<64xf32>, !pto.mask -> !pto.vreg<64xf32> + pto.vsts %sum_m1, %ub_out_m1[%offset_m1], %mask_m1 : !pto.vreg<64xf32>, !pto.ptr, !pto.mask + scf.yield %next_remaining_m1 : i32 } } pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.mte_ub_gm %ub_out, %arg1, %c128_i64 - nburst(%c32_i64, %c128_i64, %c128_i64) + pto.mte_ub_gm %ub_out_m1, %arg3, %c128_i64_m1 + nburst(%c32_i64_m1, %c128_i64_m1, %c128_i64_m1) : !pto.ptr, !pto.ptr, i64, i64, i64, i64 pto.barrier #pto.pipe + return } } diff --git a/test/vpto/cases/micro-op/dsa-sfu/vmula/launch.cpp b/test/vpto/cases/micro-op/dsa-sfu/vmula/launch.cpp index fa6ae4bb5..90e11c107 100644 --- a/test/vpto/cases/micro-op/dsa-sfu/vmula/launch.cpp +++ b/test/vpto/cases/micro-op/dsa-sfu/vmula/launch.cpp @@ -5,19 +5,9 @@ // THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, // INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. // See LICENSE in the root of the software repository for the full text of the License. 
- -// ----------------------------------------------------------------------------- -// case: micro-op/dsa-sfu/vmula -// family: dsa-sfu -// target_ops: pto.vmula -// scenarios: core-f32, fused-op, accumulator -// NOTE: bulk-generated coverage skeleton. Parser/verifier/lowering failure is -// still a valid test conclusion in the current coverage-first phase. -// ----------------------------------------------------------------------------- #ifndef __VEC_SCOPE__ #define __VEC_SCOPE__ #endif - #if defined(__CCE_AICORE__) && defined(__NPU_ARCH__) && (__NPU_ARCH__ == 2201) typedef struct { unsigned char v; } hifloat8_t; typedef struct { unsigned char v; } float8_e4m3_t; @@ -30,23 +20,23 @@ typedef struct { unsigned char v; } float4_e2m1x2_t; #if defined(__CCE_AICORE__) && defined(PTOAS_ENABLE_CCE_PRINT) #include #endif - #if !defined(__CCE_AICORE__) && !defined(TMRGSORT_HPP) -struct MrgSortExecutedNumList { - uint16_t mrgSortList0; - uint16_t mrgSortList1; - uint16_t mrgSortList2; - uint16_t mrgSortList3; -}; +struct MrgSortExecutedNumList { uint16_t mrgSortList0,mrgSortList1,mrgSortList2,mrgSortList3; }; #endif #ifndef __CPU_SIM #include "acl/acl.h" #endif -extern "C" __global__ [aicore] void vec_add_scalar_kernel_2d(__gm__ float *v1, - __gm__ float *v2); +extern "C" __global__ [aicore] void vmula_deep_merged_kernel( + __gm__ float * arg0, + __gm__ float * arg1, + __gm__ float * arg2, + __gm__ float * arg3); -void LaunchVec_add_scalar_kernel_2d(float *v1, float *v2, void *stream) { - vec_add_scalar_kernel_2d<<<1, nullptr, stream>>>((__gm__ float *)v1, - (__gm__ float *)v2); +void LaunchVmulaDeepMerged(float * p0, float * p1, void *stream) { + vmula_deep_merged_kernel<<<1, nullptr, stream>>>( + (__gm__ float *)p0, + (__gm__ float *)p0, + (__gm__ float *)p0, + (__gm__ float *)p1); } diff --git a/test/vpto/cases/micro-op/dsa-sfu/vmula/main.cpp b/test/vpto/cases/micro-op/dsa-sfu/vmula/main.cpp index 54508a12d..cb6cd45fc 100644 --- 
a/test/vpto/cases/micro-op/dsa-sfu/vmula/main.cpp +++ b/test/vpto/cases/micro-op/dsa-sfu/vmula/main.cpp @@ -36,8 +36,8 @@ using namespace PtoTestCommon; } \ } while (0) -void LaunchVec_add_scalar_kernel_2d(float *v1, float *v2, void *stream); +void LaunchVmulaDeepMerged(float * p0, float * p1, void *stream); int main() { size_t elemCount_v1 = 1024; size_t fileSize_v1 = elemCount_v1 * sizeof(float); @@ -70,7 +70,7 @@ int main() { ACL_MEMCPY_HOST_TO_DEVICE)); ACL_CHECK(aclrtMemcpy(v2Device, fileSize_v2, v2Host, fileSize_v2, ACL_MEMCPY_HOST_TO_DEVICE)); - LaunchVec_add_scalar_kernel_2d(v1Device, v2Device, stream); + LaunchVmulaDeepMerged(v1Device, v2Device, stream); ACL_CHECK(aclrtSynchronizeStream(stream)); ACL_CHECK(aclrtMemcpy(v2Host, fileSize_v2, v2Device, fileSize_v2, ACL_MEMCPY_DEVICE_TO_HOST)); diff --git a/test/vpto/cases/micro-op/dsa-sfu/vprelu-f32/kernel.pto b/test/vpto/cases/micro-op/dsa-sfu/vprelu-f32/kernel.pto index cb1b92910..c01b5301f 100644 --- a/test/vpto/cases/micro-op/dsa-sfu/vprelu-f32/kernel.pto +++ b/test/vpto/cases/micro-op/dsa-sfu/vprelu-f32/kernel.pto @@ -1,56 +1,98 @@ -// ----------------------------------------------------------------------------- -// case: micro-op/dsa-sfu/vprelu-f32 -// family: dsa-sfu -// target_ops: pto.vprelu -// scenarios: core-f32, vector-alpha -// NOTE: bulk-generated coverage skeleton. Parser/verifier/lowering failure is -// still a valid test conclusion in the current coverage-first phase. 
-// ----------------------------------------------------------------------------- module attributes {pto.target_arch = "a5", pto.kernel_kind = #pto.kernel_kind} { - func.func @vprelu_kernel_2d(%arg0: !pto.ptr, - %arg1: !pto.ptr, - %arg2: !pto.ptr) attributes {pto.kernel} { - %c0 = arith.constant 0 : index - %c64 = arith.constant 64 : index - %c1024 = arith.constant 1024 : index - %c0_i64 = arith.constant 0 : i64 - %c1_i64 = arith.constant 1 : i64 - %c32_i64 = arith.constant 32 : i64 - %c128_i64 = arith.constant 128 : i64 - %c4096_i64 = arith.constant 4096 : i64 - %c8192_i64 = arith.constant 8192 : i64 - - %ub_in = pto.castptr %c0_i64 : i64 -> !pto.ptr - %ub_alpha = pto.castptr %c4096_i64 : i64 -> !pto.ptr - %ub_out = pto.castptr %c8192_i64 : i64 -> !pto.ptr - - %false = arith.constant false - pto.mte_gm_ub %arg0, %ub_in, %c0_i64, %c128_i64 - nburst(%c32_i64, %c128_i64, %c128_i64) + func.func @vprelu_f32_deep_merged_kernel(%arg0: !pto.ptr, %arg1: !pto.ptr, %arg2: !pto.ptr, %arg3: !pto.ptr, %arg4: !pto.ptr, %arg5: !pto.ptr) attributes {pto.kernel} { + %__deep_merge_guard = arith.constant false + // inactive merged from vprelu_tail_kernel_2d + scf.if %__deep_merge_guard { + + %c0_m0 = arith.constant 0 : index + %c64_m0 = arith.constant 64 : index + %c1024_m0 = arith.constant 1024 : index + %c0_i64_m0 = arith.constant 0 : i64 + %c1_i64_m0 = arith.constant 1 : i64 + %c32_i64_m0 = arith.constant 32 : i64 + %c128_i64_m0 = arith.constant 128 : i64 + %c4096_i64_m0 = arith.constant 4096 : i64 + %c8192_i64_m0 = arith.constant 8192 : i64 + %c1000_i32_m0 = arith.constant 1000 : i32 + + %ub_in_m0 = pto.castptr %c0_i64_m0 : i64 -> !pto.ptr + %ub_alpha_m0 = pto.castptr %c4096_i64_m0 : i64 -> !pto.ptr + %ub_out_m0 = pto.castptr %c8192_i64_m0 : i64 -> !pto.ptr + + %false_m0 = arith.constant false + pto.mte_gm_ub %arg0, %ub_in_m0, %c0_i64_m0, %c128_i64_m0 + nburst(%c32_i64_m0, %c128_i64_m0, %c128_i64_m0) : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 - pto.mte_gm_ub %arg1, %ub_alpha, 
%c0_i64, %c128_i64 - nburst(%c32_i64, %c128_i64, %c128_i64) + pto.mte_gm_ub %arg1, %ub_alpha_m0, %c0_i64_m0, %c128_i64_m0 + nburst(%c32_i64_m0, %c128_i64_m0, %c128_i64_m0) : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 pto.set_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] pto.wait_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] pto.vecscope { - %mask = pto.pset_b32 "PAT_ALL" : !pto.mask - scf.for %offset = %c0 to %c1024 step %c64 { - %vec = pto.vlds %ub_in[%offset] : !pto.ptr -> !pto.vreg<64xf32> - %alpha = pto.vlds %ub_alpha[%offset] : !pto.ptr -> !pto.vreg<64xf32> - %sum = pto.vprelu %vec, %alpha, %mask : !pto.vreg<64xf32>, !pto.vreg<64xf32>, !pto.mask -> !pto.vreg<64xf32> - pto.vsts %sum, %ub_out[%offset], %mask : !pto.vreg<64xf32>, !pto.ptr, !pto.mask + %__m0:1 = scf.for %offset_m0 = %c0_m0 to %c1024_m0 step %c64_m0 iter_args(%remaining_m0 = %c1000_i32_m0) -> (i32) { + %mask_m0, %next_remaining_m0 = pto.plt_b32 %remaining_m0 : i32 -> !pto.mask, i32 + %vec_m0 = pto.vlds %ub_in_m0[%offset_m0] : !pto.ptr -> !pto.vreg<64xf32> + %alpha_m0 = pto.vlds %ub_alpha_m0[%offset_m0] : !pto.ptr -> !pto.vreg<64xf32> + %sum_m0 = pto.vprelu %vec_m0, %alpha_m0, %mask_m0 : !pto.vreg<64xf32>, !pto.vreg<64xf32>, !pto.mask -> !pto.vreg<64xf32> + pto.vsts %sum_m0, %ub_out_m0[%offset_m0], %mask_m0 : !pto.vreg<64xf32>, !pto.ptr, !pto.mask + scf.yield %next_remaining_m0 : i32 } } pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.mte_ub_gm %ub_out, %arg2, %c128_i64 - nburst(%c32_i64, %c128_i64, %c128_i64) + pto.mte_ub_gm %ub_out_m0, %arg2, %c128_i64_m0 + nburst(%c32_i64_m0, %c128_i64_m0, %c128_i64_m0) : !pto.ptr, !pto.ptr, i64, i64, i64, i64 pto.barrier #pto.pipe + + } + // active merged from vprelu_kernel_2d + + %c0_m1 = arith.constant 0 : index + %c64_m1 = arith.constant 64 : index + %c1024_m1 = arith.constant 1024 : index + %c0_i64_m1 = arith.constant 0 : i64 + %c1_i64_m1 = arith.constant 1 : i64 + %c32_i64_m1 = arith.constant 32 : i64 + 
%c128_i64_m1 = arith.constant 128 : i64 + %c4096_i64_m1 = arith.constant 4096 : i64 + %c8192_i64_m1 = arith.constant 8192 : i64 + + %ub_in_m1 = pto.castptr %c0_i64_m1 : i64 -> !pto.ptr + %ub_alpha_m1 = pto.castptr %c4096_i64_m1 : i64 -> !pto.ptr + %ub_out_m1 = pto.castptr %c8192_i64_m1 : i64 -> !pto.ptr + + %false_m1 = arith.constant false + pto.mte_gm_ub %arg3, %ub_in_m1, %c0_i64_m1, %c128_i64_m1 + nburst(%c32_i64_m1, %c128_i64_m1, %c128_i64_m1) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 + pto.mte_gm_ub %arg4, %ub_alpha_m1, %c0_i64_m1, %c128_i64_m1 + nburst(%c32_i64_m1, %c128_i64_m1, %c128_i64_m1) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 + + pto.set_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] + pto.wait_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] + + pto.vecscope { + %mask_m1 = pto.pset_b32 "PAT_ALL" : !pto.mask + scf.for %offset_m1 = %c0_m1 to %c1024_m1 step %c64_m1 { + %vec_m1 = pto.vlds %ub_in_m1[%offset_m1] : !pto.ptr -> !pto.vreg<64xf32> + %alpha_m1 = pto.vlds %ub_alpha_m1[%offset_m1] : !pto.ptr -> !pto.vreg<64xf32> + %sum_m1 = pto.vprelu %vec_m1, %alpha_m1, %mask_m1 : !pto.vreg<64xf32>, !pto.vreg<64xf32>, !pto.mask -> !pto.vreg<64xf32> + pto.vsts %sum_m1, %ub_out_m1[%offset_m1], %mask_m1 : !pto.vreg<64xf32>, !pto.ptr, !pto.mask + } + } + + pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] + pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] + pto.mte_ub_gm %ub_out_m1, %arg5, %c128_i64_m1 + nburst(%c32_i64_m1, %c128_i64_m1, %c128_i64_m1) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64 + pto.barrier #pto.pipe + return } } diff --git a/test/vpto/cases/micro-op/dsa-sfu/vprelu-f32/launch.cpp b/test/vpto/cases/micro-op/dsa-sfu/vprelu-f32/launch.cpp index d6002ce63..9a0e97c79 100644 --- a/test/vpto/cases/micro-op/dsa-sfu/vprelu-f32/launch.cpp +++ b/test/vpto/cases/micro-op/dsa-sfu/vprelu-f32/launch.cpp @@ -5,19 +5,9 @@ // THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, // INCLUDING BUT NOT LIMITED TO 
NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. // See LICENSE in the root of the software repository for the full text of the License. - -// ----------------------------------------------------------------------------- -// case: micro-op/dsa-sfu/vprelu-f32 -// family: dsa-sfu -// target_ops: pto.vprelu -// scenarios: core-f32, vector-alpha -// NOTE: bulk-generated coverage skeleton. Parser/verifier/lowering failure is -// still a valid test conclusion in the current coverage-first phase. -// ----------------------------------------------------------------------------- #ifndef __VEC_SCOPE__ #define __VEC_SCOPE__ #endif - #if defined(__CCE_AICORE__) && defined(__NPU_ARCH__) && (__NPU_ARCH__ == 2201) typedef struct { unsigned char v; } hifloat8_t; typedef struct { unsigned char v; } float8_e4m3_t; @@ -30,25 +20,27 @@ typedef struct { unsigned char v; } float4_e2m1x2_t; #if defined(__CCE_AICORE__) && defined(PTOAS_ENABLE_CCE_PRINT) #include #endif - #if !defined(__CCE_AICORE__) && !defined(TMRGSORT_HPP) -struct MrgSortExecutedNumList { - uint16_t mrgSortList0; - uint16_t mrgSortList1; - uint16_t mrgSortList2; - uint16_t mrgSortList3; -}; +struct MrgSortExecutedNumList { uint16_t mrgSortList0,mrgSortList1,mrgSortList2,mrgSortList3; }; #endif #ifndef __CPU_SIM #include "acl/acl.h" #endif -extern "C" __global__ [aicore] void vprelu_kernel_2d(__gm__ float *v1, - __gm__ float *v2, - __gm__ float *v3); +extern "C" __global__ [aicore] void vprelu_f32_deep_merged_kernel( + __gm__ float * arg0, + __gm__ float * arg1, + __gm__ float * arg2, + __gm__ float * arg3, + __gm__ float * arg4, + __gm__ float * arg5); -void LaunchVprelu_kernel_2d(float *v1, float *v2, float *v3, void *stream) { - vprelu_kernel_2d<<<1, nullptr, stream>>>((__gm__ float *)v1, - (__gm__ float *)v2, - (__gm__ float *)v3); +void LaunchVpreluF32DeepMerged(float * p0, float * p1, float * p2, void *stream) { + vprelu_f32_deep_merged_kernel<<<1, nullptr, stream>>>( + (__gm__ float *)p0, + 
(__gm__ float *)p0, + (__gm__ float *)p0, + (__gm__ float *)p0, + (__gm__ float *)p1, + (__gm__ float *)p2); } diff --git a/test/vpto/cases/micro-op/dsa-sfu/vprelu-f32/main.cpp b/test/vpto/cases/micro-op/dsa-sfu/vprelu-f32/main.cpp index 6a2738912..5c32e355e 100644 --- a/test/vpto/cases/micro-op/dsa-sfu/vprelu-f32/main.cpp +++ b/test/vpto/cases/micro-op/dsa-sfu/vprelu-f32/main.cpp @@ -36,8 +36,8 @@ using namespace PtoTestCommon; } \ } while (0) -void LaunchVprelu_kernel_2d(float *v1, float *v2, float *v3, void *stream); +void LaunchVpreluF32DeepMerged(float * p0, float * p1, float * p2, void *stream); int main() { size_t elemCount_v1 = 1024; size_t fileSize_v1 = elemCount_v1 * sizeof(float); @@ -79,7 +79,7 @@ int main() { ACL_MEMCPY_HOST_TO_DEVICE)); ACL_CHECK(aclrtMemcpy(v3Device, fileSize_v3, v3Host, fileSize_v3, ACL_MEMCPY_HOST_TO_DEVICE)); - LaunchVprelu_kernel_2d(v1Device, v2Device, v3Device, stream); + LaunchVpreluF32DeepMerged(v1Device, v2Device, v3Device, stream); ACL_CHECK(aclrtSynchronizeStream(stream)); ACL_CHECK(aclrtMemcpy(v3Host, fileSize_v3, v3Device, fileSize_v3, ACL_MEMCPY_DEVICE_TO_HOST)); diff --git a/test/vpto/cases/micro-op/dsa-sfu/vprelu-tail/compare.py b/test/vpto/cases/micro-op/dsa-sfu/vprelu-tail/compare.py deleted file mode 100755 index bbc6ab65a..000000000 --- a/test/vpto/cases/micro-op/dsa-sfu/vprelu-tail/compare.py +++ /dev/null @@ -1,54 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) 2026 Huawei Technologies Co., Ltd. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. 
-# See LICENSE in the root of the software repository for the full text of the License. - -# case: micro-op/dsa-sfu/vprelu-tail -# family: dsa-sfu -# target_ops: pto.vprelu -# scenarios: core-f32, vector-alpha, tail-mask -# NOTE: bulk-generated coverage skeleton. - -import os -import sys -import numpy as np - -ACTIVE_ELEMS = 1000 - - -def compare_bin(golden_path, output_path, dtype, eps): - if not os.path.exists(golden_path) or not os.path.exists(output_path): - return False - golden = np.fromfile(golden_path, dtype=dtype) - output = np.fromfile(output_path, dtype=dtype) - return golden.shape == output.shape and np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True) - - -def compare_bin_prefix(golden_path, output_path, dtype, eps, count): - if not os.path.exists(golden_path) or not os.path.exists(output_path): - return False - golden = np.fromfile(golden_path, dtype=dtype, count=count) - output = np.fromfile(output_path, dtype=dtype, count=count) - return golden.size == count and output.size == count and np.allclose( - golden, output, atol=eps, rtol=eps, equal_nan=True - ) - - -def main(): - strict = os.getenv("COMPARE_STRICT", "1") != "0" - ok = compare_bin_prefix("golden_v3.bin", "v3.bin", np.float32, 1e-4, ACTIVE_ELEMS) - if not ok: - if strict: - print("[ERROR] compare failed") - sys.exit(2) - print("[WARN] compare failed (non-gating)") - return - print("[INFO] compare passed") - - -if __name__ == "__main__": - main() diff --git a/test/vpto/cases/micro-op/dsa-sfu/vprelu-tail/golden.py b/test/vpto/cases/micro-op/dsa-sfu/vprelu-tail/golden.py deleted file mode 100755 index a9e101569..000000000 --- a/test/vpto/cases/micro-op/dsa-sfu/vprelu-tail/golden.py +++ /dev/null @@ -1,50 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) 2026 Huawei Technologies Co., Ltd. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). 
-# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. - -# case: micro-op/dsa-sfu/vprelu-tail -# family: dsa-sfu -# target_ops: pto.vprelu -# scenarios: core-f32, vector-alpha, tail-mask -# NOTE: bulk-generated coverage skeleton. -# coding=utf-8 - -import argparse -from pathlib import Path - -import numpy as np - - -ROWS = 32 -COLS = 32 -SEED = 19 -def generate(output_dir: Path, seed: int) -> None: - rng = np.random.default_rng(seed) - v1 = rng.uniform(-8.0, 8.0, size=(ROWS, COLS)).astype(np.float32) - v2 = rng.uniform(0.05, 0.5, size=(ROWS, COLS)).astype(np.float32) - v3 = np.zeros((ROWS, COLS), dtype=np.float32) - golden_v3 = np.where(v1 >= 0.0, v1, v1 * v2).astype(np.float32, copy=False) - golden_v3.reshape(-1)[1000:] = 0.0 - - output_dir.mkdir(parents=True, exist_ok=True) - v1.reshape(-1).tofile(output_dir / "v1.bin") - v2.reshape(-1).tofile(output_dir / "v2.bin") - v3.reshape(-1).tofile(output_dir / "v3.bin") - golden_v3.reshape(-1).tofile(output_dir / "golden_v3.bin") - - -def main() -> None: - parser = argparse.ArgumentParser() - parser.add_argument("--output-dir", type=Path, default=Path(".")) - parser.add_argument("--seed", type=int, default=SEED) - args = parser.parse_args() - generate(args.output_dir, args.seed) - - -if __name__ == "__main__": - main() diff --git a/test/vpto/cases/micro-op/dsa-sfu/vprelu-tail/kernel.pto b/test/vpto/cases/micro-op/dsa-sfu/vprelu-tail/kernel.pto deleted file mode 100644 index 8bb0c56ac..000000000 --- a/test/vpto/cases/micro-op/dsa-sfu/vprelu-tail/kernel.pto +++ /dev/null @@ -1,58 +0,0 @@ -// ----------------------------------------------------------------------------- -// 
case: micro-op/dsa-sfu/vprelu-tail -// family: dsa-sfu -// target_ops: pto.vprelu -// scenarios: core-f32, vector-alpha, tail-mask -// NOTE: bulk-generated coverage skeleton. Parser/verifier/lowering failure is -// still a valid test conclusion in the current coverage-first phase. -// ----------------------------------------------------------------------------- -module attributes {pto.target_arch = "a5", pto.kernel_kind = #pto.kernel_kind} { - func.func @vprelu_tail_kernel_2d(%arg0: !pto.ptr, - %arg1: !pto.ptr, - %arg2: !pto.ptr) attributes {pto.kernel} { - %c0 = arith.constant 0 : index - %c64 = arith.constant 64 : index - %c1024 = arith.constant 1024 : index - %c0_i64 = arith.constant 0 : i64 - %c1_i64 = arith.constant 1 : i64 - %c32_i64 = arith.constant 32 : i64 - %c128_i64 = arith.constant 128 : i64 - %c4096_i64 = arith.constant 4096 : i64 - %c8192_i64 = arith.constant 8192 : i64 - %c1000_i32 = arith.constant 1000 : i32 - - %ub_in = pto.castptr %c0_i64 : i64 -> !pto.ptr - %ub_alpha = pto.castptr %c4096_i64 : i64 -> !pto.ptr - %ub_out = pto.castptr %c8192_i64 : i64 -> !pto.ptr - - %false = arith.constant false - pto.mte_gm_ub %arg0, %ub_in, %c0_i64, %c128_i64 - nburst(%c32_i64, %c128_i64, %c128_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 - pto.mte_gm_ub %arg1, %ub_alpha, %c0_i64, %c128_i64 - nburst(%c32_i64, %c128_i64, %c128_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 - - pto.set_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] - pto.wait_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] - - pto.vecscope { - %_:1 = scf.for %offset = %c0 to %c1024 step %c64 iter_args(%remaining = %c1000_i32) -> (i32) { - %mask, %next_remaining = pto.plt_b32 %remaining : i32 -> !pto.mask, i32 - %vec = pto.vlds %ub_in[%offset] : !pto.ptr -> !pto.vreg<64xf32> - %alpha = pto.vlds %ub_alpha[%offset] : !pto.ptr -> !pto.vreg<64xf32> - %sum = pto.vprelu %vec, %alpha, %mask : !pto.vreg<64xf32>, !pto.vreg<64xf32>, !pto.mask -> !pto.vreg<64xf32> - pto.vsts %sum, %ub_out[%offset], %mask : 
!pto.vreg<64xf32>, !pto.ptr, !pto.mask - scf.yield %next_remaining : i32 - } - } - - pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.mte_ub_gm %ub_out, %arg2, %c128_i64 - nburst(%c32_i64, %c128_i64, %c128_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64 - pto.barrier #pto.pipe - return - } -} diff --git a/test/vpto/cases/micro-op/dsa-sfu/vprelu-tail/launch.cpp b/test/vpto/cases/micro-op/dsa-sfu/vprelu-tail/launch.cpp deleted file mode 100644 index b4a675c6a..000000000 --- a/test/vpto/cases/micro-op/dsa-sfu/vprelu-tail/launch.cpp +++ /dev/null @@ -1,54 +0,0 @@ -// Copyright (c) 2026 Huawei Technologies Co., Ltd. -// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). -// Please refer to the License for details. You may not use this file except in compliance with the License. -// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -// See LICENSE in the root of the software repository for the full text of the License. - -// ----------------------------------------------------------------------------- -// case: micro-op/dsa-sfu/vprelu-tail -// family: dsa-sfu -// target_ops: pto.vprelu -// scenarios: core-f32, vector-alpha, tail-mask -// NOTE: bulk-generated coverage skeleton. Parser/verifier/lowering failure is -// still a valid test conclusion in the current coverage-first phase. 
-// ----------------------------------------------------------------------------- -#ifndef __VEC_SCOPE__ -#define __VEC_SCOPE__ -#endif - -#if defined(__CCE_AICORE__) && defined(__NPU_ARCH__) && (__NPU_ARCH__ == 2201) -typedef struct { unsigned char v; } hifloat8_t; -typedef struct { unsigned char v; } float8_e4m3_t; -typedef struct { unsigned char v; } float8_e5m2_t; -typedef struct { unsigned char v; } float8_e8m0_t; -typedef struct { unsigned char v; } float4_e1m2x2_t; -typedef struct { unsigned char v; } float4_e2m1x2_t; -#endif -#include -#if defined(__CCE_AICORE__) && defined(PTOAS_ENABLE_CCE_PRINT) -#include -#endif - -#if !defined(__CCE_AICORE__) && !defined(TMRGSORT_HPP) -struct MrgSortExecutedNumList { - uint16_t mrgSortList0; - uint16_t mrgSortList1; - uint16_t mrgSortList2; - uint16_t mrgSortList3; -}; -#endif -#ifndef __CPU_SIM -#include "acl/acl.h" -#endif - -extern "C" __global__ [aicore] void vprelu_tail_kernel_2d(__gm__ float *v1, - __gm__ float *v2, - __gm__ float *v3); - -void LaunchVprelu_tail_kernel_2d(float *v1, float *v2, float *v3, void *stream) { - vprelu_tail_kernel_2d<<<1, nullptr, stream>>>((__gm__ float *)v1, - (__gm__ float *)v2, - (__gm__ float *)v3); -} diff --git a/test/vpto/cases/micro-op/dsa-sfu/vprelu-tail/main.cpp b/test/vpto/cases/micro-op/dsa-sfu/vprelu-tail/main.cpp deleted file mode 100644 index 27b55f701..000000000 --- a/test/vpto/cases/micro-op/dsa-sfu/vprelu-tail/main.cpp +++ /dev/null @@ -1,102 +0,0 @@ -// Copyright (c) 2026 Huawei Technologies Co., Ltd. -// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). -// Please refer to the License for details. You may not use this file except in compliance with the License. 
-// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -// See LICENSE in the root of the software repository for the full text of the License. - -// ----------------------------------------------------------------------------- -// case: micro-op/dsa-sfu/vprelu-tail -// family: dsa-sfu -// target_ops: pto.vprelu -// scenarios: core-f32, vector-alpha, tail-mask -// NOTE: bulk-generated coverage skeleton. Parser/verifier/lowering failure is -// still a valid test conclusion in the current coverage-first phase. -// ----------------------------------------------------------------------------- -/** -Copyright (c) 2025 Huawei Technologies Co., Ltd. -*/ - -#include "test_common.h" -#include "acl/acl.h" -#include -#include - -using namespace PtoTestCommon; - -#define ACL_CHECK(expr) \ - do { \ - const aclError _ret = (expr); \ - if (_ret != ACL_SUCCESS) { \ - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", #expr, \ - (int)_ret, __FILE__, __LINE__); \ - rc = 1; \ - goto cleanup; \ - } \ - } while (0) - -void LaunchVprelu_tail_kernel_2d(float *v1, float *v2, float *v3, void *stream); - -int main() { - size_t elemCount_v1 = 1024; - size_t fileSize_v1 = elemCount_v1 * sizeof(float); - size_t elemCount_v2 = 1024; - size_t fileSize_v2 = elemCount_v2 * sizeof(float); - size_t elemCount_v3 = 1024; - size_t fileSize_v3 = elemCount_v3 * sizeof(float); - float *v1Host = nullptr; - float *v1Device = nullptr; - float *v2Host = nullptr; - float *v2Device = nullptr; - float *v3Host = nullptr; - float *v3Device = nullptr; - int rc = 0; - bool aclInited = false; - bool deviceSet = false; - int deviceId = 0; - aclrtStream stream = nullptr; - - ACL_CHECK(aclInit(nullptr)); - aclInited = true; - if (const char *envDevice = std::getenv("ACL_DEVICE_ID")) - deviceId = std::atoi(envDevice); - ACL_CHECK(aclrtSetDevice(deviceId)); - 
deviceSet = true; - ACL_CHECK(aclrtCreateStream(&stream)); - ACL_CHECK(aclrtMallocHost((void **)(&v1Host), fileSize_v1)); - ACL_CHECK(aclrtMallocHost((void **)(&v2Host), fileSize_v2)); - ACL_CHECK(aclrtMallocHost((void **)(&v3Host), fileSize_v3)); - ACL_CHECK(aclrtMalloc((void **)&v1Device, fileSize_v1, ACL_MEM_MALLOC_HUGE_FIRST)); - ACL_CHECK(aclrtMalloc((void **)&v2Device, fileSize_v2, ACL_MEM_MALLOC_HUGE_FIRST)); - ACL_CHECK(aclrtMalloc((void **)&v3Device, fileSize_v3, ACL_MEM_MALLOC_HUGE_FIRST)); - ReadFile("./v1.bin", fileSize_v1, v1Host, fileSize_v1); - ReadFile("./v2.bin", fileSize_v2, v2Host, fileSize_v2); - ReadFile("./v3.bin", fileSize_v3, v3Host, fileSize_v3); - ACL_CHECK(aclrtMemcpy(v1Device, fileSize_v1, v1Host, fileSize_v1, - ACL_MEMCPY_HOST_TO_DEVICE)); - ACL_CHECK(aclrtMemcpy(v2Device, fileSize_v2, v2Host, fileSize_v2, - ACL_MEMCPY_HOST_TO_DEVICE)); - ACL_CHECK(aclrtMemcpy(v3Device, fileSize_v3, v3Host, fileSize_v3, - ACL_MEMCPY_HOST_TO_DEVICE)); - LaunchVprelu_tail_kernel_2d(v1Device, v2Device, v3Device, stream); - ACL_CHECK(aclrtSynchronizeStream(stream)); - ACL_CHECK(aclrtMemcpy(v3Host, fileSize_v3, v3Device, fileSize_v3, - ACL_MEMCPY_DEVICE_TO_HOST)); - WriteFile("./v3.bin", v3Host, fileSize_v3); - -cleanup: - aclrtFree(v1Device); - aclrtFree(v2Device); - aclrtFree(v3Device); - aclrtFreeHost(v1Host); - aclrtFreeHost(v2Host); - aclrtFreeHost(v3Host); - if (stream != nullptr) - aclrtDestroyStream(stream); - if (deviceSet) - aclrtResetDevice(deviceId); - if (aclInited) - aclFinalize(); - return rc; -} diff --git a/test/vpto/cases/micro-op/gather-scatter/vgather2-duplicate-index/compare.py b/test/vpto/cases/micro-op/gather-scatter/vgather2-duplicate-index/compare.py deleted file mode 100755 index c932750d2..000000000 --- a/test/vpto/cases/micro-op/gather-scatter/vgather2-duplicate-index/compare.py +++ /dev/null @@ -1,209 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) 2026 Huawei Technologies Co., Ltd. 
-# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. - -# case: micro-op/gather-scatter/vgather2-duplicate-index -# family: gather-scatter -# target_ops: pto.vgather2 -# scenarios: core-f32, non-contiguous, explicit-index-pattern, load-effect-validation, no-alias -# NOTE: bulk-generated coverage skeleton. -# coding=utf-8 - -import os -import sys -import numpy as np - - -REPEAT_BYTES = 256 - - -def _ceil_div(x, y): - return (x + y - 1) // y - - -def _packed_pred_storage_bytes(logical_elems, src_elem_bytes): - logical_elems = int(logical_elems) - src_elem_bytes = int(src_elem_bytes) - if logical_elems <= 0: - raise ValueError(f"logical_elems must be > 0, got {logical_elems}") - if src_elem_bytes not in (1, 2, 4): - raise ValueError(f"unsupported packed predicate source size: {src_elem_bytes}") - - repeat_elems = REPEAT_BYTES // src_elem_bytes - if src_elem_bytes == 4: - repeat_times = _ceil_div(logical_elems, repeat_elems) + 1 - loop_count = repeat_times // 2 - return loop_count * 16 - - repeat_times = _ceil_div(logical_elems, repeat_elems) - return repeat_times * (repeat_elems // 8) - - -def compare_bin(golden_path, output_path, dtype, eps): - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - dtype_np = np.dtype(dtype) - golden = np.fromfile(golden_path, dtype=dtype_np) - output = 
np.fromfile(output_path, dtype=dtype_np) - if golden.shape != output.shape: - print(f"[ERROR] Shape mismatch: {golden_path} {golden.shape} vs {output_path} {output.shape}") - return False - if not np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True): - if golden.size: - if np.issubdtype(dtype_np, np.floating): - g = golden.astype(np.float64, copy=False) - o = output.astype(np.float64, copy=False) - elif np.issubdtype(dtype_np, np.integer) or np.issubdtype(dtype_np, np.unsignedinteger): - g = golden.astype(np.int64, copy=False) - o = output.astype(np.int64, copy=False) - else: - g = golden.astype(np.float64, copy=False) - o = output.astype(np.float64, copy=False) - abs_diff = np.abs(g - o) - idx = int(np.argmax(abs_diff)) - diff = float(abs_diff[idx]) - print( - f"[ERROR] Mismatch: {golden_path} vs {output_path}, max diff={diff} at idx={idx} " - f"(golden={g[idx]}, out={o[idx]}, dtype={dtype_np})" - ) - else: - print(f"[ERROR] Mismatch: {golden_path} vs {output_path}, empty buffers, dtype={dtype_np}") - return False - return True - - -def compare_bin_prefix(golden_path, output_path, dtype, eps, count): - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - try: - count = int(count) - except Exception: - print(f"[ERROR] Invalid prefix count: {count}") - return False - if count <= 0: - print(f"[ERROR] Invalid prefix count: {count}") - return False - - dtype_np = np.dtype(dtype) - golden = np.fromfile(golden_path, dtype=dtype_np, count=count) - output = np.fromfile(output_path, dtype=dtype_np, count=count) - - if golden.size != count or output.size != count: - print( - f"[ERROR] Prefix read too small: need={count} elems, " - f"golden={golden.size}, out={output.size}" - ) - return False - - if not np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True): - if golden.size: - if np.issubdtype(dtype_np, 
np.floating): - g = golden.astype(np.float64, copy=False) - o = output.astype(np.float64, copy=False) - elif np.issubdtype(dtype_np, np.integer) or np.issubdtype(dtype_np, np.unsignedinteger): - g = golden.astype(np.int64, copy=False) - o = output.astype(np.int64, copy=False) - else: - g = golden.astype(np.float64, copy=False) - o = output.astype(np.float64, copy=False) - abs_diff = np.abs(g - o) - idx = int(np.argmax(abs_diff)) - diff = float(abs_diff[idx]) - print( - f"[ERROR] Mismatch (prefix): {golden_path} vs {output_path}, max diff={diff} at idx={idx} " - f"(golden={g[idx]}, out={o[idx]}, dtype={dtype_np}, count={count})" - ) - else: - print(f"[ERROR] Mismatch (prefix): {golden_path} vs {output_path}, empty buffers, dtype={dtype_np}") - return False - return True - - -def compare_packed_pred_mask(golden_path, output_path, logical_elems, src_elem_bytes): - """ - Compare outputs of pto.tcmp / pto.tcmps. - - PTO-ISA stores packed predicate results as a linear PK byte stream via - `psts`, with the exact written prefix length determined by the typed - TCMP/TCMPS repeat schedule. Compare only that semantic prefix. 
- """ - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - try: - logical_elems = int(logical_elems) - src_elem_bytes = int(src_elem_bytes) - except Exception: - print( - "[ERROR] Invalid packed mask compare arguments: " - f"logical_elems={logical_elems} src_elem_bytes={src_elem_bytes}" - ) - return False - if logical_elems <= 0 or src_elem_bytes <= 0: - print( - "[ERROR] Invalid packed mask compare arguments: " - f"logical_elems={logical_elems} src_elem_bytes={src_elem_bytes}" - ) - return False - - golden = np.fromfile(golden_path, dtype=np.uint8) - output = np.fromfile(output_path, dtype=np.uint8) - try: - prefix_bytes = _packed_pred_storage_bytes(logical_elems, src_elem_bytes) - except ValueError as exc: - print(f"[ERROR] {exc}") - return False - - if golden.size < prefix_bytes or output.size < prefix_bytes: - print( - f"[ERROR] Packed mask buffer too small: need={prefix_bytes} bytes, " - f"golden={golden.size}, out={output.size}" - ) - return False - - golden_sel = golden[:prefix_bytes] - output_sel = output[:prefix_bytes] - - if not np.array_equal(golden_sel, output_sel): - diff = np.nonzero(golden_sel != output_sel)[0] - idx = int(diff[0]) if diff.size else 0 - print( - f"[ERROR] Mismatch (packed mask): {golden_path} vs {output_path}, first diff at idx={idx} " - f"(golden={int(golden_sel[idx])}, out={int(output_sel[idx])}, " - f"logical_elems={logical_elems}, src_elem_bytes={src_elem_bytes}, prefix_bytes={prefix_bytes})" - ) - return False - return True - - -def main(): - strict = os.getenv("COMPARE_STRICT", "1") != "0" - ok = True - ok = compare_bin("golden_v3.bin", "v3.bin", np.float32, 0.0001) and ok - if not ok: - if strict: - print("[ERROR] compare failed") - sys.exit(2) - print("[WARN] compare failed (non-gating)") - return - print("[INFO] compare passed") - - -if __name__ == "__main__": - main() 
diff --git a/test/vpto/cases/micro-op/gather-scatter/vgather2-duplicate-index/golden.py b/test/vpto/cases/micro-op/gather-scatter/vgather2-duplicate-index/golden.py deleted file mode 100755 index 4a5c343b6..000000000 --- a/test/vpto/cases/micro-op/gather-scatter/vgather2-duplicate-index/golden.py +++ /dev/null @@ -1,60 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) 2026 Huawei Technologies Co., Ltd. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. - -# case: micro-op/gather-scatter/vgather2-duplicate-index -# family: gather-scatter -# target_ops: pto.vgather2 -# scenarios: core-f32, non-contiguous, explicit-index-pattern, load-effect-validation, no-alias -# NOTE: bulk-generated coverage skeleton. 
-# coding=utf-8 - -import argparse -from pathlib import Path - -import numpy as np - - -ROWS = 32 -COLS = 32 -SEED = 19 - - -def generate(output_dir: Path, seed: int) -> None: - rng = np.random.default_rng(seed) - flat = rng.uniform(-8.0, 8.0, size=(ROWS * COLS,)).astype(np.float32) - pair_ids = ((np.arange((ROWS * COLS) // 2, dtype=np.int32) * 29) + 5) % (ROWS * COLS) - offsets = np.repeat(pair_ids, 2) - gathered = np.zeros((ROWS * COLS,), dtype=np.float32) - for base in range(0, ROWS * COLS, 64): - lanes = np.arange(base + 8, base + 64, dtype=np.int32) - gathered[lanes] = flat[offsets[lanes]] - gathered = gathered.reshape(ROWS, COLS) - v1 = flat.reshape(ROWS, COLS) - v2 = offsets.reshape(ROWS, COLS) - v3 = np.zeros((ROWS, COLS), dtype=np.float32) - - output_dir.mkdir(parents=True, exist_ok=True) - v1.reshape(-1).tofile(output_dir / "v1.bin") - v2.reshape(-1).tofile(output_dir / "v2.bin") - v3.reshape(-1).tofile(output_dir / "v3.bin") - gathered.reshape(-1).tofile(output_dir / "golden_v3.bin") - - -def main() -> None: - parser = argparse.ArgumentParser( - description="Generate numpy-based inputs/golden for VPTO micro-op vgather2 duplicate-index validation." 
- ) - parser.add_argument("--output-dir", type=Path, default=Path(".")) - parser.add_argument("--seed", type=int, default=SEED) - args = parser.parse_args() - generate(args.output_dir, args.seed) - - -if __name__ == "__main__": - main() diff --git a/test/vpto/cases/micro-op/gather-scatter/vgather2-duplicate-index/kernel.pto b/test/vpto/cases/micro-op/gather-scatter/vgather2-duplicate-index/kernel.pto deleted file mode 100644 index 41bb9d841..000000000 --- a/test/vpto/cases/micro-op/gather-scatter/vgather2-duplicate-index/kernel.pto +++ /dev/null @@ -1,59 +0,0 @@ -// ----------------------------------------------------------------------------- -// case: micro-op/gather-scatter/vgather2-duplicate-index -// family: gather-scatter -// target_ops: pto.vgather2 -// scenarios: core-f32, non-contiguous, explicit-index-pattern, load-effect-validation, no-alias -// NOTE: bulk-generated coverage skeleton. Parser/verifier/lowering failure is -// still a valid test conclusion in the current coverage-first phase. 
-// ----------------------------------------------------------------------------- -module attributes {pto.target_arch = "a5", pto.kernel_kind = #pto.kernel_kind} { - func.func @vgather2_duplicate_index_kernel_2d(%arg0: !pto.ptr, %arg1: !pto.ptr, - %arg2: !pto.ptr) attributes {pto.kernel} { - %c0 = arith.constant 0 : index - %c64 = arith.constant 64 : index - %c1024 = arith.constant 1024 : index - %c0_i64 = arith.constant 0 : i64 - %c1_i64 = arith.constant 1 : i64 - %c32_i64 = arith.constant 32 : i64 - %c128_i64 = arith.constant 128 : i64 - %c4096_i64 = arith.constant 4096 : i64 - %c8192_i64 = arith.constant 8192 : i64 - %c8_i32 = arith.constant 8 : i32 - %c1024_i32 = arith.constant 1024 : i32 - - %ub_in = pto.castptr %c0_i64 : i64 -> !pto.ptr - %ub_offsets = pto.castptr %c4096_i64 : i64 -> !pto.ptr - %ub_out = pto.castptr %c8192_i64 : i64 -> !pto.ptr - - %false = arith.constant false - pto.mte_gm_ub %arg0, %ub_in, %c0_i64, %c128_i64 - nburst(%c32_i64, %c128_i64, %c128_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 - pto.mte_gm_ub %arg1, %ub_offsets, %c0_i64, %c128_i64 - nburst(%c32_i64, %c128_i64, %c128_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 - - pto.set_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] - pto.wait_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] - - pto.vecscope { - %_:1 = scf.for %offset = %c0 to %c1024 step %c64 iter_args(%remaining = %c1024_i32) -> (i32) { - %prefix_mask, %next_remaining = pto.plt_b32 %c8_i32 : i32 -> !pto.mask, i32 - %full_mask = pto.pset_b32 "PAT_ALL" : !pto.mask - %suffix_mask = pto.pnot %prefix_mask, %full_mask : !pto.mask, !pto.mask -> !pto.mask - %offsets = pto.vlds %ub_offsets[%offset] : !pto.ptr -> !pto.vreg<64xi32> - %out = pto.vgather2 %ub_in, %offsets, %suffix_mask : !pto.ptr, !pto.vreg<64xi32>, !pto.mask -> !pto.vreg<64xf32> - pto.vsts %out, %ub_out[%offset], %full_mask : !pto.vreg<64xf32>, !pto.ptr, !pto.mask - scf.yield %remaining : i32 - } - } - - pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - 
pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.mte_ub_gm %ub_out, %arg2, %c128_i64 - nburst(%c32_i64, %c128_i64, %c128_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64 - pto.barrier #pto.pipe - return - } -} diff --git a/test/vpto/cases/micro-op/gather-scatter/vgather2-duplicate-index/launch.cpp b/test/vpto/cases/micro-op/gather-scatter/vgather2-duplicate-index/launch.cpp deleted file mode 100644 index 1a2d0359e..000000000 --- a/test/vpto/cases/micro-op/gather-scatter/vgather2-duplicate-index/launch.cpp +++ /dev/null @@ -1,72 +0,0 @@ -// Copyright (c) 2026 Huawei Technologies Co., Ltd. -// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). -// Please refer to the License for details. You may not use this file except in compliance with the License. -// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -// See LICENSE in the root of the software repository for the full text of the License. - -// ----------------------------------------------------------------------------- -// case: micro-op/gather-scatter/vgather2-duplicate-index -// family: gather-scatter -// target_ops: pto.vgather2 -// scenarios: core-f32, non-contiguous, explicit-index-pattern, load-effect-validation, no-alias -// NOTE: bulk-generated coverage skeleton. Parser/verifier/lowering failure is -// still a valid test conclusion in the current coverage-first phase. 
-// ----------------------------------------------------------------------------- -// --------------------------------------------------------------------------- -// PTOAS compatibility layer -// -// The upstream pto-isa headers reference some FP8/FP4 types and the -// __VEC_SCOPE__ marker that are not available on every AICore arch/toolchain -// combination (e.g. __NPU_ARCH__==2201). -// -// For our PTOAS-generated kernels we don't rely on these types today, but the -// headers still mention them in templates/static_asserts. Provide minimal -// fallbacks to keep compilation working on dav-c220. -// --------------------------------------------------------------------------- -#ifndef __VEC_SCOPE__ -#define __VEC_SCOPE__ -#endif - -#if defined(__CCE_AICORE__) && defined(__NPU_ARCH__) && (__NPU_ARCH__ == 2201) -typedef struct { unsigned char v; } hifloat8_t; -typedef struct { unsigned char v; } float8_e4m3_t; -typedef struct { unsigned char v; } float8_e5m2_t; -typedef struct { unsigned char v; } float8_e8m0_t; -typedef struct { unsigned char v; } float4_e1m2x2_t; -typedef struct { unsigned char v; } float4_e2m1x2_t; -#endif -#include - -// AICore printf support is gated behind `--cce-enable-print` on some -// toolchains. When enabled, include the CCE print header so `cce::printf` -// resolves in device compilation. -#if defined(__CCE_AICORE__) && defined(PTOAS_ENABLE_CCE_PRINT) -#include -#endif - -// Some PTO-ISA types are only available in the __CCE_AICORE__ compilation -// path, but `bisheng -xcce` still performs a host-side parse pass. -// Provide minimal fallbacks only when the corresponding header wasn't -// pulled in by the selected arch implementation. 
-#if !defined(__CCE_AICORE__) && !defined(TMRGSORT_HPP) -struct MrgSortExecutedNumList { - uint16_t mrgSortList0; - uint16_t mrgSortList1; - uint16_t mrgSortList2; - uint16_t mrgSortList3; -}; -#endif -#ifndef __CPU_SIM -#include "acl/acl.h" -#endif - -extern "C" __global__ [aicore] void vgather2_duplicate_index_kernel_2d( - __gm__ float *v1, __gm__ int *v2, __gm__ float *v3); - -void LaunchVgather2_duplicate_index_kernel_2d(float *v1, int *v2, float *v3, - void *stream) { - vgather2_duplicate_index_kernel_2d<<<1, nullptr, stream>>>( - (__gm__ float *)v1, (__gm__ int *)v2, (__gm__ float *)v3); -} diff --git a/test/vpto/cases/micro-op/gather-scatter/vgather2-duplicate-index/main.cpp b/test/vpto/cases/micro-op/gather-scatter/vgather2-duplicate-index/main.cpp deleted file mode 100644 index df5907af3..000000000 --- a/test/vpto/cases/micro-op/gather-scatter/vgather2-duplicate-index/main.cpp +++ /dev/null @@ -1,141 +0,0 @@ -// Copyright (c) 2026 Huawei Technologies Co., Ltd. -// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). -// Please refer to the License for details. You may not use this file except in compliance with the License. -// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -// See LICENSE in the root of the software repository for the full text of the License. - -// ----------------------------------------------------------------------------- -// case: micro-op/gather-scatter/vgather2-duplicate-index -// family: gather-scatter -// target_ops: pto.vgather2 -// scenarios: core-f32, non-contiguous, explicit-index-pattern, load-effect-validation, no-alias -// NOTE: bulk-generated coverage skeleton. 
Parser/verifier/lowering failure is -// still a valid test conclusion in the current coverage-first phase. -// ----------------------------------------------------------------------------- -/** -Copyright (c) 2025 Huawei Technologies Co., Ltd. -This program is free software, you can redistribute it and/or modify it under the terms and conditions of -CANN Open Software License Agreement Version 2.0 (the "License"). -Please refer to the License for details. You may not use this file except in compliance with the License. -THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -See LICENSE in the root of the software repository for the full text of the License. -*/ - -#include "test_common.h" -#include "acl/acl.h" -#include -#include -#include - -using namespace PtoTestCommon; - -#ifndef TMRGSORT_HPP -struct MrgSortExecutedNumList { - uint16_t mrgSortList0; - uint16_t mrgSortList1; - uint16_t mrgSortList2; - uint16_t mrgSortList3; -}; -#endif - -#define ACL_CHECK(expr) \ - do { \ - const aclError _ret = (expr); \ - if (_ret != ACL_SUCCESS) { \ - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", #expr, (int)_ret, __FILE__, __LINE__); \ - const char *_recent = aclGetRecentErrMsg(); \ - if (_recent != nullptr && _recent[0] != '\0') { \ - std::fprintf(stderr, "[ERROR] RecentErrMsg: %s\n", _recent); \ - } \ - rc = 1; \ - goto cleanup; \ - } \ - } while (0) - -void LaunchVgather2_duplicate_index_kernel_2d(float *v1, int *v2, float *v3, - void *stream); - -int main() { - size_t elemCount_v1 = 1024; - size_t fileSize_v1 = elemCount_v1 * sizeof(float); - size_t elemCount_v2 = 1024; - size_t fileSize_v2 = elemCount_v2 * sizeof(int); - size_t elemCount_v3 = 1024; - size_t fileSize_v3 = elemCount_v3 * sizeof(float); - float *v1Host = nullptr; - float *v1Device = nullptr; - int *v2Host = nullptr; - int *v2Device = nullptr; - 
float *v3Host = nullptr; - float *v3Device = nullptr; - - int rc = 0; - bool aclInited = false; - bool deviceSet = false; - int deviceId = 0; - aclrtStream stream = nullptr; - - ACL_CHECK(aclInit(nullptr)); - aclInited = true; - if (const char *envDevice = std::getenv("ACL_DEVICE_ID")) { - deviceId = std::atoi(envDevice); - } - ACL_CHECK(aclrtSetDevice(deviceId)); - deviceSet = true; - ACL_CHECK(aclrtCreateStream(&stream)); - - ACL_CHECK(aclrtMallocHost((void **)(&v1Host), fileSize_v1)); - ACL_CHECK(aclrtMallocHost((void **)(&v2Host), fileSize_v2)); - ACL_CHECK(aclrtMallocHost((void **)(&v3Host), fileSize_v3)); - ACL_CHECK(aclrtMalloc((void **)&v1Device, fileSize_v1, ACL_MEM_MALLOC_HUGE_FIRST)); - ACL_CHECK(aclrtMalloc((void **)&v2Device, fileSize_v2, ACL_MEM_MALLOC_HUGE_FIRST)); - ACL_CHECK(aclrtMalloc((void **)&v3Device, fileSize_v3, ACL_MEM_MALLOC_HUGE_FIRST)); - - ReadFile("./v1.bin", fileSize_v1, v1Host, fileSize_v1); - ReadFile("./v2.bin", fileSize_v2, v2Host, fileSize_v2); - ReadFile("./v3.bin", fileSize_v3, v3Host, fileSize_v3); - ACL_CHECK(aclrtMemcpy(v1Device, fileSize_v1, v1Host, fileSize_v1, ACL_MEMCPY_HOST_TO_DEVICE)); - ACL_CHECK(aclrtMemcpy(v2Device, fileSize_v2, v2Host, fileSize_v2, ACL_MEMCPY_HOST_TO_DEVICE)); - ACL_CHECK(aclrtMemcpy(v3Device, fileSize_v3, v3Host, fileSize_v3, ACL_MEMCPY_HOST_TO_DEVICE)); - LaunchVgather2_duplicate_index_kernel_2d(v1Device, v2Device, v3Device, stream); - - ACL_CHECK(aclrtSynchronizeStream(stream)); - ACL_CHECK(aclrtMemcpy(v3Host, fileSize_v3, v3Device, fileSize_v3, ACL_MEMCPY_DEVICE_TO_HOST)); - - WriteFile("./v3.bin", v3Host, fileSize_v3); - -cleanup: - aclrtFree(v1Device); - aclrtFree(v2Device); - aclrtFree(v3Device); - aclrtFreeHost(v1Host); - aclrtFreeHost(v2Host); - aclrtFreeHost(v3Host); - if (stream != nullptr) { - const aclError _ret = aclrtDestroyStream(stream); - if (_ret != ACL_SUCCESS) { - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", - "aclrtDestroyStream(stream)", (int)_ret, __FILE__, 
__LINE__); - } - stream = nullptr; - } - if (deviceSet) { - const aclError _ret = aclrtResetDevice(deviceId); - if (_ret != ACL_SUCCESS) { - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", - "aclrtResetDevice(deviceId)", (int)_ret, __FILE__, __LINE__); - } - } - if (aclInited) { - const aclError _ret = aclFinalize(); - if (_ret != ACL_SUCCESS) { - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", - "aclFinalize()", (int)_ret, __FILE__, __LINE__); - } - } - - return rc; -} diff --git a/test/vpto/cases/micro-op/gather-scatter/vgather2/kernel.pto b/test/vpto/cases/micro-op/gather-scatter/vgather2/kernel.pto index 969b7a666..38f76cd48 100644 --- a/test/vpto/cases/micro-op/gather-scatter/vgather2/kernel.pto +++ b/test/vpto/cases/micro-op/gather-scatter/vgather2/kernel.pto @@ -1,75 +1,410 @@ -// ----------------------------------------------------------------------------- -// case: micro-op/gather-scatter/vgather2 -// family: gather-scatter -// target_ops: pto.vgather2 -// scenarios: core-f32, full-mask, non-contiguous, explicit-index-pattern, load-effect-validation, no-alias -// NOTE: bulk-generated coverage skeleton. Parser/verifier/lowering failure is -// still a valid test conclusion in the current coverage-first phase. -// ----------------------------------------------------------------------------- -// ============================================================================= -// abs_kernel_2d: Element-wise absolute value on a 32x32 f32 tile -// ============================================================================= -// This kernel computes abs(input) for a 32x32 float32 matrix (1024 elements). -// -// Memory Layout: -// - Input: arg0 -> GM (Global Memory) -// - Output: arg1 -> GM (Global Memory) -// - UB (Unified Buffer) at offset 0: input tile (4096 bytes = 32*32*4) -// - UB at offset 4096: output tile (4096 bytes = 32*32*4) -// -// Pipeline: -// 1. DMA: GM -> UB (MTE2 pipe) - copy input tile to UB -// 2. 
Sync: wait for MTE2 -> V pipe handoff -// 3. Compute: vabs on 64-element vectors (V pipe) - 16 iterations for 1024 elements -// 4. Sync: wait for V -> MTE3 pipe handoff -// 5. DMA: UB -> GM (MTE3 pipe) - copy result tile back to GM -// ============================================================================= - module attributes {pto.target_arch = "a5", pto.kernel_kind = #pto.kernel_kind} { - func.func @vgather2_kernel_2d(%arg0: !pto.ptr, %arg1: !pto.ptr, - %arg2: !pto.ptr) attributes {pto.kernel} { - %c0 = arith.constant 0 : index - %c64 = arith.constant 64 : index - %c1024 = arith.constant 1024 : index - %c0_i64 = arith.constant 0 : i64 - %c1_i64 = arith.constant 1 : i64 - %c32_i64 = arith.constant 32 : i64 - %c128_i64 = arith.constant 128 : i64 - %c4096_i64 = arith.constant 4096 : i64 - %c8192_i64 = arith.constant 8192 : i64 - %c1024_i32 = arith.constant 1024 : i32 - - %ub_in = pto.castptr %c0_i64 : i64 -> !pto.ptr - %ub_offsets = pto.castptr %c4096_i64 : i64 -> !pto.ptr - %ub_out = pto.castptr %c8192_i64 : i64 -> !pto.ptr - - %false = arith.constant false - pto.mte_gm_ub %arg0, %ub_in, %c0_i64, %c128_i64 - nburst(%c32_i64, %c128_i64, %c128_i64) + func.func @vgather2_deep_merged_kernel(%arg0: !pto.ptr, %arg1: !pto.ptr, %arg2: !pto.ptr, %arg3: !pto.ptr, %arg4: !pto.ptr, %arg5: !pto.ptr) attributes {pto.kernel} { + %__deep_merge_guard = arith.constant false + // inactive merged from vgather2_duplicate_index_kernel_2d + scf.if %__deep_merge_guard { + + %c0_m0 = arith.constant 0 : index + %c64_m0 = arith.constant 64 : index + %c1024_m0 = arith.constant 1024 : index + %c0_i64_m0 = arith.constant 0 : i64 + %c1_i64_m0 = arith.constant 1 : i64 + %c32_i64_m0 = arith.constant 32 : i64 + %c128_i64_m0 = arith.constant 128 : i64 + %c4096_i64_m0 = arith.constant 4096 : i64 + %c8192_i64_m0 = arith.constant 8192 : i64 + %c8_i32_m0 = arith.constant 8 : i32 + %c1024_i32_m0 = arith.constant 1024 : i32 + + %ub_in_m0 = pto.castptr %c0_i64_m0 : i64 -> !pto.ptr + %ub_offsets_m0 = 
pto.castptr %c4096_i64_m0 : i64 -> !pto.ptr + %ub_out_m0 = pto.castptr %c8192_i64_m0 : i64 -> !pto.ptr + + %false_m0 = arith.constant false + pto.mte_gm_ub %arg0, %ub_in_m0, %c0_i64_m0, %c128_i64_m0 + nburst(%c32_i64_m0, %c128_i64_m0, %c128_i64_m0) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 + pto.mte_gm_ub %arg1, %ub_offsets_m0, %c0_i64_m0, %c128_i64_m0 + nburst(%c32_i64_m0, %c128_i64_m0, %c128_i64_m0) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 + + pto.set_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] + pto.wait_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] + + pto.vecscope { + %__m0:1 = scf.for %offset_m0 = %c0_m0 to %c1024_m0 step %c64_m0 iter_args(%remaining_m0 = %c1024_i32_m0) -> (i32) { + %prefix_mask_m0, %next_remaining_m0 = pto.plt_b32 %c8_i32_m0 : i32 -> !pto.mask, i32 + %full_mask_m0 = pto.pset_b32 "PAT_ALL" : !pto.mask + %suffix_mask_m0 = pto.pnot %prefix_mask_m0, %full_mask_m0 : !pto.mask, !pto.mask -> !pto.mask + %offsets_m0 = pto.vlds %ub_offsets_m0[%offset_m0] : !pto.ptr -> !pto.vreg<64xi32> + %out_m0 = pto.vgather2 %ub_in_m0, %offsets_m0, %suffix_mask_m0 : !pto.ptr, !pto.vreg<64xi32>, !pto.mask -> !pto.vreg<64xf32> + pto.vsts %out_m0, %ub_out_m0[%offset_m0], %full_mask_m0 : !pto.vreg<64xf32>, !pto.ptr, !pto.mask + scf.yield %remaining_m0 : i32 + } + } + + pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] + pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] + pto.mte_ub_gm %ub_out_m0, %arg2, %c128_i64_m0 + nburst(%c32_i64_m0, %c128_i64_m0, %c128_i64_m0) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64 + pto.barrier #pto.pipe + + } + // active merged from vgather2_kernel_2d + + %c0_m1 = arith.constant 0 : index + %c64_m1 = arith.constant 64 : index + %c1024_m1 = arith.constant 1024 : index + %c0_i64_m1 = arith.constant 0 : i64 + %c1_i64_m1 = arith.constant 1 : i64 + %c32_i64_m1 = arith.constant 32 : i64 + %c128_i64_m1 = arith.constant 128 : i64 + %c4096_i64_m1 = arith.constant 4096 : i64 + %c8192_i64_m1 = arith.constant 8192 : i64 + %c1024_i32_m1 = 
arith.constant 1024 : i32 + + %ub_in_m1 = pto.castptr %c0_i64_m1 : i64 -> !pto.ptr + %ub_offsets_m1 = pto.castptr %c4096_i64_m1 : i64 -> !pto.ptr + %ub_out_m1 = pto.castptr %c8192_i64_m1 : i64 -> !pto.ptr + + %false_m1 = arith.constant false + pto.mte_gm_ub %arg3, %ub_in_m1, %c0_i64_m1, %c128_i64_m1 + nburst(%c32_i64_m1, %c128_i64_m1, %c128_i64_m1) : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 - pto.mte_gm_ub %arg1, %ub_offsets, %c0_i64, %c128_i64 - nburst(%c32_i64, %c128_i64, %c128_i64) + pto.mte_gm_ub %arg4, %ub_offsets_m1, %c0_i64_m1, %c128_i64_m1 + nburst(%c32_i64_m1, %c128_i64_m1, %c128_i64_m1) : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 pto.set_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] pto.wait_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] pto.vecscope { - %_:1 = scf.for %offset = %c0 to %c1024 step %c64 iter_args(%remaining = %c1024_i32) -> (i32) { - %mask, %next_remaining = pto.plt_b32 %remaining : i32 -> !pto.mask, i32 - %offsets = pto.vlds %ub_offsets[%offset] : !pto.ptr -> !pto.vreg<64xi32> - %out = pto.vgather2 %ub_in, %offsets, %mask : !pto.ptr, !pto.vreg<64xi32>, !pto.mask -> !pto.vreg<64xf32> - pto.vsts %out, %ub_out[%offset], %mask : !pto.vreg<64xf32>, !pto.ptr, !pto.mask - scf.yield %next_remaining : i32 + %__m1:1 = scf.for %offset_m1 = %c0_m1 to %c1024_m1 step %c64_m1 iter_args(%remaining_m1 = %c1024_i32_m1) -> (i32) { + %mask_m1, %next_remaining_m1 = pto.plt_b32 %remaining_m1 : i32 -> !pto.mask, i32 + %offsets_m1 = pto.vlds %ub_offsets_m1[%offset_m1] : !pto.ptr -> !pto.vreg<64xi32> + %out_m1 = pto.vgather2 %ub_in_m1, %offsets_m1, %mask_m1 : !pto.ptr, !pto.vreg<64xi32>, !pto.mask -> !pto.vreg<64xf32> + pto.vsts %out_m1, %ub_out_m1[%offset_m1], %mask_m1 : !pto.vreg<64xf32>, !pto.ptr, !pto.mask + scf.yield %next_remaining_m1 : i32 } } pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.mte_ub_gm %ub_out, %arg2, %c128_i64 - nburst(%c32_i64, %c128_i64, %c128_i64) + pto.mte_ub_gm %ub_out_m1, %arg5, 
%c128_i64_m1 + nburst(%c32_i64_m1, %c128_i64_m1, %c128_i64_m1) : !pto.ptr, !pto.ptr, i64, i64, i64, i64 pto.barrier #pto.pipe + %__case_merge_guard = arith.constant false + + // inactive cross-case merge from micro-op/gather-scatter/vgather2_bc + scf.if %__case_merge_guard { + + %__deep_merge_guard_cmg6_1 = arith.constant false + // inactive merged from vgather2_bc_sparse_mask_kernel_2d + scf.if %__deep_merge_guard_cmg6_1 { + + %c0_m0_cmg6_1 = arith.constant 0 : index + %c64_m0_cmg6_1 = arith.constant 64 : index + %c1024_m0_cmg6_1 = arith.constant 1024 : index + %c0_i64_m0_cmg6_1 = arith.constant 0 : i64 + %c1_i64_m0_cmg6_1 = arith.constant 1 : i64 + %c32_i64_m0_cmg6_1 = arith.constant 32 : i64 + %c128_i64_m0_cmg6_1 = arith.constant 128 : i64 + %c64_i32_m0_cmg6_1 = arith.constant 64 : i32 + %c4096_i64_m0_cmg6_1 = arith.constant 4096 : i64 + %c8192_i64_m0_cmg6_1 = arith.constant 8192 : i64 + %c1024_i32_m0_cmg6_1 = arith.constant 1024 : i32 + + %ub_in_m0_cmg6_1 = pto.castptr %c0_i64_m0_cmg6_1 : i64 -> !pto.ptr + %ub_offsets_m0_cmg6_1 = pto.castptr %c4096_i64_m0_cmg6_1 : i64 -> !pto.ptr + %ub_out_m0_cmg6_1 = pto.castptr %c8192_i64_m0_cmg6_1 : i64 -> !pto.ptr + + %false_m0_cmg6_1 = arith.constant false + pto.mte_gm_ub %arg0, %ub_in_m0_cmg6_1, %c0_i64_m0_cmg6_1, %c128_i64_m0_cmg6_1 + nburst(%c32_i64_m0_cmg6_1, %c128_i64_m0_cmg6_1, %c128_i64_m0_cmg6_1) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 + pto.mte_gm_ub %arg1, %ub_offsets_m0_cmg6_1, %c0_i64_m0_cmg6_1, %c128_i64_m0_cmg6_1 + nburst(%c32_i64_m0_cmg6_1, %c128_i64_m0_cmg6_1, %c128_i64_m0_cmg6_1) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 + + pto.set_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] + pto.wait_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] + + pto.vecscope { + %__m0_cmg6_1:1 = scf.for %offset_m0_cmg6_1 = %c0_m0_cmg6_1 to %c1024_m0_cmg6_1 step %c64_m0_cmg6_1 iter_args(%remaining_m0_cmg6_1 = %c1024_i32_m0_cmg6_1) -> (i32) { + %full_mask_m0_cmg6_1, %next_remaining_m0_cmg6_1 = pto.plt_b32 %remaining_m0_cmg6_1 : 
i32 -> !pto.mask, i32 + %offsets_m0_cmg6_1 = pto.vlds %ub_offsets_m0_cmg6_1[%offset_m0_cmg6_1] : !pto.ptr -> !pto.vreg<64xi32> + %gather_mask_m0_cmg6_1 = pto.vcmps %offsets_m0_cmg6_1, %c64_i32_m0_cmg6_1, %full_mask_m0_cmg6_1, "lt" : !pto.vreg<64xi32>, i32, !pto.mask -> !pto.mask + %out_m0_cmg6_1 = pto.vgather2_bc %ub_in_m0_cmg6_1, %offsets_m0_cmg6_1, %gather_mask_m0_cmg6_1 : !pto.ptr, !pto.vreg<64xi32>, !pto.mask -> !pto.vreg<64xf32> + pto.vsts %out_m0_cmg6_1, %ub_out_m0_cmg6_1[%offset_m0_cmg6_1], %full_mask_m0_cmg6_1 : !pto.vreg<64xf32>, !pto.ptr, !pto.mask + scf.yield %next_remaining_m0_cmg6_1 : i32 + } + } + + pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] + pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] + pto.mte_ub_gm %ub_out_m0_cmg6_1, %arg2, %c128_i64_m0_cmg6_1 + nburst(%c32_i64_m0_cmg6_1, %c128_i64_m0_cmg6_1, %c128_i64_m0_cmg6_1) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64 + pto.barrier #pto.pipe + + } + // active merged from vgather2_bc_kernel_2d + + %c0_m1_cmg6_1 = arith.constant 0 : index + %c64_m1_cmg6_1 = arith.constant 64 : index + %c1024_m1_cmg6_1 = arith.constant 1024 : index + %c0_i64_m1_cmg6_1 = arith.constant 0 : i64 + %c1_i64_m1_cmg6_1 = arith.constant 1 : i64 + %c32_i64_m1_cmg6_1 = arith.constant 32 : i64 + %c128_i64_m1_cmg6_1 = arith.constant 128 : i64 + %c256_i32_m1_cmg6_1 = arith.constant 256 : i32 + %c4096_i64_m1_cmg6_1 = arith.constant 4096 : i64 + %c8192_i64_m1_cmg6_1 = arith.constant 8192 : i64 + %c1024_i32_m1_cmg6_1 = arith.constant 1024 : i32 + + %ub_in_m1_cmg6_1 = pto.castptr %c0_i64_m1_cmg6_1 : i64 -> !pto.ptr + %ub_offsets_m1_cmg6_1 = pto.castptr %c4096_i64_m1_cmg6_1 : i64 -> !pto.ptr + %ub_out_m1_cmg6_1 = pto.castptr %c8192_i64_m1_cmg6_1 : i64 -> !pto.ptr + + %false_m1_cmg6_1 = arith.constant false + pto.mte_gm_ub %arg3, %ub_in_m1_cmg6_1, %c0_i64_m1_cmg6_1, %c128_i64_m1_cmg6_1 + nburst(%c32_i64_m1_cmg6_1, %c128_i64_m1_cmg6_1, %c128_i64_m1_cmg6_1) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 + pto.mte_gm_ub %arg4, 
%ub_offsets_m1_cmg6_1, %c0_i64_m1_cmg6_1, %c128_i64_m1_cmg6_1 + nburst(%c32_i64_m1_cmg6_1, %c128_i64_m1_cmg6_1, %c128_i64_m1_cmg6_1) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 + + pto.set_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] + pto.wait_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] + + pto.vecscope { + %__m1_cmg6_1:1 = scf.for %offset_m1_cmg6_1 = %c0_m1_cmg6_1 to %c1024_m1_cmg6_1 step %c64_m1_cmg6_1 iter_args(%remaining_m1_cmg6_1 = %c1024_i32_m1_cmg6_1) -> (i32) { + %full_mask_m1_cmg6_1, %next_remaining_m1_cmg6_1 = pto.plt_b32 %remaining_m1_cmg6_1 : i32 -> !pto.mask, i32 + %offsets_m1_cmg6_1 = pto.vlds %ub_offsets_m1_cmg6_1[%offset_m1_cmg6_1] : !pto.ptr -> !pto.vreg<64xi32> + %gather_mask_m1_cmg6_1 = pto.vcmps %offsets_m1_cmg6_1, %c256_i32_m1_cmg6_1, %full_mask_m1_cmg6_1, "lt" : !pto.vreg<64xi32>, i32, !pto.mask -> !pto.mask + %out_m1_cmg6_1 = pto.vgather2_bc %ub_in_m1_cmg6_1, %offsets_m1_cmg6_1, %gather_mask_m1_cmg6_1 : !pto.ptr, !pto.vreg<64xi32>, !pto.mask -> !pto.vreg<64xf32> + pto.vsts %out_m1_cmg6_1, %ub_out_m1_cmg6_1[%offset_m1_cmg6_1], %full_mask_m1_cmg6_1 : !pto.vreg<64xf32>, !pto.ptr, !pto.mask + scf.yield %next_remaining_m1_cmg6_1 : i32 + } + } + + pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] + pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] + pto.mte_ub_gm %ub_out_m1_cmg6_1, %arg5, %c128_i64_m1_cmg6_1 + nburst(%c32_i64_m1_cmg6_1, %c128_i64_m1_cmg6_1, %c128_i64_m1_cmg6_1) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64 + pto.barrier #pto.pipe + } + + // inactive cross-case merge from micro-op/gather-scatter/vgatherb + scf.if %__case_merge_guard { + + %__deep_merge_guard_cmg6_2 = arith.constant false + // inactive merged from vgatherb_block_boundary_kernel_2d + scf.if %__deep_merge_guard_cmg6_2 { + + %c0_m0_cmg6_2 = arith.constant 0 : index + %c64_m0_cmg6_2 = arith.constant 64 : index + %c1024_m0_cmg6_2 = arith.constant 1024 : index + %c8_i32_m0_cmg6_2 = arith.constant 8 : i32 + %c0_i64_m0_cmg6_2 = arith.constant 0 : i64 + %c1_i64_m0_cmg6_2 = 
arith.constant 1 : i64 + %c32_i64_m0_cmg6_2 = arith.constant 32 : i64 + %c128_i64_m0_cmg6_2 = arith.constant 128 : i64 + %c4096_i64_m0_cmg6_2 = arith.constant 4096 : i64 + %c8192_i64_m0_cmg6_2 = arith.constant 8192 : i64 + %c1024_i32_m0_cmg6_2 = arith.constant 1024 : i32 + + %ub_in_m0_cmg6_2 = pto.castptr %c0_i64_m0_cmg6_2 : i64 -> !pto.ptr + %ub_offsets_m0_cmg6_2 = pto.castptr %c4096_i64_m0_cmg6_2 : i64 -> !pto.ptr + %ub_out_m0_cmg6_2 = pto.castptr %c8192_i64_m0_cmg6_2 : i64 -> !pto.ptr + + %false_m0_cmg6_2 = arith.constant false + pto.mte_gm_ub %arg0, %ub_in_m0_cmg6_2, %c0_i64_m0_cmg6_2, %c128_i64_m0_cmg6_2 + nburst(%c32_i64_m0_cmg6_2, %c128_i64_m0_cmg6_2, %c128_i64_m0_cmg6_2) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 + pto.mte_gm_ub %arg1, %ub_offsets_m0_cmg6_2, %c0_i64_m0_cmg6_2, %c128_i64_m0_cmg6_2 + nburst(%c32_i64_m0_cmg6_2, %c128_i64_m0_cmg6_2, %c128_i64_m0_cmg6_2) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 + + pto.set_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] + pto.wait_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] + + pto.vecscope { + %__m0_cmg6_2:1 = scf.for %offset_m0_cmg6_2 = %c0_m0_cmg6_2 to %c1024_m0_cmg6_2 step %c64_m0_cmg6_2 iter_args(%remaining_m0_cmg6_2 = %c1024_i32_m0_cmg6_2) -> (i32) { + %full_mask_m0_cmg6_2, %next_remaining_m0_cmg6_2 = pto.plt_b32 %remaining_m0_cmg6_2 : i32 -> !pto.mask, i32 + %gather_mask_m0_cmg6_2, %_tail_m0_cmg6_2 = pto.plt_b32 %c8_i32_m0_cmg6_2 : i32 -> !pto.mask, i32 + %offsets_m0_cmg6_2 = pto.vlds %ub_offsets_m0_cmg6_2[%offset_m0_cmg6_2] : !pto.ptr -> !pto.vreg<64xi32> + %out_m0_cmg6_2 = pto.vgatherb %ub_in_m0_cmg6_2, %offsets_m0_cmg6_2, %gather_mask_m0_cmg6_2 : !pto.ptr, !pto.vreg<64xi32>, !pto.mask -> !pto.vreg<64xf32> + pto.vsts %out_m0_cmg6_2, %ub_out_m0_cmg6_2[%offset_m0_cmg6_2], %full_mask_m0_cmg6_2 : !pto.vreg<64xf32>, !pto.ptr, !pto.mask + scf.yield %next_remaining_m0_cmg6_2 : i32 + } + } + + pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] + pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] + pto.mte_ub_gm 
%ub_out_m0_cmg6_2, %arg2, %c128_i64_m0_cmg6_2 + nburst(%c32_i64_m0_cmg6_2, %c128_i64_m0_cmg6_2, %c128_i64_m0_cmg6_2) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64 + pto.barrier #pto.pipe + + } + // active merged from vgatherb_kernel_2d + + %c0_m1_cmg6_2 = arith.constant 0 : index + %c64_m1_cmg6_2 = arith.constant 64 : index + %c1024_m1_cmg6_2 = arith.constant 1024 : index + %c8_i32_m1_cmg6_2 = arith.constant 8 : i32 + %c0_i64_m1_cmg6_2 = arith.constant 0 : i64 + %c1_i64_m1_cmg6_2 = arith.constant 1 : i64 + %c32_i64_m1_cmg6_2 = arith.constant 32 : i64 + %c128_i64_m1_cmg6_2 = arith.constant 128 : i64 + %c4096_i64_m1_cmg6_2 = arith.constant 4096 : i64 + %c8192_i64_m1_cmg6_2 = arith.constant 8192 : i64 + %c1024_i32_m1_cmg6_2 = arith.constant 1024 : i32 + + %ub_in_m1_cmg6_2 = pto.castptr %c0_i64_m1_cmg6_2 : i64 -> !pto.ptr + %ub_offsets_m1_cmg6_2 = pto.castptr %c4096_i64_m1_cmg6_2 : i64 -> !pto.ptr + %ub_out_m1_cmg6_2 = pto.castptr %c8192_i64_m1_cmg6_2 : i64 -> !pto.ptr + + %false_m1_cmg6_2 = arith.constant false + pto.mte_gm_ub %arg3, %ub_in_m1_cmg6_2, %c0_i64_m1_cmg6_2, %c128_i64_m1_cmg6_2 + nburst(%c32_i64_m1_cmg6_2, %c128_i64_m1_cmg6_2, %c128_i64_m1_cmg6_2) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 + pto.mte_gm_ub %arg4, %ub_offsets_m1_cmg6_2, %c0_i64_m1_cmg6_2, %c128_i64_m1_cmg6_2 + nburst(%c32_i64_m1_cmg6_2, %c128_i64_m1_cmg6_2, %c128_i64_m1_cmg6_2) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 + + pto.set_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] + pto.wait_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] + + pto.vecscope { + %__m1_cmg6_2:1 = scf.for %offset_m1_cmg6_2 = %c0_m1_cmg6_2 to %c1024_m1_cmg6_2 step %c64_m1_cmg6_2 iter_args(%remaining_m1_cmg6_2 = %c1024_i32_m1_cmg6_2) -> (i32) { + %full_mask_m1_cmg6_2, %next_remaining_m1_cmg6_2 = pto.plt_b32 %remaining_m1_cmg6_2 : i32 -> !pto.mask, i32 + %gather_mask_m1_cmg6_2, %_tail_m1_cmg6_2 = pto.plt_b32 %c8_i32_m1_cmg6_2 : i32 -> !pto.mask, i32 + %offsets_m1_cmg6_2 = pto.vlds %ub_offsets_m1_cmg6_2[%offset_m1_cmg6_2] : 
!pto.ptr -> !pto.vreg<64xi32> + %out_m1_cmg6_2 = pto.vgatherb %ub_in_m1_cmg6_2, %offsets_m1_cmg6_2, %gather_mask_m1_cmg6_2 : !pto.ptr, !pto.vreg<64xi32>, !pto.mask -> !pto.vreg<64xf32> + pto.vsts %out_m1_cmg6_2, %ub_out_m1_cmg6_2[%offset_m1_cmg6_2], %full_mask_m1_cmg6_2 : !pto.vreg<64xf32>, !pto.ptr, !pto.mask + scf.yield %next_remaining_m1_cmg6_2 : i32 + } + } + + pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] + pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] + pto.mte_ub_gm %ub_out_m1_cmg6_2, %arg5, %c128_i64_m1_cmg6_2 + nburst(%c32_i64_m1_cmg6_2, %c128_i64_m1_cmg6_2, %c128_i64_m1_cmg6_2) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64 + pto.barrier #pto.pipe + } + + // inactive cross-case merge from micro-op/gather-scatter/vscatter + scf.if %__case_merge_guard { + + %__deep_merge_guard_cmg6_3 = arith.constant false + // inactive merged from vscatter_out_of_order_index_kernel_2d + scf.if %__deep_merge_guard_cmg6_3 { + + %c0_m0_cmg6_3 = arith.constant 0 : index + %c64_m0_cmg6_3 = arith.constant 64 : index + %c1024_m0_cmg6_3 = arith.constant 1024 : index + %c0_i64_m0_cmg6_3 = arith.constant 0 : i64 + %c1_i64_m0_cmg6_3 = arith.constant 1 : i64 + %c32_i64_m0_cmg6_3 = arith.constant 32 : i64 + %c128_i64_m0_cmg6_3 = arith.constant 128 : i64 + %c4096_i64_m0_cmg6_3 = arith.constant 4096 : i64 + %c8192_i64_m0_cmg6_3 = arith.constant 8192 : i64 + %c8_i32_m0_cmg6_3 = arith.constant 8 : i32 + %c1024_i32_m0_cmg6_3 = arith.constant 1024 : i32 + + %ub_in_m0_cmg6_3 = pto.castptr %c0_i64_m0_cmg6_3 : i64 -> !pto.ptr + %ub_offsets_m0_cmg6_3 = pto.castptr %c4096_i64_m0_cmg6_3 : i64 -> !pto.ptr + %ub_out_m0_cmg6_3 = pto.castptr %c8192_i64_m0_cmg6_3 : i64 -> !pto.ptr + + %false_m0_cmg6_3 = arith.constant false + pto.mte_gm_ub %arg0, %ub_in_m0_cmg6_3, %c0_i64_m0_cmg6_3, %c128_i64_m0_cmg6_3 + nburst(%c32_i64_m0_cmg6_3, %c128_i64_m0_cmg6_3, %c128_i64_m0_cmg6_3) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 + pto.mte_gm_ub %arg1, %ub_offsets_m0_cmg6_3, %c0_i64_m0_cmg6_3, 
%c128_i64_m0_cmg6_3 + nburst(%c32_i64_m0_cmg6_3, %c128_i64_m0_cmg6_3, %c128_i64_m0_cmg6_3) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 + pto.mte_gm_ub %arg2, %ub_out_m0_cmg6_3, %c0_i64_m0_cmg6_3, %c128_i64_m0_cmg6_3 + nburst(%c32_i64_m0_cmg6_3, %c128_i64_m0_cmg6_3, %c128_i64_m0_cmg6_3) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 + + pto.set_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] + pto.wait_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] + + pto.vecscope { + %__m0_cmg6_3:1 = scf.for %offset_m0_cmg6_3 = %c0_m0_cmg6_3 to %c1024_m0_cmg6_3 step %c64_m0_cmg6_3 iter_args(%remaining_m0_cmg6_3 = %c1024_i32_m0_cmg6_3) -> (i32) { + %prefix_mask_m0_cmg6_3, %next_remaining_m0_cmg6_3 = pto.plt_b32 %c8_i32_m0_cmg6_3 : i32 -> !pto.mask, i32 + %full_mask_m0_cmg6_3 = pto.pset_b32 "PAT_ALL" : !pto.mask + %suffix_mask_m0_cmg6_3 = pto.pnot %prefix_mask_m0_cmg6_3, %full_mask_m0_cmg6_3 : !pto.mask, !pto.mask -> !pto.mask + %vec_m0_cmg6_3 = pto.vlds %ub_in_m0_cmg6_3[%offset_m0_cmg6_3] : !pto.ptr -> !pto.vreg<64xf32> + %offsets_m0_cmg6_3 = pto.vlds %ub_offsets_m0_cmg6_3[%offset_m0_cmg6_3] : !pto.ptr -> !pto.vreg<64xi32> + pto.vscatter %vec_m0_cmg6_3, %ub_out_m0_cmg6_3, %offsets_m0_cmg6_3, %suffix_mask_m0_cmg6_3 : !pto.vreg<64xf32>, !pto.ptr, !pto.vreg<64xi32>, !pto.mask + scf.yield %remaining_m0_cmg6_3 : i32 + } + } + + pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] + pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] + pto.mte_ub_gm %ub_out_m0_cmg6_3, %arg2, %c128_i64_m0_cmg6_3 + nburst(%c32_i64_m0_cmg6_3, %c128_i64_m0_cmg6_3, %c128_i64_m0_cmg6_3) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64 + pto.barrier #pto.pipe + + } + // active merged from vscatter_kernel_2d + + %c0_m1_cmg6_3 = arith.constant 0 : index + %c64_m1_cmg6_3 = arith.constant 64 : index + %c1024_m1_cmg6_3 = arith.constant 1024 : index + %c0_i64_m1_cmg6_3 = arith.constant 0 : i64 + %c1_i64_m1_cmg6_3 = arith.constant 1 : i64 + %c32_i64_m1_cmg6_3 = arith.constant 32 : i64 + %c128_i64_m1_cmg6_3 = arith.constant 128 : i64 + 
%c4096_i64_m1_cmg6_3 = arith.constant 4096 : i64 + %c8192_i64_m1_cmg6_3 = arith.constant 8192 : i64 + %c1024_i32_m1_cmg6_3 = arith.constant 1024 : i32 + + %ub_in_m1_cmg6_3 = pto.castptr %c0_i64_m1_cmg6_3 : i64 -> !pto.ptr + %ub_offsets_m1_cmg6_3 = pto.castptr %c4096_i64_m1_cmg6_3 : i64 -> !pto.ptr + %ub_out_m1_cmg6_3 = pto.castptr %c8192_i64_m1_cmg6_3 : i64 -> !pto.ptr + + %false_m1_cmg6_3 = arith.constant false + pto.mte_gm_ub %arg3, %ub_in_m1_cmg6_3, %c0_i64_m1_cmg6_3, %c128_i64_m1_cmg6_3 + nburst(%c32_i64_m1_cmg6_3, %c128_i64_m1_cmg6_3, %c128_i64_m1_cmg6_3) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 + pto.mte_gm_ub %arg4, %ub_offsets_m1_cmg6_3, %c0_i64_m1_cmg6_3, %c128_i64_m1_cmg6_3 + nburst(%c32_i64_m1_cmg6_3, %c128_i64_m1_cmg6_3, %c128_i64_m1_cmg6_3) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 + pto.mte_gm_ub %arg5, %ub_out_m1_cmg6_3, %c0_i64_m1_cmg6_3, %c128_i64_m1_cmg6_3 + nburst(%c32_i64_m1_cmg6_3, %c128_i64_m1_cmg6_3, %c128_i64_m1_cmg6_3) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 + + pto.set_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] + pto.wait_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] + + pto.vecscope { + %__m1_cmg6_3:1 = scf.for %offset_m1_cmg6_3 = %c0_m1_cmg6_3 to %c1024_m1_cmg6_3 step %c64_m1_cmg6_3 iter_args(%remaining_m1_cmg6_3 = %c1024_i32_m1_cmg6_3) -> (i32) { + %mask_m1_cmg6_3, %next_remaining_m1_cmg6_3 = pto.plt_b32 %remaining_m1_cmg6_3 : i32 -> !pto.mask, i32 + %vec_m1_cmg6_3 = pto.vlds %ub_in_m1_cmg6_3[%offset_m1_cmg6_3] : !pto.ptr -> !pto.vreg<64xf32> + %offsets_m1_cmg6_3 = pto.vlds %ub_offsets_m1_cmg6_3[%offset_m1_cmg6_3] : !pto.ptr -> !pto.vreg<64xi32> + pto.vscatter %vec_m1_cmg6_3, %ub_out_m1_cmg6_3, %offsets_m1_cmg6_3, %mask_m1_cmg6_3 : !pto.vreg<64xf32>, !pto.ptr, !pto.vreg<64xi32>, !pto.mask + scf.yield %next_remaining_m1_cmg6_3 : i32 + } + } + + pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] + pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] + pto.mte_ub_gm %ub_out_m1_cmg6_3, %arg5, %c128_i64_m1_cmg6_3 + 
nburst(%c32_i64_m1_cmg6_3, %c128_i64_m1_cmg6_3, %c128_i64_m1_cmg6_3) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64 + pto.barrier #pto.pipe + } + return } } diff --git a/test/vpto/cases/micro-op/gather-scatter/vgather2/launch.cpp b/test/vpto/cases/micro-op/gather-scatter/vgather2/launch.cpp index e99c6741a..cef378fec 100644 --- a/test/vpto/cases/micro-op/gather-scatter/vgather2/launch.cpp +++ b/test/vpto/cases/micro-op/gather-scatter/vgather2/launch.cpp @@ -5,30 +5,9 @@ // THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, // INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. // See LICENSE in the root of the software repository for the full text of the License. - -// ----------------------------------------------------------------------------- -// case: micro-op/gather-scatter/vgather2 -// family: gather-scatter -// target_ops: pto.vgather2 -// scenarios: core-f32, full-mask, non-contiguous, explicit-index-pattern, load-effect-validation, no-alias -// NOTE: bulk-generated coverage skeleton. Parser/verifier/lowering failure is -// still a valid test conclusion in the current coverage-first phase. -// ----------------------------------------------------------------------------- -// --------------------------------------------------------------------------- -// PTOAS compatibility layer -// -// The upstream pto-isa headers reference some FP8/FP4 types and the -// __VEC_SCOPE__ marker that are not available on every AICore arch/toolchain -// combination (e.g. __NPU_ARCH__==2201). -// -// For our PTOAS-generated kernels we don't rely on these types today, but the -// headers still mention them in templates/static_asserts. Provide minimal -// fallbacks to keep compilation working on dav-c220. 
-// --------------------------------------------------------------------------- #ifndef __VEC_SCOPE__ #define __VEC_SCOPE__ #endif - #if defined(__CCE_AICORE__) && defined(__NPU_ARCH__) && (__NPU_ARCH__ == 2201) typedef struct { unsigned char v; } hifloat8_t; typedef struct { unsigned char v; } float8_e4m3_t; @@ -38,36 +17,30 @@ typedef struct { unsigned char v; } float4_e1m2x2_t; typedef struct { unsigned char v; } float4_e2m1x2_t; #endif #include - -// AICore printf support is gated behind `--cce-enable-print` on some -// toolchains. When enabled, include the CCE print header so `cce::printf` -// resolves in device compilation. #if defined(__CCE_AICORE__) && defined(PTOAS_ENABLE_CCE_PRINT) #include #endif - -// Some PTO-ISA types are only available in the __CCE_AICORE__ compilation -// path, but `bisheng -xcce` still performs a host-side parse pass. -// Provide minimal fallbacks only when the corresponding header wasn't -// pulled in by the selected arch implementation. #if !defined(__CCE_AICORE__) && !defined(TMRGSORT_HPP) -struct MrgSortExecutedNumList { - uint16_t mrgSortList0; - uint16_t mrgSortList1; - uint16_t mrgSortList2; - uint16_t mrgSortList3; -}; +struct MrgSortExecutedNumList { uint16_t mrgSortList0,mrgSortList1,mrgSortList2,mrgSortList3; }; #endif #ifndef __CPU_SIM #include "acl/acl.h" #endif -extern "C" __global__ [aicore] void vgather2_kernel_2d(__gm__ float *v1, - __gm__ int *v2, - __gm__ float *v3); +extern "C" __global__ [aicore] void vgather2_deep_merged_kernel( + __gm__ float * arg0, + __gm__ int32_t * arg1, + __gm__ float * arg2, + __gm__ float * arg3, + __gm__ int32_t * arg4, + __gm__ float * arg5); -void LaunchVgather2_kernel_2d(float *v1, int *v2, float *v3, void *stream) { - vgather2_kernel_2d<<<1, nullptr, stream>>>((__gm__ float *)v1, - (__gm__ int *)v2, - (__gm__ float *)v3); +void LaunchVgather2DeepMerged(float * p0, int * p1, float * p2, void *stream) { + vgather2_deep_merged_kernel<<<1, nullptr, stream>>>( + (__gm__ float *)p0, + 
(__gm__ int32_t *)p0, + (__gm__ float *)p0, + (__gm__ float *)p0, + (__gm__ int32_t *)p1, + (__gm__ float *)p2); } diff --git a/test/vpto/cases/micro-op/gather-scatter/vgather2/main.cpp b/test/vpto/cases/micro-op/gather-scatter/vgather2/main.cpp index e2a9b4804..872f07de4 100644 --- a/test/vpto/cases/micro-op/gather-scatter/vgather2/main.cpp +++ b/test/vpto/cases/micro-op/gather-scatter/vgather2/main.cpp @@ -55,8 +55,8 @@ struct MrgSortExecutedNumList { } \ } while (0) -void LaunchVgather2_kernel_2d(float *v1, int *v2, float *v3, void *stream); +void LaunchVgather2DeepMerged(float * p0, int * p1, float * p2, void *stream); int main() { size_t elemCount_v1 = 1024; size_t fileSize_v1 = elemCount_v1 * sizeof(float); @@ -99,7 +99,7 @@ int main() { ACL_CHECK(aclrtMemcpy(v1Device, fileSize_v1, v1Host, fileSize_v1, ACL_MEMCPY_HOST_TO_DEVICE)); ACL_CHECK(aclrtMemcpy(v2Device, fileSize_v2, v2Host, fileSize_v2, ACL_MEMCPY_HOST_TO_DEVICE)); ACL_CHECK(aclrtMemcpy(v3Device, fileSize_v3, v3Host, fileSize_v3, ACL_MEMCPY_HOST_TO_DEVICE)); - LaunchVgather2_kernel_2d(v1Device, v2Device, v3Device, stream); + LaunchVgather2DeepMerged(v1Device, v2Device, v3Device, stream); ACL_CHECK(aclrtSynchronizeStream(stream)); ACL_CHECK(aclrtMemcpy(v3Host, fileSize_v3, v3Device, fileSize_v3, ACL_MEMCPY_DEVICE_TO_HOST)); diff --git a/test/vpto/cases/micro-op/gather-scatter/vgather2_bc-sparse-mask/compare.py b/test/vpto/cases/micro-op/gather-scatter/vgather2_bc-sparse-mask/compare.py deleted file mode 100755 index 83f25f4ab..000000000 --- a/test/vpto/cases/micro-op/gather-scatter/vgather2_bc-sparse-mask/compare.py +++ /dev/null @@ -1,209 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) 2026 Huawei Technologies Co., Ltd. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. 
You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. - -# case: micro-op/gather-scatter/vgather2_bc-sparse-mask -# family: gather-scatter -# target_ops: pto.vgather2_bc -# scenarios: core-f32, masked-gather, load-effect-validation, no-alias -# NOTE: bulk-generated coverage skeleton. -# coding=utf-8 - -import os -import sys -import numpy as np - - -REPEAT_BYTES = 256 - - -def _ceil_div(x, y): - return (x + y - 1) // y - - -def _packed_pred_storage_bytes(logical_elems, src_elem_bytes): - logical_elems = int(logical_elems) - src_elem_bytes = int(src_elem_bytes) - if logical_elems <= 0: - raise ValueError(f"logical_elems must be > 0, got {logical_elems}") - if src_elem_bytes not in (1, 2, 4): - raise ValueError(f"unsupported packed predicate source size: {src_elem_bytes}") - - repeat_elems = REPEAT_BYTES // src_elem_bytes - if src_elem_bytes == 4: - repeat_times = _ceil_div(logical_elems, repeat_elems) + 1 - loop_count = repeat_times // 2 - return loop_count * 16 - - repeat_times = _ceil_div(logical_elems, repeat_elems) - return repeat_times * (repeat_elems // 8) - - -def compare_bin(golden_path, output_path, dtype, eps): - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - dtype_np = np.dtype(dtype) - golden = np.fromfile(golden_path, dtype=dtype_np) - output = np.fromfile(output_path, dtype=dtype_np) - if golden.shape != output.shape: - print(f"[ERROR] Shape mismatch: {golden_path} {golden.shape} vs {output_path} {output.shape}") - return False - if not np.allclose(golden, output, atol=eps, rtol=eps, 
equal_nan=True): - if golden.size: - if np.issubdtype(dtype_np, np.floating): - g = golden.astype(np.float64, copy=False) - o = output.astype(np.float64, copy=False) - elif np.issubdtype(dtype_np, np.integer) or np.issubdtype(dtype_np, np.unsignedinteger): - g = golden.astype(np.int64, copy=False) - o = output.astype(np.int64, copy=False) - else: - g = golden.astype(np.float64, copy=False) - o = output.astype(np.float64, copy=False) - abs_diff = np.abs(g - o) - idx = int(np.argmax(abs_diff)) - diff = float(abs_diff[idx]) - print( - f"[ERROR] Mismatch: {golden_path} vs {output_path}, max diff={diff} at idx={idx} " - f"(golden={g[idx]}, out={o[idx]}, dtype={dtype_np})" - ) - else: - print(f"[ERROR] Mismatch: {golden_path} vs {output_path}, empty buffers, dtype={dtype_np}") - return False - return True - - -def compare_bin_prefix(golden_path, output_path, dtype, eps, count): - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - try: - count = int(count) - except Exception: - print(f"[ERROR] Invalid prefix count: {count}") - return False - if count <= 0: - print(f"[ERROR] Invalid prefix count: {count}") - return False - - dtype_np = np.dtype(dtype) - golden = np.fromfile(golden_path, dtype=dtype_np, count=count) - output = np.fromfile(output_path, dtype=dtype_np, count=count) - - if golden.size != count or output.size != count: - print( - f"[ERROR] Prefix read too small: need={count} elems, " - f"golden={golden.size}, out={output.size}" - ) - return False - - if not np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True): - if golden.size: - if np.issubdtype(dtype_np, np.floating): - g = golden.astype(np.float64, copy=False) - o = output.astype(np.float64, copy=False) - elif np.issubdtype(dtype_np, np.integer) or np.issubdtype(dtype_np, np.unsignedinteger): - g = golden.astype(np.int64, copy=False) - o = 
output.astype(np.int64, copy=False) - else: - g = golden.astype(np.float64, copy=False) - o = output.astype(np.float64, copy=False) - abs_diff = np.abs(g - o) - idx = int(np.argmax(abs_diff)) - diff = float(abs_diff[idx]) - print( - f"[ERROR] Mismatch (prefix): {golden_path} vs {output_path}, max diff={diff} at idx={idx} " - f"(golden={g[idx]}, out={o[idx]}, dtype={dtype_np}, count={count})" - ) - else: - print(f"[ERROR] Mismatch (prefix): {golden_path} vs {output_path}, empty buffers, dtype={dtype_np}") - return False - return True - - -def compare_packed_pred_mask(golden_path, output_path, logical_elems, src_elem_bytes): - """ - Compare outputs of pto.tcmp / pto.tcmps. - - PTO-ISA stores packed predicate results as a linear PK byte stream via - `psts`, with the exact written prefix length determined by the typed - TCMP/TCMPS repeat schedule. Compare only that semantic prefix. - """ - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - try: - logical_elems = int(logical_elems) - src_elem_bytes = int(src_elem_bytes) - except Exception: - print( - "[ERROR] Invalid packed mask compare arguments: " - f"logical_elems={logical_elems} src_elem_bytes={src_elem_bytes}" - ) - return False - if logical_elems <= 0 or src_elem_bytes <= 0: - print( - "[ERROR] Invalid packed mask compare arguments: " - f"logical_elems={logical_elems} src_elem_bytes={src_elem_bytes}" - ) - return False - - golden = np.fromfile(golden_path, dtype=np.uint8) - output = np.fromfile(output_path, dtype=np.uint8) - try: - prefix_bytes = _packed_pred_storage_bytes(logical_elems, src_elem_bytes) - except ValueError as exc: - print(f"[ERROR] {exc}") - return False - - if golden.size < prefix_bytes or output.size < prefix_bytes: - print( - f"[ERROR] Packed mask buffer too small: need={prefix_bytes} bytes, " - f"golden={golden.size}, out={output.size}" - ) 
- return False - - golden_sel = golden[:prefix_bytes] - output_sel = output[:prefix_bytes] - - if not np.array_equal(golden_sel, output_sel): - diff = np.nonzero(golden_sel != output_sel)[0] - idx = int(diff[0]) if diff.size else 0 - print( - f"[ERROR] Mismatch (packed mask): {golden_path} vs {output_path}, first diff at idx={idx} " - f"(golden={int(golden_sel[idx])}, out={int(output_sel[idx])}, " - f"logical_elems={logical_elems}, src_elem_bytes={src_elem_bytes}, prefix_bytes={prefix_bytes})" - ) - return False - return True - - -def main(): - strict = os.getenv("COMPARE_STRICT", "1") != "0" - ok = True - ok = compare_bin("golden_v3.bin", "v3.bin", np.float32, 0.0001) and ok - if not ok: - if strict: - print("[ERROR] compare failed") - sys.exit(2) - print("[WARN] compare failed (non-gating)") - return - print("[INFO] compare passed") - - -if __name__ == "__main__": - main() diff --git a/test/vpto/cases/micro-op/gather-scatter/vgather2_bc-sparse-mask/golden.py b/test/vpto/cases/micro-op/gather-scatter/vgather2_bc-sparse-mask/golden.py deleted file mode 100755 index e0cea7841..000000000 --- a/test/vpto/cases/micro-op/gather-scatter/vgather2_bc-sparse-mask/golden.py +++ /dev/null @@ -1,57 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) 2026 Huawei Technologies Co., Ltd. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. 
- -# case: micro-op/gather-scatter/vgather2_bc-sparse-mask -# family: gather-scatter -# target_ops: pto.vgather2_bc -# scenarios: core-f32, masked-gather, load-effect-validation, no-alias -# NOTE: bulk-generated coverage skeleton. -# coding=utf-8 - -import argparse -from pathlib import Path - -import numpy as np - - -ROWS = 32 -COLS = 32 -SEED = 19 - - -def generate(output_dir: Path, seed: int) -> None: - rng = np.random.default_rng(seed) - flat = rng.uniform(-8.0, 8.0, size=(ROWS * COLS,)).astype(np.float32) - offsets = ((np.arange(ROWS * COLS, dtype=np.int32) * 17) + 3) % (ROWS * COLS) - gathered = np.zeros((ROWS * COLS,), dtype=np.float32) - active = offsets < 64 - gathered[active] = flat[offsets[active]] - v1 = flat.reshape(ROWS, COLS) - v2 = offsets.reshape(ROWS, COLS) - v3 = np.zeros((ROWS, COLS), dtype=np.float32) - - output_dir.mkdir(parents=True, exist_ok=True) - v1.reshape(-1).tofile(output_dir / "v1.bin") - v2.reshape(-1).tofile(output_dir / "v2.bin") - v3.reshape(-1).tofile(output_dir / "v3.bin") - gathered.reshape(-1).tofile(output_dir / "golden_v3.bin") - - -def main() -> None: - parser = argparse.ArgumentParser( - description="Generate numpy-based inputs/golden for VPTO micro-op vgather2_bc sparse-mask validation." 
- ) - parser.add_argument("--output-dir", type=Path, default=Path(".")) - parser.add_argument("--seed", type=int, default=SEED) - args = parser.parse_args() - generate(args.output_dir, args.seed) - - -if __name__ == "__main__": - main() diff --git a/test/vpto/cases/micro-op/gather-scatter/vgather2_bc-sparse-mask/kernel.pto b/test/vpto/cases/micro-op/gather-scatter/vgather2_bc-sparse-mask/kernel.pto deleted file mode 100644 index b37d1de0e..000000000 --- a/test/vpto/cases/micro-op/gather-scatter/vgather2_bc-sparse-mask/kernel.pto +++ /dev/null @@ -1,58 +0,0 @@ -// ----------------------------------------------------------------------------- -// case: micro-op/gather-scatter/vgather2_bc-sparse-mask -// family: gather-scatter -// target_ops: pto.vgather2_bc -// scenarios: core-f32, masked-gather, load-effect-validation, no-alias -// NOTE: bulk-generated coverage skeleton. Parser/verifier/lowering failure is -// still a valid test conclusion in the current coverage-first phase. -// ----------------------------------------------------------------------------- -module attributes {pto.target_arch = "a5", pto.kernel_kind = #pto.kernel_kind} { - func.func @vgather2_bc_sparse_mask_kernel_2d(%arg0: !pto.ptr, %arg1: !pto.ptr, - %arg2: !pto.ptr) attributes {pto.kernel} { - %c0 = arith.constant 0 : index - %c64 = arith.constant 64 : index - %c1024 = arith.constant 1024 : index - %c0_i64 = arith.constant 0 : i64 - %c1_i64 = arith.constant 1 : i64 - %c32_i64 = arith.constant 32 : i64 - %c128_i64 = arith.constant 128 : i64 - %c64_i32 = arith.constant 64 : i32 - %c4096_i64 = arith.constant 4096 : i64 - %c8192_i64 = arith.constant 8192 : i64 - %c1024_i32 = arith.constant 1024 : i32 - - %ub_in = pto.castptr %c0_i64 : i64 -> !pto.ptr - %ub_offsets = pto.castptr %c4096_i64 : i64 -> !pto.ptr - %ub_out = pto.castptr %c8192_i64 : i64 -> !pto.ptr - - %false = arith.constant false - pto.mte_gm_ub %arg0, %ub_in, %c0_i64, %c128_i64 - nburst(%c32_i64, %c128_i64, %c128_i64) - : !pto.ptr, 
!pto.ptr, i64, i64, i64, i64, i64 - pto.mte_gm_ub %arg1, %ub_offsets, %c0_i64, %c128_i64 - nburst(%c32_i64, %c128_i64, %c128_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 - - pto.set_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] - pto.wait_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] - - pto.vecscope { - %_:1 = scf.for %offset = %c0 to %c1024 step %c64 iter_args(%remaining = %c1024_i32) -> (i32) { - %full_mask, %next_remaining = pto.plt_b32 %remaining : i32 -> !pto.mask, i32 - %offsets = pto.vlds %ub_offsets[%offset] : !pto.ptr -> !pto.vreg<64xi32> - %gather_mask = pto.vcmps %offsets, %c64_i32, %full_mask, "lt" : !pto.vreg<64xi32>, i32, !pto.mask -> !pto.mask - %out = pto.vgather2_bc %ub_in, %offsets, %gather_mask : !pto.ptr, !pto.vreg<64xi32>, !pto.mask -> !pto.vreg<64xf32> - pto.vsts %out, %ub_out[%offset], %full_mask : !pto.vreg<64xf32>, !pto.ptr, !pto.mask - scf.yield %next_remaining : i32 - } - } - - pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.mte_ub_gm %ub_out, %arg2, %c128_i64 - nburst(%c32_i64, %c128_i64, %c128_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64 - pto.barrier #pto.pipe - return - } -} diff --git a/test/vpto/cases/micro-op/gather-scatter/vgather2_bc-sparse-mask/launch.cpp b/test/vpto/cases/micro-op/gather-scatter/vgather2_bc-sparse-mask/launch.cpp deleted file mode 100644 index 333288162..000000000 --- a/test/vpto/cases/micro-op/gather-scatter/vgather2_bc-sparse-mask/launch.cpp +++ /dev/null @@ -1,72 +0,0 @@ -// Copyright (c) 2026 Huawei Technologies Co., Ltd. -// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). -// Please refer to the License for details. You may not use this file except in compliance with the License. 
-// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -// See LICENSE in the root of the software repository for the full text of the License. - -// ----------------------------------------------------------------------------- -// case: micro-op/gather-scatter/vgather2_bc-sparse-mask -// family: gather-scatter -// target_ops: pto.vgather2_bc -// scenarios: core-f32, masked-gather, load-effect-validation, no-alias -// NOTE: bulk-generated coverage skeleton. Parser/verifier/lowering failure is -// still a valid test conclusion in the current coverage-first phase. -// ----------------------------------------------------------------------------- -// --------------------------------------------------------------------------- -// PTOAS compatibility layer -// -// The upstream pto-isa headers reference some FP8/FP4 types and the -// __VEC_SCOPE__ marker that are not available on every AICore arch/toolchain -// combination (e.g. __NPU_ARCH__==2201). -// -// For our PTOAS-generated kernels we don't rely on these types today, but the -// headers still mention them in templates/static_asserts. Provide minimal -// fallbacks to keep compilation working on dav-c220. -// --------------------------------------------------------------------------- -#ifndef __VEC_SCOPE__ -#define __VEC_SCOPE__ -#endif - -#if defined(__CCE_AICORE__) && defined(__NPU_ARCH__) && (__NPU_ARCH__ == 2201) -typedef struct { unsigned char v; } hifloat8_t; -typedef struct { unsigned char v; } float8_e4m3_t; -typedef struct { unsigned char v; } float8_e5m2_t; -typedef struct { unsigned char v; } float8_e8m0_t; -typedef struct { unsigned char v; } float4_e1m2x2_t; -typedef struct { unsigned char v; } float4_e2m1x2_t; -#endif -#include - -// AICore printf support is gated behind `--cce-enable-print` on some -// toolchains. 
When enabled, include the CCE print header so `cce::printf` -// resolves in device compilation. -#if defined(__CCE_AICORE__) && defined(PTOAS_ENABLE_CCE_PRINT) -#include -#endif - -// Some PTO-ISA types are only available in the __CCE_AICORE__ compilation -// path, but `bisheng -xcce` still performs a host-side parse pass. -// Provide minimal fallbacks only when the corresponding header wasn't -// pulled in by the selected arch implementation. -#if !defined(__CCE_AICORE__) && !defined(TMRGSORT_HPP) -struct MrgSortExecutedNumList { - uint16_t mrgSortList0; - uint16_t mrgSortList1; - uint16_t mrgSortList2; - uint16_t mrgSortList3; -}; -#endif -#ifndef __CPU_SIM -#include "acl/acl.h" -#endif - -extern "C" __global__ [aicore] void vgather2_bc_sparse_mask_kernel_2d( - __gm__ float *v1, __gm__ int *v2, __gm__ float *v3); - -void LaunchVgather2_bc_sparse_mask_kernel_2d(float *v1, int *v2, float *v3, - void *stream) { - vgather2_bc_sparse_mask_kernel_2d<<<1, nullptr, stream>>>( - (__gm__ float *)v1, (__gm__ int *)v2, (__gm__ float *)v3); -} diff --git a/test/vpto/cases/micro-op/gather-scatter/vgather2_bc-sparse-mask/main.cpp b/test/vpto/cases/micro-op/gather-scatter/vgather2_bc-sparse-mask/main.cpp deleted file mode 100644 index 66ab70307..000000000 --- a/test/vpto/cases/micro-op/gather-scatter/vgather2_bc-sparse-mask/main.cpp +++ /dev/null @@ -1,141 +0,0 @@ -// Copyright (c) 2026 Huawei Technologies Co., Ltd. -// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). -// Please refer to the License for details. You may not use this file except in compliance with the License. -// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. 
-// See LICENSE in the root of the software repository for the full text of the License. - -// ----------------------------------------------------------------------------- -// case: micro-op/gather-scatter/vgather2_bc-sparse-mask -// family: gather-scatter -// target_ops: pto.vgather2_bc -// scenarios: core-f32, masked-gather, load-effect-validation, no-alias -// NOTE: bulk-generated coverage skeleton. Parser/verifier/lowering failure is -// still a valid test conclusion in the current coverage-first phase. -// ----------------------------------------------------------------------------- -/** -Copyright (c) 2025 Huawei Technologies Co., Ltd. -This program is free software, you can redistribute it and/or modify it under the terms and conditions of -CANN Open Software License Agreement Version 2.0 (the "License"). -Please refer to the License for details. You may not use this file except in compliance with the License. -THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -See LICENSE in the root of the software repository for the full text of the License. 
-*/ - -#include "test_common.h" -#include "acl/acl.h" -#include -#include -#include - -using namespace PtoTestCommon; - -#ifndef TMRGSORT_HPP -struct MrgSortExecutedNumList { - uint16_t mrgSortList0; - uint16_t mrgSortList1; - uint16_t mrgSortList2; - uint16_t mrgSortList3; -}; -#endif - -#define ACL_CHECK(expr) \ - do { \ - const aclError _ret = (expr); \ - if (_ret != ACL_SUCCESS) { \ - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", #expr, (int)_ret, __FILE__, __LINE__); \ - const char *_recent = aclGetRecentErrMsg(); \ - if (_recent != nullptr && _recent[0] != '\0') { \ - std::fprintf(stderr, "[ERROR] RecentErrMsg: %s\n", _recent); \ - } \ - rc = 1; \ - goto cleanup; \ - } \ - } while (0) - -void LaunchVgather2_bc_sparse_mask_kernel_2d(float *v1, int *v2, float *v3, - void *stream); - -int main() { - size_t elemCount_v1 = 1024; - size_t fileSize_v1 = elemCount_v1 * sizeof(float); - size_t elemCount_v2 = 1024; - size_t fileSize_v2 = elemCount_v2 * sizeof(int); - size_t elemCount_v3 = 1024; - size_t fileSize_v3 = elemCount_v3 * sizeof(float); - float *v1Host = nullptr; - float *v1Device = nullptr; - int *v2Host = nullptr; - int *v2Device = nullptr; - float *v3Host = nullptr; - float *v3Device = nullptr; - - int rc = 0; - bool aclInited = false; - bool deviceSet = false; - int deviceId = 0; - aclrtStream stream = nullptr; - - ACL_CHECK(aclInit(nullptr)); - aclInited = true; - if (const char *envDevice = std::getenv("ACL_DEVICE_ID")) { - deviceId = std::atoi(envDevice); - } - ACL_CHECK(aclrtSetDevice(deviceId)); - deviceSet = true; - ACL_CHECK(aclrtCreateStream(&stream)); - - ACL_CHECK(aclrtMallocHost((void **)(&v1Host), fileSize_v1)); - ACL_CHECK(aclrtMallocHost((void **)(&v2Host), fileSize_v2)); - ACL_CHECK(aclrtMallocHost((void **)(&v3Host), fileSize_v3)); - ACL_CHECK(aclrtMalloc((void **)&v1Device, fileSize_v1, ACL_MEM_MALLOC_HUGE_FIRST)); - ACL_CHECK(aclrtMalloc((void **)&v2Device, fileSize_v2, ACL_MEM_MALLOC_HUGE_FIRST)); - ACL_CHECK(aclrtMalloc((void 
**)&v3Device, fileSize_v3, ACL_MEM_MALLOC_HUGE_FIRST)); - - ReadFile("./v1.bin", fileSize_v1, v1Host, fileSize_v1); - ReadFile("./v2.bin", fileSize_v2, v2Host, fileSize_v2); - ReadFile("./v3.bin", fileSize_v3, v3Host, fileSize_v3); - ACL_CHECK(aclrtMemcpy(v1Device, fileSize_v1, v1Host, fileSize_v1, ACL_MEMCPY_HOST_TO_DEVICE)); - ACL_CHECK(aclrtMemcpy(v2Device, fileSize_v2, v2Host, fileSize_v2, ACL_MEMCPY_HOST_TO_DEVICE)); - ACL_CHECK(aclrtMemcpy(v3Device, fileSize_v3, v3Host, fileSize_v3, ACL_MEMCPY_HOST_TO_DEVICE)); - LaunchVgather2_bc_sparse_mask_kernel_2d(v1Device, v2Device, v3Device, stream); - - ACL_CHECK(aclrtSynchronizeStream(stream)); - ACL_CHECK(aclrtMemcpy(v3Host, fileSize_v3, v3Device, fileSize_v3, ACL_MEMCPY_DEVICE_TO_HOST)); - - WriteFile("./v3.bin", v3Host, fileSize_v3); - -cleanup: - aclrtFree(v1Device); - aclrtFree(v2Device); - aclrtFree(v3Device); - aclrtFreeHost(v1Host); - aclrtFreeHost(v2Host); - aclrtFreeHost(v3Host); - if (stream != nullptr) { - const aclError _ret = aclrtDestroyStream(stream); - if (_ret != ACL_SUCCESS) { - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", - "aclrtDestroyStream(stream)", (int)_ret, __FILE__, __LINE__); - } - stream = nullptr; - } - if (deviceSet) { - const aclError _ret = aclrtResetDevice(deviceId); - if (_ret != ACL_SUCCESS) { - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", - "aclrtResetDevice(deviceId)", (int)_ret, __FILE__, __LINE__); - } - } - if (aclInited) { - const aclError _ret = aclFinalize(); - if (_ret != ACL_SUCCESS) { - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", - "aclFinalize()", (int)_ret, __FILE__, __LINE__); - } - } - - return rc; -} diff --git a/test/vpto/cases/micro-op/gather-scatter/vgather2_bc/compare.py b/test/vpto/cases/micro-op/gather-scatter/vgather2_bc/compare.py deleted file mode 100755 index 4ebeae5d2..000000000 --- a/test/vpto/cases/micro-op/gather-scatter/vgather2_bc/compare.py +++ /dev/null @@ -1,209 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) 
2026 Huawei Technologies Co., Ltd. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. - -# case: micro-op/gather-scatter/vgather2_bc -# family: gather-scatter -# target_ops: pto.vgather2_bc -# scenarios: core-f32, full-mask, non-contiguous, masked-gather, load-effect-validation, no-alias -# NOTE: bulk-generated coverage skeleton. -# coding=utf-8 - -import os -import sys -import numpy as np - - -REPEAT_BYTES = 256 - - -def _ceil_div(x, y): - return (x + y - 1) // y - - -def _packed_pred_storage_bytes(logical_elems, src_elem_bytes): - logical_elems = int(logical_elems) - src_elem_bytes = int(src_elem_bytes) - if logical_elems <= 0: - raise ValueError(f"logical_elems must be > 0, got {logical_elems}") - if src_elem_bytes not in (1, 2, 4): - raise ValueError(f"unsupported packed predicate source size: {src_elem_bytes}") - - repeat_elems = REPEAT_BYTES // src_elem_bytes - if src_elem_bytes == 4: - repeat_times = _ceil_div(logical_elems, repeat_elems) + 1 - loop_count = repeat_times // 2 - return loop_count * 16 - - repeat_times = _ceil_div(logical_elems, repeat_elems) - return repeat_times * (repeat_elems // 8) - - -def compare_bin(golden_path, output_path, dtype, eps): - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - dtype_np = np.dtype(dtype) - golden = np.fromfile(golden_path, 
dtype=dtype_np) - output = np.fromfile(output_path, dtype=dtype_np) - if golden.shape != output.shape: - print(f"[ERROR] Shape mismatch: {golden_path} {golden.shape} vs {output_path} {output.shape}") - return False - if not np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True): - if golden.size: - if np.issubdtype(dtype_np, np.floating): - g = golden.astype(np.float64, copy=False) - o = output.astype(np.float64, copy=False) - elif np.issubdtype(dtype_np, np.integer) or np.issubdtype(dtype_np, np.unsignedinteger): - g = golden.astype(np.int64, copy=False) - o = output.astype(np.int64, copy=False) - else: - g = golden.astype(np.float64, copy=False) - o = output.astype(np.float64, copy=False) - abs_diff = np.abs(g - o) - idx = int(np.argmax(abs_diff)) - diff = float(abs_diff[idx]) - print( - f"[ERROR] Mismatch: {golden_path} vs {output_path}, max diff={diff} at idx={idx} " - f"(golden={g[idx]}, out={o[idx]}, dtype={dtype_np})" - ) - else: - print(f"[ERROR] Mismatch: {golden_path} vs {output_path}, empty buffers, dtype={dtype_np}") - return False - return True - - -def compare_bin_prefix(golden_path, output_path, dtype, eps, count): - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - try: - count = int(count) - except Exception: - print(f"[ERROR] Invalid prefix count: {count}") - return False - if count <= 0: - print(f"[ERROR] Invalid prefix count: {count}") - return False - - dtype_np = np.dtype(dtype) - golden = np.fromfile(golden_path, dtype=dtype_np, count=count) - output = np.fromfile(output_path, dtype=dtype_np, count=count) - - if golden.size != count or output.size != count: - print( - f"[ERROR] Prefix read too small: need={count} elems, " - f"golden={golden.size}, out={output.size}" - ) - return False - - if not np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True): - if golden.size: - 
if np.issubdtype(dtype_np, np.floating): - g = golden.astype(np.float64, copy=False) - o = output.astype(np.float64, copy=False) - elif np.issubdtype(dtype_np, np.integer) or np.issubdtype(dtype_np, np.unsignedinteger): - g = golden.astype(np.int64, copy=False) - o = output.astype(np.int64, copy=False) - else: - g = golden.astype(np.float64, copy=False) - o = output.astype(np.float64, copy=False) - abs_diff = np.abs(g - o) - idx = int(np.argmax(abs_diff)) - diff = float(abs_diff[idx]) - print( - f"[ERROR] Mismatch (prefix): {golden_path} vs {output_path}, max diff={diff} at idx={idx} " - f"(golden={g[idx]}, out={o[idx]}, dtype={dtype_np}, count={count})" - ) - else: - print(f"[ERROR] Mismatch (prefix): {golden_path} vs {output_path}, empty buffers, dtype={dtype_np}") - return False - return True - - -def compare_packed_pred_mask(golden_path, output_path, logical_elems, src_elem_bytes): - """ - Compare outputs of pto.tcmp / pto.tcmps. - - PTO-ISA stores packed predicate results as a linear PK byte stream via - `psts`, with the exact written prefix length determined by the typed - TCMP/TCMPS repeat schedule. Compare only that semantic prefix. 
- """ - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - try: - logical_elems = int(logical_elems) - src_elem_bytes = int(src_elem_bytes) - except Exception: - print( - "[ERROR] Invalid packed mask compare arguments: " - f"logical_elems={logical_elems} src_elem_bytes={src_elem_bytes}" - ) - return False - if logical_elems <= 0 or src_elem_bytes <= 0: - print( - "[ERROR] Invalid packed mask compare arguments: " - f"logical_elems={logical_elems} src_elem_bytes={src_elem_bytes}" - ) - return False - - golden = np.fromfile(golden_path, dtype=np.uint8) - output = np.fromfile(output_path, dtype=np.uint8) - try: - prefix_bytes = _packed_pred_storage_bytes(logical_elems, src_elem_bytes) - except ValueError as exc: - print(f"[ERROR] {exc}") - return False - - if golden.size < prefix_bytes or output.size < prefix_bytes: - print( - f"[ERROR] Packed mask buffer too small: need={prefix_bytes} bytes, " - f"golden={golden.size}, out={output.size}" - ) - return False - - golden_sel = golden[:prefix_bytes] - output_sel = output[:prefix_bytes] - - if not np.array_equal(golden_sel, output_sel): - diff = np.nonzero(golden_sel != output_sel)[0] - idx = int(diff[0]) if diff.size else 0 - print( - f"[ERROR] Mismatch (packed mask): {golden_path} vs {output_path}, first diff at idx={idx} " - f"(golden={int(golden_sel[idx])}, out={int(output_sel[idx])}, " - f"logical_elems={logical_elems}, src_elem_bytes={src_elem_bytes}, prefix_bytes={prefix_bytes})" - ) - return False - return True - - -def main(): - strict = os.getenv("COMPARE_STRICT", "1") != "0" - ok = True - ok = compare_bin("golden_v3.bin", "v3.bin", np.float32, 0.0001) and ok - if not ok: - if strict: - print("[ERROR] compare failed") - sys.exit(2) - print("[WARN] compare failed (non-gating)") - return - print("[INFO] compare passed") - - -if __name__ == "__main__": - main() 
diff --git a/test/vpto/cases/micro-op/gather-scatter/vgather2_bc/golden.py b/test/vpto/cases/micro-op/gather-scatter/vgather2_bc/golden.py deleted file mode 100755 index da03fb5a7..000000000 --- a/test/vpto/cases/micro-op/gather-scatter/vgather2_bc/golden.py +++ /dev/null @@ -1,55 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) 2026 Huawei Technologies Co., Ltd. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. - -# case: micro-op/gather-scatter/vgather2_bc -# family: gather-scatter -# target_ops: pto.vgather2_bc -# scenarios: core-f32, full-mask, non-contiguous, masked-gather, load-effect-validation, no-alias -# NOTE: bulk-generated coverage skeleton. 
-# coding=utf-8 - -import argparse -from pathlib import Path - -import numpy as np - - -ROWS = 32 -COLS = 32 -SEED = 19 -def generate(output_dir: Path, seed: int) -> None: - rng = np.random.default_rng(seed) - flat = rng.uniform(-8.0, 8.0, size=(ROWS * COLS,)).astype(np.float32) - offsets = ((np.arange(ROWS * COLS, dtype=np.int32) * 17) + 3) % (ROWS * COLS) - gathered = np.zeros((ROWS * COLS,), dtype=np.float32) - active = offsets < 256 - gathered[active] = flat[offsets[active]] - v1 = flat.reshape(ROWS, COLS) - v2 = offsets.reshape(ROWS, COLS) - v3 = np.zeros((ROWS, COLS), dtype=np.float32) - - output_dir.mkdir(parents=True, exist_ok=True) - v1.reshape(-1).tofile(output_dir / "v1.bin") - v2.reshape(-1).tofile(output_dir / "v2.bin") - v3.reshape(-1).tofile(output_dir / "v3.bin") - gathered.reshape(-1).tofile(output_dir / "golden_v3.bin") - - -def main() -> None: - parser = argparse.ArgumentParser( - description="Generate numpy-based inputs/golden for VPTO micro-op vgather2_bc validation." - ) - parser.add_argument("--output-dir", type=Path, default=Path(".")) - parser.add_argument("--seed", type=int, default=SEED) - args = parser.parse_args() - generate(args.output_dir, args.seed) - - -if __name__ == "__main__": - main() diff --git a/test/vpto/cases/micro-op/gather-scatter/vgather2_bc/kernel.pto b/test/vpto/cases/micro-op/gather-scatter/vgather2_bc/kernel.pto deleted file mode 100644 index 826bba01e..000000000 --- a/test/vpto/cases/micro-op/gather-scatter/vgather2_bc/kernel.pto +++ /dev/null @@ -1,58 +0,0 @@ -// ----------------------------------------------------------------------------- -// case: micro-op/gather-scatter/vgather2_bc -// family: gather-scatter -// target_ops: pto.vgather2_bc -// scenarios: core-f32, full-mask, non-contiguous, masked-gather, load-effect-validation, no-alias -// NOTE: bulk-generated coverage skeleton. Parser/verifier/lowering failure is -// still a valid test conclusion in the current coverage-first phase. 
-// ----------------------------------------------------------------------------- -module attributes {pto.target_arch = "a5", pto.kernel_kind = #pto.kernel_kind} { - func.func @vgather2_bc_kernel_2d(%arg0: !pto.ptr, %arg1: !pto.ptr, - %arg2: !pto.ptr) attributes {pto.kernel} { - %c0 = arith.constant 0 : index - %c64 = arith.constant 64 : index - %c1024 = arith.constant 1024 : index - %c0_i64 = arith.constant 0 : i64 - %c1_i64 = arith.constant 1 : i64 - %c32_i64 = arith.constant 32 : i64 - %c128_i64 = arith.constant 128 : i64 - %c256_i32 = arith.constant 256 : i32 - %c4096_i64 = arith.constant 4096 : i64 - %c8192_i64 = arith.constant 8192 : i64 - %c1024_i32 = arith.constant 1024 : i32 - - %ub_in = pto.castptr %c0_i64 : i64 -> !pto.ptr - %ub_offsets = pto.castptr %c4096_i64 : i64 -> !pto.ptr - %ub_out = pto.castptr %c8192_i64 : i64 -> !pto.ptr - - %false = arith.constant false - pto.mte_gm_ub %arg0, %ub_in, %c0_i64, %c128_i64 - nburst(%c32_i64, %c128_i64, %c128_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 - pto.mte_gm_ub %arg1, %ub_offsets, %c0_i64, %c128_i64 - nburst(%c32_i64, %c128_i64, %c128_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 - - pto.set_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] - pto.wait_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] - - pto.vecscope { - %_:1 = scf.for %offset = %c0 to %c1024 step %c64 iter_args(%remaining = %c1024_i32) -> (i32) { - %full_mask, %next_remaining = pto.plt_b32 %remaining : i32 -> !pto.mask, i32 - %offsets = pto.vlds %ub_offsets[%offset] : !pto.ptr -> !pto.vreg<64xi32> - %gather_mask = pto.vcmps %offsets, %c256_i32, %full_mask, "lt" : !pto.vreg<64xi32>, i32, !pto.mask -> !pto.mask - %out = pto.vgather2_bc %ub_in, %offsets, %gather_mask : !pto.ptr, !pto.vreg<64xi32>, !pto.mask -> !pto.vreg<64xf32> - pto.vsts %out, %ub_out[%offset], %full_mask : !pto.vreg<64xf32>, !pto.ptr, !pto.mask - scf.yield %next_remaining : i32 - } - } - - pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.wait_flag["PIPE_V", "PIPE_MTE3", 
"EVENT_ID0"] - pto.mte_ub_gm %ub_out, %arg2, %c128_i64 - nburst(%c32_i64, %c128_i64, %c128_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64 - pto.barrier #pto.pipe - return - } -} diff --git a/test/vpto/cases/micro-op/gather-scatter/vgather2_bc/launch.cpp b/test/vpto/cases/micro-op/gather-scatter/vgather2_bc/launch.cpp deleted file mode 100644 index 2c60a591c..000000000 --- a/test/vpto/cases/micro-op/gather-scatter/vgather2_bc/launch.cpp +++ /dev/null @@ -1,73 +0,0 @@ -// Copyright (c) 2026 Huawei Technologies Co., Ltd. -// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). -// Please refer to the License for details. You may not use this file except in compliance with the License. -// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -// See LICENSE in the root of the software repository for the full text of the License. - -// ----------------------------------------------------------------------------- -// case: micro-op/gather-scatter/vgather2_bc -// family: gather-scatter -// target_ops: pto.vgather2_bc -// scenarios: core-f32, full-mask, non-contiguous, masked-gather, load-effect-validation, no-alias -// NOTE: bulk-generated coverage skeleton. Parser/verifier/lowering failure is -// still a valid test conclusion in the current coverage-first phase. -// ----------------------------------------------------------------------------- -// --------------------------------------------------------------------------- -// PTOAS compatibility layer -// -// The upstream pto-isa headers reference some FP8/FP4 types and the -// __VEC_SCOPE__ marker that are not available on every AICore arch/toolchain -// combination (e.g. __NPU_ARCH__==2201). 
-// -// For our PTOAS-generated kernels we don't rely on these types today, but the -// headers still mention them in templates/static_asserts. Provide minimal -// fallbacks to keep compilation working on dav-c220. -// --------------------------------------------------------------------------- -#ifndef __VEC_SCOPE__ -#define __VEC_SCOPE__ -#endif - -#if defined(__CCE_AICORE__) && defined(__NPU_ARCH__) && (__NPU_ARCH__ == 2201) -typedef struct { unsigned char v; } hifloat8_t; -typedef struct { unsigned char v; } float8_e4m3_t; -typedef struct { unsigned char v; } float8_e5m2_t; -typedef struct { unsigned char v; } float8_e8m0_t; -typedef struct { unsigned char v; } float4_e1m2x2_t; -typedef struct { unsigned char v; } float4_e2m1x2_t; -#endif -#include - -// AICore printf support is gated behind `--cce-enable-print` on some -// toolchains. When enabled, include the CCE print header so `cce::printf` -// resolves in device compilation. -#if defined(__CCE_AICORE__) && defined(PTOAS_ENABLE_CCE_PRINT) -#include -#endif - -// Some PTO-ISA types are only available in the __CCE_AICORE__ compilation -// path, but `bisheng -xcce` still performs a host-side parse pass. -// Provide minimal fallbacks only when the corresponding header wasn't -// pulled in by the selected arch implementation. 
-#if !defined(__CCE_AICORE__) && !defined(TMRGSORT_HPP) -struct MrgSortExecutedNumList { - uint16_t mrgSortList0; - uint16_t mrgSortList1; - uint16_t mrgSortList2; - uint16_t mrgSortList3; -}; -#endif -#ifndef __CPU_SIM -#include "acl/acl.h" -#endif - -extern "C" __global__ [aicore] void vgather2_bc_kernel_2d(__gm__ float *v1, - __gm__ int *v2, - __gm__ float *v3); - -void LaunchVgather2_bc_kernel_2d(float *v1, int *v2, float *v3, void *stream) { - vgather2_bc_kernel_2d<<<1, nullptr, stream>>>((__gm__ float *)v1, - (__gm__ int *)v2, - (__gm__ float *)v3); -} diff --git a/test/vpto/cases/micro-op/gather-scatter/vgather2_bc/main.cpp b/test/vpto/cases/micro-op/gather-scatter/vgather2_bc/main.cpp deleted file mode 100644 index 73ba0e412..000000000 --- a/test/vpto/cases/micro-op/gather-scatter/vgather2_bc/main.cpp +++ /dev/null @@ -1,140 +0,0 @@ -// Copyright (c) 2026 Huawei Technologies Co., Ltd. -// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). -// Please refer to the License for details. You may not use this file except in compliance with the License. -// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -// See LICENSE in the root of the software repository for the full text of the License. - -// ----------------------------------------------------------------------------- -// case: micro-op/gather-scatter/vgather2_bc -// family: gather-scatter -// target_ops: pto.vgather2_bc -// scenarios: core-f32, full-mask, non-contiguous, masked-gather, load-effect-validation, no-alias -// NOTE: bulk-generated coverage skeleton. Parser/verifier/lowering failure is -// still a valid test conclusion in the current coverage-first phase. 
-// ----------------------------------------------------------------------------- -/** -Copyright (c) 2025 Huawei Technologies Co., Ltd. -This program is free software, you can redistribute it and/or modify it under the terms and conditions of -CANN Open Software License Agreement Version 2.0 (the "License"). -Please refer to the License for details. You may not use this file except in compliance with the License. -THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -See LICENSE in the root of the software repository for the full text of the License. -*/ - -#include "test_common.h" -#include "acl/acl.h" -#include -#include -#include - -using namespace PtoTestCommon; - -#ifndef TMRGSORT_HPP -struct MrgSortExecutedNumList { - uint16_t mrgSortList0; - uint16_t mrgSortList1; - uint16_t mrgSortList2; - uint16_t mrgSortList3; -}; -#endif - -#define ACL_CHECK(expr) \ - do { \ - const aclError _ret = (expr); \ - if (_ret != ACL_SUCCESS) { \ - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", #expr, (int)_ret, __FILE__, __LINE__); \ - const char *_recent = aclGetRecentErrMsg(); \ - if (_recent != nullptr && _recent[0] != '\0') { \ - std::fprintf(stderr, "[ERROR] RecentErrMsg: %s\n", _recent); \ - } \ - rc = 1; \ - goto cleanup; \ - } \ - } while (0) - -void LaunchVgather2_bc_kernel_2d(float *v1, int *v2, float *v3, void *stream); - -int main() { - size_t elemCount_v1 = 1024; - size_t fileSize_v1 = elemCount_v1 * sizeof(float); - size_t elemCount_v2 = 1024; - size_t fileSize_v2 = elemCount_v2 * sizeof(int); - size_t elemCount_v3 = 1024; - size_t fileSize_v3 = elemCount_v3 * sizeof(float); - float *v1Host = nullptr; - float *v1Device = nullptr; - int *v2Host = nullptr; - int *v2Device = nullptr; - float *v3Host = nullptr; - float *v3Device = nullptr; - - int rc = 0; - bool aclInited = false; - bool deviceSet = false; 
- int deviceId = 0; - aclrtStream stream = nullptr; - - ACL_CHECK(aclInit(nullptr)); - aclInited = true; - if (const char *envDevice = std::getenv("ACL_DEVICE_ID")) { - deviceId = std::atoi(envDevice); - } - ACL_CHECK(aclrtSetDevice(deviceId)); - deviceSet = true; - ACL_CHECK(aclrtCreateStream(&stream)); - - ACL_CHECK(aclrtMallocHost((void **)(&v1Host), fileSize_v1)); - ACL_CHECK(aclrtMallocHost((void **)(&v2Host), fileSize_v2)); - ACL_CHECK(aclrtMallocHost((void **)(&v3Host), fileSize_v3)); - ACL_CHECK(aclrtMalloc((void **)&v1Device, fileSize_v1, ACL_MEM_MALLOC_HUGE_FIRST)); - ACL_CHECK(aclrtMalloc((void **)&v2Device, fileSize_v2, ACL_MEM_MALLOC_HUGE_FIRST)); - ACL_CHECK(aclrtMalloc((void **)&v3Device, fileSize_v3, ACL_MEM_MALLOC_HUGE_FIRST)); - - ReadFile("./v1.bin", fileSize_v1, v1Host, fileSize_v1); - ReadFile("./v2.bin", fileSize_v2, v2Host, fileSize_v2); - ReadFile("./v3.bin", fileSize_v3, v3Host, fileSize_v3); - ACL_CHECK(aclrtMemcpy(v1Device, fileSize_v1, v1Host, fileSize_v1, ACL_MEMCPY_HOST_TO_DEVICE)); - ACL_CHECK(aclrtMemcpy(v2Device, fileSize_v2, v2Host, fileSize_v2, ACL_MEMCPY_HOST_TO_DEVICE)); - ACL_CHECK(aclrtMemcpy(v3Device, fileSize_v3, v3Host, fileSize_v3, ACL_MEMCPY_HOST_TO_DEVICE)); - LaunchVgather2_bc_kernel_2d(v1Device, v2Device, v3Device, stream); - - ACL_CHECK(aclrtSynchronizeStream(stream)); - ACL_CHECK(aclrtMemcpy(v3Host, fileSize_v3, v3Device, fileSize_v3, ACL_MEMCPY_DEVICE_TO_HOST)); - - WriteFile("./v3.bin", v3Host, fileSize_v3); - -cleanup: - aclrtFree(v1Device); - aclrtFree(v2Device); - aclrtFree(v3Device); - aclrtFreeHost(v1Host); - aclrtFreeHost(v2Host); - aclrtFreeHost(v3Host); - if (stream != nullptr) { - const aclError _ret = aclrtDestroyStream(stream); - if (_ret != ACL_SUCCESS) { - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", - "aclrtDestroyStream(stream)", (int)_ret, __FILE__, __LINE__); - } - stream = nullptr; - } - if (deviceSet) { - const aclError _ret = aclrtResetDevice(deviceId); - if (_ret != ACL_SUCCESS) { - 
std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", - "aclrtResetDevice(deviceId)", (int)_ret, __FILE__, __LINE__); - } - } - if (aclInited) { - const aclError _ret = aclFinalize(); - if (_ret != ACL_SUCCESS) { - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", - "aclFinalize()", (int)_ret, __FILE__, __LINE__); - } - } - - return rc; -} diff --git a/test/vpto/cases/micro-op/gather-scatter/vgatherb-block-boundary/compare.py b/test/vpto/cases/micro-op/gather-scatter/vgatherb-block-boundary/compare.py deleted file mode 100755 index 6d777d58f..000000000 --- a/test/vpto/cases/micro-op/gather-scatter/vgatherb-block-boundary/compare.py +++ /dev/null @@ -1,209 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) 2026 Huawei Technologies Co., Ltd. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. - -# case: micro-op/gather-scatter/vgatherb-block-boundary -# family: gather-scatter -# target_ops: pto.vgatherb -# scenarios: core-f32, block-gather, aligned-base, load-effect-validation, no-alias -# NOTE: bulk-generated coverage skeleton. 
-# coding=utf-8 - -import os -import sys -import numpy as np - - -REPEAT_BYTES = 256 - - -def _ceil_div(x, y): - return (x + y - 1) // y - - -def _packed_pred_storage_bytes(logical_elems, src_elem_bytes): - logical_elems = int(logical_elems) - src_elem_bytes = int(src_elem_bytes) - if logical_elems <= 0: - raise ValueError(f"logical_elems must be > 0, got {logical_elems}") - if src_elem_bytes not in (1, 2, 4): - raise ValueError(f"unsupported packed predicate source size: {src_elem_bytes}") - - repeat_elems = REPEAT_BYTES // src_elem_bytes - if src_elem_bytes == 4: - repeat_times = _ceil_div(logical_elems, repeat_elems) + 1 - loop_count = repeat_times // 2 - return loop_count * 16 - - repeat_times = _ceil_div(logical_elems, repeat_elems) - return repeat_times * (repeat_elems // 8) - - -def compare_bin(golden_path, output_path, dtype, eps): - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - dtype_np = np.dtype(dtype) - golden = np.fromfile(golden_path, dtype=dtype_np) - output = np.fromfile(output_path, dtype=dtype_np) - if golden.shape != output.shape: - print(f"[ERROR] Shape mismatch: {golden_path} {golden.shape} vs {output_path} {output.shape}") - return False - if not np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True): - if golden.size: - if np.issubdtype(dtype_np, np.floating): - g = golden.astype(np.float64, copy=False) - o = output.astype(np.float64, copy=False) - elif np.issubdtype(dtype_np, np.integer) or np.issubdtype(dtype_np, np.unsignedinteger): - g = golden.astype(np.int64, copy=False) - o = output.astype(np.int64, copy=False) - else: - g = golden.astype(np.float64, copy=False) - o = output.astype(np.float64, copy=False) - abs_diff = np.abs(g - o) - idx = int(np.argmax(abs_diff)) - diff = float(abs_diff[idx]) - print( - f"[ERROR] Mismatch: {golden_path} vs {output_path}, max 
diff={diff} at idx={idx} " - f"(golden={g[idx]}, out={o[idx]}, dtype={dtype_np})" - ) - else: - print(f"[ERROR] Mismatch: {golden_path} vs {output_path}, empty buffers, dtype={dtype_np}") - return False - return True - - -def compare_bin_prefix(golden_path, output_path, dtype, eps, count): - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - try: - count = int(count) - except Exception: - print(f"[ERROR] Invalid prefix count: {count}") - return False - if count <= 0: - print(f"[ERROR] Invalid prefix count: {count}") - return False - - dtype_np = np.dtype(dtype) - golden = np.fromfile(golden_path, dtype=dtype_np, count=count) - output = np.fromfile(output_path, dtype=dtype_np, count=count) - - if golden.size != count or output.size != count: - print( - f"[ERROR] Prefix read too small: need={count} elems, " - f"golden={golden.size}, out={output.size}" - ) - return False - - if not np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True): - if golden.size: - if np.issubdtype(dtype_np, np.floating): - g = golden.astype(np.float64, copy=False) - o = output.astype(np.float64, copy=False) - elif np.issubdtype(dtype_np, np.integer) or np.issubdtype(dtype_np, np.unsignedinteger): - g = golden.astype(np.int64, copy=False) - o = output.astype(np.int64, copy=False) - else: - g = golden.astype(np.float64, copy=False) - o = output.astype(np.float64, copy=False) - abs_diff = np.abs(g - o) - idx = int(np.argmax(abs_diff)) - diff = float(abs_diff[idx]) - print( - f"[ERROR] Mismatch (prefix): {golden_path} vs {output_path}, max diff={diff} at idx={idx} " - f"(golden={g[idx]}, out={o[idx]}, dtype={dtype_np}, count={count})" - ) - else: - print(f"[ERROR] Mismatch (prefix): {golden_path} vs {output_path}, empty buffers, dtype={dtype_np}") - return False - return True - - -def compare_packed_pred_mask(golden_path, output_path, 
logical_elems, src_elem_bytes): - """ - Compare outputs of pto.tcmp / pto.tcmps. - - PTO-ISA stores packed predicate results as a linear PK byte stream via - `psts`, with the exact written prefix length determined by the typed - TCMP/TCMPS repeat schedule. Compare only that semantic prefix. - """ - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - try: - logical_elems = int(logical_elems) - src_elem_bytes = int(src_elem_bytes) - except Exception: - print( - "[ERROR] Invalid packed mask compare arguments: " - f"logical_elems={logical_elems} src_elem_bytes={src_elem_bytes}" - ) - return False - if logical_elems <= 0 or src_elem_bytes <= 0: - print( - "[ERROR] Invalid packed mask compare arguments: " - f"logical_elems={logical_elems} src_elem_bytes={src_elem_bytes}" - ) - return False - - golden = np.fromfile(golden_path, dtype=np.uint8) - output = np.fromfile(output_path, dtype=np.uint8) - try: - prefix_bytes = _packed_pred_storage_bytes(logical_elems, src_elem_bytes) - except ValueError as exc: - print(f"[ERROR] {exc}") - return False - - if golden.size < prefix_bytes or output.size < prefix_bytes: - print( - f"[ERROR] Packed mask buffer too small: need={prefix_bytes} bytes, " - f"golden={golden.size}, out={output.size}" - ) - return False - - golden_sel = golden[:prefix_bytes] - output_sel = output[:prefix_bytes] - - if not np.array_equal(golden_sel, output_sel): - diff = np.nonzero(golden_sel != output_sel)[0] - idx = int(diff[0]) if diff.size else 0 - print( - f"[ERROR] Mismatch (packed mask): {golden_path} vs {output_path}, first diff at idx={idx} " - f"(golden={int(golden_sel[idx])}, out={int(output_sel[idx])}, " - f"logical_elems={logical_elems}, src_elem_bytes={src_elem_bytes}, prefix_bytes={prefix_bytes})" - ) - return False - return True - - -def main(): - strict = os.getenv("COMPARE_STRICT", "1") != "0" 
- ok = True - ok = compare_bin("golden_v3.bin", "v3.bin", np.float32, 0.0001) and ok - if not ok: - if strict: - print("[ERROR] compare failed") - sys.exit(2) - print("[WARN] compare failed (non-gating)") - return - print("[INFO] compare passed") - - -if __name__ == "__main__": - main() diff --git a/test/vpto/cases/micro-op/gather-scatter/vgatherb-block-boundary/golden.py b/test/vpto/cases/micro-op/gather-scatter/vgatherb-block-boundary/golden.py deleted file mode 100755 index 2bfb0c0d3..000000000 --- a/test/vpto/cases/micro-op/gather-scatter/vgatherb-block-boundary/golden.py +++ /dev/null @@ -1,66 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) 2026 Huawei Technologies Co., Ltd. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. - -# case: micro-op/gather-scatter/vgatherb-block-boundary -# family: gather-scatter -# target_ops: pto.vgatherb -# scenarios: core-f32, block-gather, aligned-base, load-effect-validation, no-alias -# NOTE: bulk-generated coverage skeleton. 
-# coding=utf-8 - -import argparse -from pathlib import Path - -import numpy as np - - -ROWS = 32 -COLS = 32 -BLOCK_FLOATS = 8 -BLOCKS_PER_ITER = 8 -ITER_ELEMS = 64 -SEED = 19 - - -def generate(output_dir: Path, seed: int) -> None: - rng = np.random.default_rng(seed) - flat = rng.uniform(-8.0, 8.0, size=(ROWS * COLS,)).astype(np.float32) - blocks = flat.reshape(-1, BLOCK_FLOATS) - offsets = np.zeros((ROWS * COLS,), dtype=np.int32) - gathered = np.zeros((ROWS * COLS,), dtype=np.float32) - boundary_patterns = np.array([0, 1, 15, 16, 31, 32, 63, 127], dtype=np.int32) - - for chunk in range((ROWS * COLS) // ITER_ELEMS): - block_ids = (boundary_patterns + chunk * 3) % blocks.shape[0] - offsets[chunk * ITER_ELEMS:chunk * ITER_ELEMS + BLOCKS_PER_ITER] = block_ids * 32 - gathered[chunk * ITER_ELEMS:(chunk + 1) * ITER_ELEMS] = blocks[block_ids].reshape(-1) - - v1 = flat.reshape(ROWS, COLS) - v2 = offsets.reshape(ROWS, COLS) - v3 = np.zeros((ROWS, COLS), dtype=np.float32) - - output_dir.mkdir(parents=True, exist_ok=True) - v1.reshape(-1).tofile(output_dir / "v1.bin") - v2.reshape(-1).tofile(output_dir / "v2.bin") - v3.reshape(-1).tofile(output_dir / "v3.bin") - gathered.reshape(-1).tofile(output_dir / "golden_v3.bin") - - -def main() -> None: - parser = argparse.ArgumentParser( - description="Generate numpy-based inputs/golden for VPTO micro-op vgatherb block-boundary validation." 
- ) - parser.add_argument("--output-dir", type=Path, default=Path(".")) - parser.add_argument("--seed", type=int, default=SEED) - args = parser.parse_args() - generate(args.output_dir, args.seed) - - -if __name__ == "__main__": - main() diff --git a/test/vpto/cases/micro-op/gather-scatter/vgatherb-block-boundary/kernel.pto b/test/vpto/cases/micro-op/gather-scatter/vgatherb-block-boundary/kernel.pto deleted file mode 100644 index e44003903..000000000 --- a/test/vpto/cases/micro-op/gather-scatter/vgatherb-block-boundary/kernel.pto +++ /dev/null @@ -1,58 +0,0 @@ -// ----------------------------------------------------------------------------- -// case: micro-op/gather-scatter/vgatherb-block-boundary -// family: gather-scatter -// target_ops: pto.vgatherb -// scenarios: core-f32, block-gather, aligned-base, load-effect-validation, no-alias -// NOTE: bulk-generated coverage skeleton. Parser/verifier/lowering failure is -// still a valid test conclusion in the current coverage-first phase. -// ----------------------------------------------------------------------------- -module attributes {pto.target_arch = "a5", pto.kernel_kind = #pto.kernel_kind} { - func.func @vgatherb_block_boundary_kernel_2d(%arg0: !pto.ptr, %arg1: !pto.ptr, - %arg2: !pto.ptr) attributes {pto.kernel} { - %c0 = arith.constant 0 : index - %c64 = arith.constant 64 : index - %c1024 = arith.constant 1024 : index - %c8_i32 = arith.constant 8 : i32 - %c0_i64 = arith.constant 0 : i64 - %c1_i64 = arith.constant 1 : i64 - %c32_i64 = arith.constant 32 : i64 - %c128_i64 = arith.constant 128 : i64 - %c4096_i64 = arith.constant 4096 : i64 - %c8192_i64 = arith.constant 8192 : i64 - %c1024_i32 = arith.constant 1024 : i32 - - %ub_in = pto.castptr %c0_i64 : i64 -> !pto.ptr - %ub_offsets = pto.castptr %c4096_i64 : i64 -> !pto.ptr - %ub_out = pto.castptr %c8192_i64 : i64 -> !pto.ptr - - %false = arith.constant false - pto.mte_gm_ub %arg0, %ub_in, %c0_i64, %c128_i64 - nburst(%c32_i64, %c128_i64, %c128_i64) - : !pto.ptr, 
!pto.ptr, i64, i64, i64, i64, i64 - pto.mte_gm_ub %arg1, %ub_offsets, %c0_i64, %c128_i64 - nburst(%c32_i64, %c128_i64, %c128_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 - - pto.set_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] - pto.wait_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] - - pto.vecscope { - %_:1 = scf.for %offset = %c0 to %c1024 step %c64 iter_args(%remaining = %c1024_i32) -> (i32) { - %full_mask, %next_remaining = pto.plt_b32 %remaining : i32 -> !pto.mask, i32 - %gather_mask, %_tail = pto.plt_b32 %c8_i32 : i32 -> !pto.mask, i32 - %offsets = pto.vlds %ub_offsets[%offset] : !pto.ptr -> !pto.vreg<64xi32> - %out = pto.vgatherb %ub_in, %offsets, %gather_mask : !pto.ptr, !pto.vreg<64xi32>, !pto.mask -> !pto.vreg<64xf32> - pto.vsts %out, %ub_out[%offset], %full_mask : !pto.vreg<64xf32>, !pto.ptr, !pto.mask - scf.yield %next_remaining : i32 - } - } - - pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.mte_ub_gm %ub_out, %arg2, %c128_i64 - nburst(%c32_i64, %c128_i64, %c128_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64 - pto.barrier #pto.pipe - return - } -} diff --git a/test/vpto/cases/micro-op/gather-scatter/vgatherb-block-boundary/launch.cpp b/test/vpto/cases/micro-op/gather-scatter/vgatherb-block-boundary/launch.cpp deleted file mode 100644 index fb6f40c39..000000000 --- a/test/vpto/cases/micro-op/gather-scatter/vgatherb-block-boundary/launch.cpp +++ /dev/null @@ -1,72 +0,0 @@ -// Copyright (c) 2026 Huawei Technologies Co., Ltd. -// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). -// Please refer to the License for details. You may not use this file except in compliance with the License. 
-// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -// See LICENSE in the root of the software repository for the full text of the License. - -// ----------------------------------------------------------------------------- -// case: micro-op/gather-scatter/vgatherb-block-boundary -// family: gather-scatter -// target_ops: pto.vgatherb -// scenarios: core-f32, block-gather, aligned-base, load-effect-validation, no-alias -// NOTE: bulk-generated coverage skeleton. Parser/verifier/lowering failure is -// still a valid test conclusion in the current coverage-first phase. -// ----------------------------------------------------------------------------- -// --------------------------------------------------------------------------- -// PTOAS compatibility layer -// -// The upstream pto-isa headers reference some FP8/FP4 types and the -// __VEC_SCOPE__ marker that are not available on every AICore arch/toolchain -// combination (e.g. __NPU_ARCH__==2201). -// -// For our PTOAS-generated kernels we don't rely on these types today, but the -// headers still mention them in templates/static_asserts. Provide minimal -// fallbacks to keep compilation working on dav-c220. -// --------------------------------------------------------------------------- -#ifndef __VEC_SCOPE__ -#define __VEC_SCOPE__ -#endif - -#if defined(__CCE_AICORE__) && defined(__NPU_ARCH__) && (__NPU_ARCH__ == 2201) -typedef struct { unsigned char v; } hifloat8_t; -typedef struct { unsigned char v; } float8_e4m3_t; -typedef struct { unsigned char v; } float8_e5m2_t; -typedef struct { unsigned char v; } float8_e8m0_t; -typedef struct { unsigned char v; } float4_e1m2x2_t; -typedef struct { unsigned char v; } float4_e2m1x2_t; -#endif -#include - -// AICore printf support is gated behind `--cce-enable-print` on some -// toolchains. 
When enabled, include the CCE print header so `cce::printf` -// resolves in device compilation. -#if defined(__CCE_AICORE__) && defined(PTOAS_ENABLE_CCE_PRINT) -#include -#endif - -// Some PTO-ISA types are only available in the __CCE_AICORE__ compilation -// path, but `bisheng -xcce` still performs a host-side parse pass. -// Provide minimal fallbacks only when the corresponding header wasn't -// pulled in by the selected arch implementation. -#if !defined(__CCE_AICORE__) && !defined(TMRGSORT_HPP) -struct MrgSortExecutedNumList { - uint16_t mrgSortList0; - uint16_t mrgSortList1; - uint16_t mrgSortList2; - uint16_t mrgSortList3; -}; -#endif -#ifndef __CPU_SIM -#include "acl/acl.h" -#endif - -extern "C" __global__ [aicore] void vgatherb_block_boundary_kernel_2d( - __gm__ float *v1, __gm__ int *v2, __gm__ float *v3); - -void LaunchVgatherb_block_boundary_kernel_2d(float *v1, int *v2, float *v3, - void *stream) { - vgatherb_block_boundary_kernel_2d<<<1, nullptr, stream>>>( - (__gm__ float *)v1, (__gm__ int *)v2, (__gm__ float *)v3); -} diff --git a/test/vpto/cases/micro-op/gather-scatter/vgatherb-block-boundary/main.cpp b/test/vpto/cases/micro-op/gather-scatter/vgatherb-block-boundary/main.cpp deleted file mode 100644 index 77c2cb46f..000000000 --- a/test/vpto/cases/micro-op/gather-scatter/vgatherb-block-boundary/main.cpp +++ /dev/null @@ -1,141 +0,0 @@ -// Copyright (c) 2026 Huawei Technologies Co., Ltd. -// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). -// Please refer to the License for details. You may not use this file except in compliance with the License. -// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. 
-// See LICENSE in the root of the software repository for the full text of the License. - -// ----------------------------------------------------------------------------- -// case: micro-op/gather-scatter/vgatherb-block-boundary -// family: gather-scatter -// target_ops: pto.vgatherb -// scenarios: core-f32, block-gather, aligned-base, load-effect-validation, no-alias -// NOTE: bulk-generated coverage skeleton. Parser/verifier/lowering failure is -// still a valid test conclusion in the current coverage-first phase. -// ----------------------------------------------------------------------------- -/** -Copyright (c) 2025 Huawei Technologies Co., Ltd. -This program is free software, you can redistribute it and/or modify it under the terms and conditions of -CANN Open Software License Agreement Version 2.0 (the "License"). -Please refer to the License for details. You may not use this file except in compliance with the License. -THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -See LICENSE in the root of the software repository for the full text of the License. 
-*/ - -#include "test_common.h" -#include "acl/acl.h" -#include -#include -#include - -using namespace PtoTestCommon; - -#ifndef TMRGSORT_HPP -struct MrgSortExecutedNumList { - uint16_t mrgSortList0; - uint16_t mrgSortList1; - uint16_t mrgSortList2; - uint16_t mrgSortList3; -}; -#endif - -#define ACL_CHECK(expr) \ - do { \ - const aclError _ret = (expr); \ - if (_ret != ACL_SUCCESS) { \ - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", #expr, (int)_ret, __FILE__, __LINE__); \ - const char *_recent = aclGetRecentErrMsg(); \ - if (_recent != nullptr && _recent[0] != '\0') { \ - std::fprintf(stderr, "[ERROR] RecentErrMsg: %s\n", _recent); \ - } \ - rc = 1; \ - goto cleanup; \ - } \ - } while (0) - -void LaunchVgatherb_block_boundary_kernel_2d(float *v1, int *v2, float *v3, - void *stream); - -int main() { - size_t elemCount_v1 = 1024; - size_t fileSize_v1 = elemCount_v1 * sizeof(float); - size_t elemCount_v2 = 1024; - size_t fileSize_v2 = elemCount_v2 * sizeof(int); - size_t elemCount_v3 = 1024; - size_t fileSize_v3 = elemCount_v3 * sizeof(float); - float *v1Host = nullptr; - float *v1Device = nullptr; - int *v2Host = nullptr; - int *v2Device = nullptr; - float *v3Host = nullptr; - float *v3Device = nullptr; - - int rc = 0; - bool aclInited = false; - bool deviceSet = false; - int deviceId = 0; - aclrtStream stream = nullptr; - - ACL_CHECK(aclInit(nullptr)); - aclInited = true; - if (const char *envDevice = std::getenv("ACL_DEVICE_ID")) { - deviceId = std::atoi(envDevice); - } - ACL_CHECK(aclrtSetDevice(deviceId)); - deviceSet = true; - ACL_CHECK(aclrtCreateStream(&stream)); - - ACL_CHECK(aclrtMallocHost((void **)(&v1Host), fileSize_v1)); - ACL_CHECK(aclrtMallocHost((void **)(&v2Host), fileSize_v2)); - ACL_CHECK(aclrtMallocHost((void **)(&v3Host), fileSize_v3)); - ACL_CHECK(aclrtMalloc((void **)&v1Device, fileSize_v1, ACL_MEM_MALLOC_HUGE_FIRST)); - ACL_CHECK(aclrtMalloc((void **)&v2Device, fileSize_v2, ACL_MEM_MALLOC_HUGE_FIRST)); - ACL_CHECK(aclrtMalloc((void 
**)&v3Device, fileSize_v3, ACL_MEM_MALLOC_HUGE_FIRST)); - - ReadFile("./v1.bin", fileSize_v1, v1Host, fileSize_v1); - ReadFile("./v2.bin", fileSize_v2, v2Host, fileSize_v2); - ReadFile("./v3.bin", fileSize_v3, v3Host, fileSize_v3); - ACL_CHECK(aclrtMemcpy(v1Device, fileSize_v1, v1Host, fileSize_v1, ACL_MEMCPY_HOST_TO_DEVICE)); - ACL_CHECK(aclrtMemcpy(v2Device, fileSize_v2, v2Host, fileSize_v2, ACL_MEMCPY_HOST_TO_DEVICE)); - ACL_CHECK(aclrtMemcpy(v3Device, fileSize_v3, v3Host, fileSize_v3, ACL_MEMCPY_HOST_TO_DEVICE)); - LaunchVgatherb_block_boundary_kernel_2d(v1Device, v2Device, v3Device, stream); - - ACL_CHECK(aclrtSynchronizeStream(stream)); - ACL_CHECK(aclrtMemcpy(v3Host, fileSize_v3, v3Device, fileSize_v3, ACL_MEMCPY_DEVICE_TO_HOST)); - - WriteFile("./v3.bin", v3Host, fileSize_v3); - -cleanup: - aclrtFree(v1Device); - aclrtFree(v2Device); - aclrtFree(v3Device); - aclrtFreeHost(v1Host); - aclrtFreeHost(v2Host); - aclrtFreeHost(v3Host); - if (stream != nullptr) { - const aclError _ret = aclrtDestroyStream(stream); - if (_ret != ACL_SUCCESS) { - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", - "aclrtDestroyStream(stream)", (int)_ret, __FILE__, __LINE__); - } - stream = nullptr; - } - if (deviceSet) { - const aclError _ret = aclrtResetDevice(deviceId); - if (_ret != ACL_SUCCESS) { - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", - "aclrtResetDevice(deviceId)", (int)_ret, __FILE__, __LINE__); - } - } - if (aclInited) { - const aclError _ret = aclFinalize(); - if (_ret != ACL_SUCCESS) { - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", - "aclFinalize()", (int)_ret, __FILE__, __LINE__); - } - } - - return rc; -} diff --git a/test/vpto/cases/micro-op/gather-scatter/vgatherb/compare.py b/test/vpto/cases/micro-op/gather-scatter/vgatherb/compare.py deleted file mode 100755 index e9d439e1a..000000000 --- a/test/vpto/cases/micro-op/gather-scatter/vgatherb/compare.py +++ /dev/null @@ -1,209 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) 2026 Huawei 
Technologies Co., Ltd. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. - -# case: micro-op/gather-scatter/vgatherb -# family: gather-scatter -# target_ops: pto.vgatherb -# scenarios: core-f32, full-mask, block-gather, aligned-base, load-effect-validation, no-alias -# NOTE: bulk-generated coverage skeleton. -# coding=utf-8 - -import os -import sys -import numpy as np - - -REPEAT_BYTES = 256 - - -def _ceil_div(x, y): - return (x + y - 1) // y - - -def _packed_pred_storage_bytes(logical_elems, src_elem_bytes): - logical_elems = int(logical_elems) - src_elem_bytes = int(src_elem_bytes) - if logical_elems <= 0: - raise ValueError(f"logical_elems must be > 0, got {logical_elems}") - if src_elem_bytes not in (1, 2, 4): - raise ValueError(f"unsupported packed predicate source size: {src_elem_bytes}") - - repeat_elems = REPEAT_BYTES // src_elem_bytes - if src_elem_bytes == 4: - repeat_times = _ceil_div(logical_elems, repeat_elems) + 1 - loop_count = repeat_times // 2 - return loop_count * 16 - - repeat_times = _ceil_div(logical_elems, repeat_elems) - return repeat_times * (repeat_elems // 8) - - -def compare_bin(golden_path, output_path, dtype, eps): - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - dtype_np = np.dtype(dtype) - golden = np.fromfile(golden_path, dtype=dtype_np) - output = 
np.fromfile(output_path, dtype=dtype_np) - if golden.shape != output.shape: - print(f"[ERROR] Shape mismatch: {golden_path} {golden.shape} vs {output_path} {output.shape}") - return False - if not np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True): - if golden.size: - if np.issubdtype(dtype_np, np.floating): - g = golden.astype(np.float64, copy=False) - o = output.astype(np.float64, copy=False) - elif np.issubdtype(dtype_np, np.integer) or np.issubdtype(dtype_np, np.unsignedinteger): - g = golden.astype(np.int64, copy=False) - o = output.astype(np.int64, copy=False) - else: - g = golden.astype(np.float64, copy=False) - o = output.astype(np.float64, copy=False) - abs_diff = np.abs(g - o) - idx = int(np.argmax(abs_diff)) - diff = float(abs_diff[idx]) - print( - f"[ERROR] Mismatch: {golden_path} vs {output_path}, max diff={diff} at idx={idx} " - f"(golden={g[idx]}, out={o[idx]}, dtype={dtype_np})" - ) - else: - print(f"[ERROR] Mismatch: {golden_path} vs {output_path}, empty buffers, dtype={dtype_np}") - return False - return True - - -def compare_bin_prefix(golden_path, output_path, dtype, eps, count): - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - try: - count = int(count) - except Exception: - print(f"[ERROR] Invalid prefix count: {count}") - return False - if count <= 0: - print(f"[ERROR] Invalid prefix count: {count}") - return False - - dtype_np = np.dtype(dtype) - golden = np.fromfile(golden_path, dtype=dtype_np, count=count) - output = np.fromfile(output_path, dtype=dtype_np, count=count) - - if golden.size != count or output.size != count: - print( - f"[ERROR] Prefix read too small: need={count} elems, " - f"golden={golden.size}, out={output.size}" - ) - return False - - if not np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True): - if golden.size: - if np.issubdtype(dtype_np, 
np.floating): - g = golden.astype(np.float64, copy=False) - o = output.astype(np.float64, copy=False) - elif np.issubdtype(dtype_np, np.integer) or np.issubdtype(dtype_np, np.unsignedinteger): - g = golden.astype(np.int64, copy=False) - o = output.astype(np.int64, copy=False) - else: - g = golden.astype(np.float64, copy=False) - o = output.astype(np.float64, copy=False) - abs_diff = np.abs(g - o) - idx = int(np.argmax(abs_diff)) - diff = float(abs_diff[idx]) - print( - f"[ERROR] Mismatch (prefix): {golden_path} vs {output_path}, max diff={diff} at idx={idx} " - f"(golden={g[idx]}, out={o[idx]}, dtype={dtype_np}, count={count})" - ) - else: - print(f"[ERROR] Mismatch (prefix): {golden_path} vs {output_path}, empty buffers, dtype={dtype_np}") - return False - return True - - -def compare_packed_pred_mask(golden_path, output_path, logical_elems, src_elem_bytes): - """ - Compare outputs of pto.tcmp / pto.tcmps. - - PTO-ISA stores packed predicate results as a linear PK byte stream via - `psts`, with the exact written prefix length determined by the typed - TCMP/TCMPS repeat schedule. Compare only that semantic prefix. 
- """ - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - try: - logical_elems = int(logical_elems) - src_elem_bytes = int(src_elem_bytes) - except Exception: - print( - "[ERROR] Invalid packed mask compare arguments: " - f"logical_elems={logical_elems} src_elem_bytes={src_elem_bytes}" - ) - return False - if logical_elems <= 0 or src_elem_bytes <= 0: - print( - "[ERROR] Invalid packed mask compare arguments: " - f"logical_elems={logical_elems} src_elem_bytes={src_elem_bytes}" - ) - return False - - golden = np.fromfile(golden_path, dtype=np.uint8) - output = np.fromfile(output_path, dtype=np.uint8) - try: - prefix_bytes = _packed_pred_storage_bytes(logical_elems, src_elem_bytes) - except ValueError as exc: - print(f"[ERROR] {exc}") - return False - - if golden.size < prefix_bytes or output.size < prefix_bytes: - print( - f"[ERROR] Packed mask buffer too small: need={prefix_bytes} bytes, " - f"golden={golden.size}, out={output.size}" - ) - return False - - golden_sel = golden[:prefix_bytes] - output_sel = output[:prefix_bytes] - - if not np.array_equal(golden_sel, output_sel): - diff = np.nonzero(golden_sel != output_sel)[0] - idx = int(diff[0]) if diff.size else 0 - print( - f"[ERROR] Mismatch (packed mask): {golden_path} vs {output_path}, first diff at idx={idx} " - f"(golden={int(golden_sel[idx])}, out={int(output_sel[idx])}, " - f"logical_elems={logical_elems}, src_elem_bytes={src_elem_bytes}, prefix_bytes={prefix_bytes})" - ) - return False - return True - - -def main(): - strict = os.getenv("COMPARE_STRICT", "1") != "0" - ok = True - ok = compare_bin("golden_v3.bin", "v3.bin", np.float32, 0.0001) and ok - if not ok: - if strict: - print("[ERROR] compare failed") - sys.exit(2) - print("[WARN] compare failed (non-gating)") - return - print("[INFO] compare passed") - - -if __name__ == "__main__": - main() 
diff --git a/test/vpto/cases/micro-op/gather-scatter/vgatherb/golden.py b/test/vpto/cases/micro-op/gather-scatter/vgatherb/golden.py deleted file mode 100755 index e102cecfe..000000000 --- a/test/vpto/cases/micro-op/gather-scatter/vgatherb/golden.py +++ /dev/null @@ -1,66 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) 2026 Huawei Technologies Co., Ltd. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. - -# case: micro-op/gather-scatter/vgatherb -# family: gather-scatter -# target_ops: pto.vgatherb -# scenarios: core-f32, full-mask, block-gather, aligned-base, load-effect-validation, no-alias -# NOTE: bulk-generated coverage skeleton. 
-# coding=utf-8 - -import argparse -from pathlib import Path - -import numpy as np - - -ROWS = 32 -COLS = 32 -BLOCK_FLOATS = 8 -BLOCKS_PER_ITER = 8 -ITER_ELEMS = 64 -SEED = 19 -OUT_SENTINEL = np.float32(-123.25) - - -def generate(output_dir: Path, seed: int) -> None: - rng = np.random.default_rng(seed) - flat = rng.uniform(-8.0, 8.0, size=(ROWS * COLS,)).astype(np.float32) - blocks = flat.reshape(-1, BLOCK_FLOATS) - offsets = np.zeros((ROWS * COLS,), dtype=np.int32) - gathered = np.full((ROWS * COLS,), OUT_SENTINEL, dtype=np.float32) - - for chunk in range((ROWS * COLS) // ITER_ELEMS): - block_ids = ((np.arange(BLOCKS_PER_ITER, dtype=np.int32) + chunk * 11) * 7 + 3) % blocks.shape[0] - offsets[chunk * ITER_ELEMS:chunk * ITER_ELEMS + BLOCKS_PER_ITER] = block_ids * 32 - gathered[chunk * ITER_ELEMS:(chunk + 1) * ITER_ELEMS] = blocks[block_ids].reshape(-1) - - v1 = flat.reshape(ROWS, COLS) - v2 = offsets.reshape(ROWS, COLS) - v3 = np.full((ROWS, COLS), OUT_SENTINEL, dtype=np.float32) - - output_dir.mkdir(parents=True, exist_ok=True) - v1.reshape(-1).tofile(output_dir / "v1.bin") - v2.reshape(-1).tofile(output_dir / "v2.bin") - v3.reshape(-1).tofile(output_dir / "v3.bin") - gathered.reshape(-1).tofile(output_dir / "golden_v3.bin") - - -def main() -> None: - parser = argparse.ArgumentParser( - description="Generate numpy-based inputs/golden for VPTO micro-op vgatherb validation." 
- ) - parser.add_argument("--output-dir", type=Path, default=Path(".")) - parser.add_argument("--seed", type=int, default=SEED) - args = parser.parse_args() - generate(args.output_dir, args.seed) - - -if __name__ == "__main__": - main() diff --git a/test/vpto/cases/micro-op/gather-scatter/vgatherb/kernel.pto b/test/vpto/cases/micro-op/gather-scatter/vgatherb/kernel.pto deleted file mode 100644 index edfcfad54..000000000 --- a/test/vpto/cases/micro-op/gather-scatter/vgatherb/kernel.pto +++ /dev/null @@ -1,58 +0,0 @@ -// ----------------------------------------------------------------------------- -// case: micro-op/gather-scatter/vgatherb -// family: gather-scatter -// target_ops: pto.vgatherb -// scenarios: core-f32, full-mask, block-gather, aligned-base, load-effect-validation, no-alias -// NOTE: bulk-generated coverage skeleton. Parser/verifier/lowering failure is -// still a valid test conclusion in the current coverage-first phase. -// ----------------------------------------------------------------------------- -module attributes {pto.target_arch = "a5", pto.kernel_kind = #pto.kernel_kind} { - func.func @vgatherb_kernel_2d(%arg0: !pto.ptr, %arg1: !pto.ptr, - %arg2: !pto.ptr) attributes {pto.kernel} { - %c0 = arith.constant 0 : index - %c64 = arith.constant 64 : index - %c1024 = arith.constant 1024 : index - %c8_i32 = arith.constant 8 : i32 - %c0_i64 = arith.constant 0 : i64 - %c1_i64 = arith.constant 1 : i64 - %c32_i64 = arith.constant 32 : i64 - %c128_i64 = arith.constant 128 : i64 - %c4096_i64 = arith.constant 4096 : i64 - %c8192_i64 = arith.constant 8192 : i64 - %c1024_i32 = arith.constant 1024 : i32 - - %ub_in = pto.castptr %c0_i64 : i64 -> !pto.ptr - %ub_offsets = pto.castptr %c4096_i64 : i64 -> !pto.ptr - %ub_out = pto.castptr %c8192_i64 : i64 -> !pto.ptr - - %false = arith.constant false - pto.mte_gm_ub %arg0, %ub_in, %c0_i64, %c128_i64 - nburst(%c32_i64, %c128_i64, %c128_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 - pto.mte_gm_ub %arg1, 
%ub_offsets, %c0_i64, %c128_i64 - nburst(%c32_i64, %c128_i64, %c128_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 - - pto.set_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] - pto.wait_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] - - pto.vecscope { - %_:1 = scf.for %offset = %c0 to %c1024 step %c64 iter_args(%remaining = %c1024_i32) -> (i32) { - %full_mask, %next_remaining = pto.plt_b32 %remaining : i32 -> !pto.mask, i32 - %gather_mask, %_tail = pto.plt_b32 %c8_i32 : i32 -> !pto.mask, i32 - %offsets = pto.vlds %ub_offsets[%offset] : !pto.ptr -> !pto.vreg<64xi32> - %out = pto.vgatherb %ub_in, %offsets, %gather_mask : !pto.ptr, !pto.vreg<64xi32>, !pto.mask -> !pto.vreg<64xf32> - pto.vsts %out, %ub_out[%offset], %full_mask : !pto.vreg<64xf32>, !pto.ptr, !pto.mask - scf.yield %next_remaining : i32 - } - } - - pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.mte_ub_gm %ub_out, %arg2, %c128_i64 - nburst(%c32_i64, %c128_i64, %c128_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64 - pto.barrier #pto.pipe - return - } -} diff --git a/test/vpto/cases/micro-op/gather-scatter/vgatherb/launch.cpp b/test/vpto/cases/micro-op/gather-scatter/vgatherb/launch.cpp deleted file mode 100644 index 589f236be..000000000 --- a/test/vpto/cases/micro-op/gather-scatter/vgatherb/launch.cpp +++ /dev/null @@ -1,73 +0,0 @@ -// Copyright (c) 2026 Huawei Technologies Co., Ltd. -// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). -// Please refer to the License for details. You may not use this file except in compliance with the License. -// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. 
-// See LICENSE in the root of the software repository for the full text of the License. - -// ----------------------------------------------------------------------------- -// case: micro-op/gather-scatter/vgatherb -// family: gather-scatter -// target_ops: pto.vgatherb -// scenarios: core-f32, full-mask, block-gather, aligned-base, load-effect-validation, no-alias -// NOTE: bulk-generated coverage skeleton. Parser/verifier/lowering failure is -// still a valid test conclusion in the current coverage-first phase. -// ----------------------------------------------------------------------------- -// --------------------------------------------------------------------------- -// PTOAS compatibility layer -// -// The upstream pto-isa headers reference some FP8/FP4 types and the -// __VEC_SCOPE__ marker that are not available on every AICore arch/toolchain -// combination (e.g. __NPU_ARCH__==2201). -// -// For our PTOAS-generated kernels we don't rely on these types today, but the -// headers still mention them in templates/static_asserts. Provide minimal -// fallbacks to keep compilation working on dav-c220. -// --------------------------------------------------------------------------- -#ifndef __VEC_SCOPE__ -#define __VEC_SCOPE__ -#endif - -#if defined(__CCE_AICORE__) && defined(__NPU_ARCH__) && (__NPU_ARCH__ == 2201) -typedef struct { unsigned char v; } hifloat8_t; -typedef struct { unsigned char v; } float8_e4m3_t; -typedef struct { unsigned char v; } float8_e5m2_t; -typedef struct { unsigned char v; } float8_e8m0_t; -typedef struct { unsigned char v; } float4_e1m2x2_t; -typedef struct { unsigned char v; } float4_e2m1x2_t; -#endif -#include - -// AICore printf support is gated behind `--cce-enable-print` on some -// toolchains. When enabled, include the CCE print header so `cce::printf` -// resolves in device compilation. 
-#if defined(__CCE_AICORE__) && defined(PTOAS_ENABLE_CCE_PRINT) -#include -#endif - -// Some PTO-ISA types are only available in the __CCE_AICORE__ compilation -// path, but `bisheng -xcce` still performs a host-side parse pass. -// Provide minimal fallbacks only when the corresponding header wasn't -// pulled in by the selected arch implementation. -#if !defined(__CCE_AICORE__) && !defined(TMRGSORT_HPP) -struct MrgSortExecutedNumList { - uint16_t mrgSortList0; - uint16_t mrgSortList1; - uint16_t mrgSortList2; - uint16_t mrgSortList3; -}; -#endif -#ifndef __CPU_SIM -#include "acl/acl.h" -#endif - -extern "C" __global__ [aicore] void vgatherb_kernel_2d(__gm__ float *v1, - __gm__ int *v2, - __gm__ float *v3); - -void LaunchVgatherb_kernel_2d(float *v1, int *v2, float *v3, void *stream) { - vgatherb_kernel_2d<<<1, nullptr, stream>>>((__gm__ float *)v1, - (__gm__ int *)v2, - (__gm__ float *)v3); -} diff --git a/test/vpto/cases/micro-op/gather-scatter/vgatherb/main.cpp b/test/vpto/cases/micro-op/gather-scatter/vgatherb/main.cpp deleted file mode 100644 index e16952c96..000000000 --- a/test/vpto/cases/micro-op/gather-scatter/vgatherb/main.cpp +++ /dev/null @@ -1,140 +0,0 @@ -// Copyright (c) 2026 Huawei Technologies Co., Ltd. -// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). -// Please refer to the License for details. You may not use this file except in compliance with the License. -// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -// See LICENSE in the root of the software repository for the full text of the License. 
- -// ----------------------------------------------------------------------------- -// case: micro-op/gather-scatter/vgatherb -// family: gather-scatter -// target_ops: pto.vgatherb -// scenarios: core-f32, full-mask, block-gather, aligned-base, load-effect-validation, no-alias -// NOTE: bulk-generated coverage skeleton. Parser/verifier/lowering failure is -// still a valid test conclusion in the current coverage-first phase. -// ----------------------------------------------------------------------------- -/** -Copyright (c) 2025 Huawei Technologies Co., Ltd. -This program is free software, you can redistribute it and/or modify it under the terms and conditions of -CANN Open Software License Agreement Version 2.0 (the "License"). -Please refer to the License for details. You may not use this file except in compliance with the License. -THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -See LICENSE in the root of the software repository for the full text of the License. 
-*/ - -#include "test_common.h" -#include "acl/acl.h" -#include -#include -#include - -using namespace PtoTestCommon; - -#ifndef TMRGSORT_HPP -struct MrgSortExecutedNumList { - uint16_t mrgSortList0; - uint16_t mrgSortList1; - uint16_t mrgSortList2; - uint16_t mrgSortList3; -}; -#endif - -#define ACL_CHECK(expr) \ - do { \ - const aclError _ret = (expr); \ - if (_ret != ACL_SUCCESS) { \ - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", #expr, (int)_ret, __FILE__, __LINE__); \ - const char *_recent = aclGetRecentErrMsg(); \ - if (_recent != nullptr && _recent[0] != '\0') { \ - std::fprintf(stderr, "[ERROR] RecentErrMsg: %s\n", _recent); \ - } \ - rc = 1; \ - goto cleanup; \ - } \ - } while (0) - -void LaunchVgatherb_kernel_2d(float *v1, int *v2, float *v3, void *stream); - -int main() { - size_t elemCount_v1 = 1024; - size_t fileSize_v1 = elemCount_v1 * sizeof(float); - size_t elemCount_v2 = 1024; - size_t fileSize_v2 = elemCount_v2 * sizeof(int); - size_t elemCount_v3 = 1024; - size_t fileSize_v3 = elemCount_v3 * sizeof(float); - float *v1Host = nullptr; - float *v1Device = nullptr; - int *v2Host = nullptr; - int *v2Device = nullptr; - float *v3Host = nullptr; - float *v3Device = nullptr; - - int rc = 0; - bool aclInited = false; - bool deviceSet = false; - int deviceId = 0; - aclrtStream stream = nullptr; - - ACL_CHECK(aclInit(nullptr)); - aclInited = true; - if (const char *envDevice = std::getenv("ACL_DEVICE_ID")) { - deviceId = std::atoi(envDevice); - } - ACL_CHECK(aclrtSetDevice(deviceId)); - deviceSet = true; - ACL_CHECK(aclrtCreateStream(&stream)); - - ACL_CHECK(aclrtMallocHost((void **)(&v1Host), fileSize_v1)); - ACL_CHECK(aclrtMallocHost((void **)(&v2Host), fileSize_v2)); - ACL_CHECK(aclrtMallocHost((void **)(&v3Host), fileSize_v3)); - ACL_CHECK(aclrtMalloc((void **)&v1Device, fileSize_v1, ACL_MEM_MALLOC_HUGE_FIRST)); - ACL_CHECK(aclrtMalloc((void **)&v2Device, fileSize_v2, ACL_MEM_MALLOC_HUGE_FIRST)); - ACL_CHECK(aclrtMalloc((void **)&v3Device, 
fileSize_v3, ACL_MEM_MALLOC_HUGE_FIRST)); - - ReadFile("./v1.bin", fileSize_v1, v1Host, fileSize_v1); - ReadFile("./v2.bin", fileSize_v2, v2Host, fileSize_v2); - ReadFile("./v3.bin", fileSize_v3, v3Host, fileSize_v3); - ACL_CHECK(aclrtMemcpy(v1Device, fileSize_v1, v1Host, fileSize_v1, ACL_MEMCPY_HOST_TO_DEVICE)); - ACL_CHECK(aclrtMemcpy(v2Device, fileSize_v2, v2Host, fileSize_v2, ACL_MEMCPY_HOST_TO_DEVICE)); - ACL_CHECK(aclrtMemcpy(v3Device, fileSize_v3, v3Host, fileSize_v3, ACL_MEMCPY_HOST_TO_DEVICE)); - LaunchVgatherb_kernel_2d(v1Device, v2Device, v3Device, stream); - - ACL_CHECK(aclrtSynchronizeStream(stream)); - ACL_CHECK(aclrtMemcpy(v3Host, fileSize_v3, v3Device, fileSize_v3, ACL_MEMCPY_DEVICE_TO_HOST)); - - WriteFile("./v3.bin", v3Host, fileSize_v3); - -cleanup: - aclrtFree(v1Device); - aclrtFree(v2Device); - aclrtFree(v3Device); - aclrtFreeHost(v1Host); - aclrtFreeHost(v2Host); - aclrtFreeHost(v3Host); - if (stream != nullptr) { - const aclError _ret = aclrtDestroyStream(stream); - if (_ret != ACL_SUCCESS) { - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", - "aclrtDestroyStream(stream)", (int)_ret, __FILE__, __LINE__); - } - stream = nullptr; - } - if (deviceSet) { - const aclError _ret = aclrtResetDevice(deviceId); - if (_ret != ACL_SUCCESS) { - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", - "aclrtResetDevice(deviceId)", (int)_ret, __FILE__, __LINE__); - } - } - if (aclInited) { - const aclError _ret = aclFinalize(); - if (_ret != ACL_SUCCESS) { - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", - "aclFinalize()", (int)_ret, __FILE__, __LINE__); - } - } - - return rc; -} diff --git a/test/vpto/cases/micro-op/gather-scatter/vscatter-out-of-order-index/compare.py b/test/vpto/cases/micro-op/gather-scatter/vscatter-out-of-order-index/compare.py deleted file mode 100755 index 016bfa5b7..000000000 --- a/test/vpto/cases/micro-op/gather-scatter/vscatter-out-of-order-index/compare.py +++ /dev/null @@ -1,209 +0,0 @@ -#!/usr/bin/env python3 
-# Copyright (c) 2026 Huawei Technologies Co., Ltd. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. - -# case: micro-op/gather-scatter/vscatter-out-of-order-index -# family: gather-scatter -# target_ops: pto.vscatter -# scenarios: core-f32, explicit-index-pattern, scatter-store, store-effect-validation, no-alias -# NOTE: bulk-generated coverage skeleton. -# coding=utf-8 - -import os -import sys -import numpy as np - - -REPEAT_BYTES = 256 - - -def _ceil_div(x, y): - return (x + y - 1) // y - - -def _packed_pred_storage_bytes(logical_elems, src_elem_bytes): - logical_elems = int(logical_elems) - src_elem_bytes = int(src_elem_bytes) - if logical_elems <= 0: - raise ValueError(f"logical_elems must be > 0, got {logical_elems}") - if src_elem_bytes not in (1, 2, 4): - raise ValueError(f"unsupported packed predicate source size: {src_elem_bytes}") - - repeat_elems = REPEAT_BYTES // src_elem_bytes - if src_elem_bytes == 4: - repeat_times = _ceil_div(logical_elems, repeat_elems) + 1 - loop_count = repeat_times // 2 - return loop_count * 16 - - repeat_times = _ceil_div(logical_elems, repeat_elems) - return repeat_times * (repeat_elems // 8) - - -def compare_bin(golden_path, output_path, dtype, eps): - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - dtype_np = np.dtype(dtype) - golden = 
np.fromfile(golden_path, dtype=dtype_np) - output = np.fromfile(output_path, dtype=dtype_np) - if golden.shape != output.shape: - print(f"[ERROR] Shape mismatch: {golden_path} {golden.shape} vs {output_path} {output.shape}") - return False - if not np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True): - if golden.size: - if np.issubdtype(dtype_np, np.floating): - g = golden.astype(np.float64, copy=False) - o = output.astype(np.float64, copy=False) - elif np.issubdtype(dtype_np, np.integer) or np.issubdtype(dtype_np, np.unsignedinteger): - g = golden.astype(np.int64, copy=False) - o = output.astype(np.int64, copy=False) - else: - g = golden.astype(np.float64, copy=False) - o = output.astype(np.float64, copy=False) - abs_diff = np.abs(g - o) - idx = int(np.argmax(abs_diff)) - diff = float(abs_diff[idx]) - print( - f"[ERROR] Mismatch: {golden_path} vs {output_path}, max diff={diff} at idx={idx} " - f"(golden={g[idx]}, out={o[idx]}, dtype={dtype_np})" - ) - else: - print(f"[ERROR] Mismatch: {golden_path} vs {output_path}, empty buffers, dtype={dtype_np}") - return False - return True - - -def compare_bin_prefix(golden_path, output_path, dtype, eps, count): - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - try: - count = int(count) - except Exception: - print(f"[ERROR] Invalid prefix count: {count}") - return False - if count <= 0: - print(f"[ERROR] Invalid prefix count: {count}") - return False - - dtype_np = np.dtype(dtype) - golden = np.fromfile(golden_path, dtype=dtype_np, count=count) - output = np.fromfile(output_path, dtype=dtype_np, count=count) - - if golden.size != count or output.size != count: - print( - f"[ERROR] Prefix read too small: need={count} elems, " - f"golden={golden.size}, out={output.size}" - ) - return False - - if not np.allclose(golden, output, atol=eps, rtol=eps, 
equal_nan=True): - if golden.size: - if np.issubdtype(dtype_np, np.floating): - g = golden.astype(np.float64, copy=False) - o = output.astype(np.float64, copy=False) - elif np.issubdtype(dtype_np, np.integer) or np.issubdtype(dtype_np, np.unsignedinteger): - g = golden.astype(np.int64, copy=False) - o = output.astype(np.int64, copy=False) - else: - g = golden.astype(np.float64, copy=False) - o = output.astype(np.float64, copy=False) - abs_diff = np.abs(g - o) - idx = int(np.argmax(abs_diff)) - diff = float(abs_diff[idx]) - print( - f"[ERROR] Mismatch (prefix): {golden_path} vs {output_path}, max diff={diff} at idx={idx} " - f"(golden={g[idx]}, out={o[idx]}, dtype={dtype_np}, count={count})" - ) - else: - print(f"[ERROR] Mismatch (prefix): {golden_path} vs {output_path}, empty buffers, dtype={dtype_np}") - return False - return True - - -def compare_packed_pred_mask(golden_path, output_path, logical_elems, src_elem_bytes): - """ - Compare outputs of pto.tcmp / pto.tcmps. - - PTO-ISA stores packed predicate results as a linear PK byte stream via - `psts`, with the exact written prefix length determined by the typed - TCMP/TCMPS repeat schedule. Compare only that semantic prefix. 
- """ - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - try: - logical_elems = int(logical_elems) - src_elem_bytes = int(src_elem_bytes) - except Exception: - print( - "[ERROR] Invalid packed mask compare arguments: " - f"logical_elems={logical_elems} src_elem_bytes={src_elem_bytes}" - ) - return False - if logical_elems <= 0 or src_elem_bytes <= 0: - print( - "[ERROR] Invalid packed mask compare arguments: " - f"logical_elems={logical_elems} src_elem_bytes={src_elem_bytes}" - ) - return False - - golden = np.fromfile(golden_path, dtype=np.uint8) - output = np.fromfile(output_path, dtype=np.uint8) - try: - prefix_bytes = _packed_pred_storage_bytes(logical_elems, src_elem_bytes) - except ValueError as exc: - print(f"[ERROR] {exc}") - return False - - if golden.size < prefix_bytes or output.size < prefix_bytes: - print( - f"[ERROR] Packed mask buffer too small: need={prefix_bytes} bytes, " - f"golden={golden.size}, out={output.size}" - ) - return False - - golden_sel = golden[:prefix_bytes] - output_sel = output[:prefix_bytes] - - if not np.array_equal(golden_sel, output_sel): - diff = np.nonzero(golden_sel != output_sel)[0] - idx = int(diff[0]) if diff.size else 0 - print( - f"[ERROR] Mismatch (packed mask): {golden_path} vs {output_path}, first diff at idx={idx} " - f"(golden={int(golden_sel[idx])}, out={int(output_sel[idx])}, " - f"logical_elems={logical_elems}, src_elem_bytes={src_elem_bytes}, prefix_bytes={prefix_bytes})" - ) - return False - return True - - -def main(): - strict = os.getenv("COMPARE_STRICT", "1") != "0" - ok = True - ok = compare_bin("golden_v3.bin", "v3.bin", np.float32, 0.0001) and ok - if not ok: - if strict: - print("[ERROR] compare failed") - sys.exit(2) - print("[WARN] compare failed (non-gating)") - return - print("[INFO] compare passed") - - -if __name__ == "__main__": - main() 
diff --git a/test/vpto/cases/micro-op/gather-scatter/vscatter-out-of-order-index/golden.py b/test/vpto/cases/micro-op/gather-scatter/vscatter-out-of-order-index/golden.py deleted file mode 100755 index 99761f514..000000000 --- a/test/vpto/cases/micro-op/gather-scatter/vscatter-out-of-order-index/golden.py +++ /dev/null @@ -1,58 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) 2026 Huawei Technologies Co., Ltd. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. - -# case: micro-op/gather-scatter/vscatter-out-of-order-index -# family: gather-scatter -# target_ops: pto.vscatter -# scenarios: core-f32, explicit-index-pattern, scatter-store, store-effect-validation, no-alias -# NOTE: bulk-generated coverage skeleton. 
-# coding=utf-8 - -import argparse -from pathlib import Path - -import numpy as np - - -ROWS = 32 -COLS = 32 -SEED = 19 - - -def generate(output_dir: Path, seed: int) -> None: - rng = np.random.default_rng(seed) - flat = rng.uniform(-8.0, 8.0, size=(ROWS * COLS,)).astype(np.float32) - offsets = ((np.arange(ROWS * COLS, dtype=np.int32) * 43) + 11) % (ROWS * COLS) - scattered = np.zeros((ROWS * COLS,), dtype=np.float32) - for base in range(0, ROWS * COLS, 64): - lanes = np.arange(base + 8, base + 64, dtype=np.int32) - scattered[offsets[lanes]] = flat[lanes] - v1 = flat.reshape(ROWS, COLS) - v2 = offsets.reshape(ROWS, COLS) - v3 = np.zeros((ROWS, COLS), dtype=np.float32) - - output_dir.mkdir(parents=True, exist_ok=True) - v1.reshape(-1).tofile(output_dir / "v1.bin") - v2.reshape(-1).tofile(output_dir / "v2.bin") - v3.reshape(-1).tofile(output_dir / "v3.bin") - scattered.reshape(-1).tofile(output_dir / "golden_v3.bin") - - -def main() -> None: - parser = argparse.ArgumentParser( - description="Generate numpy-based inputs/golden for VPTO micro-op vscatter validation." - ) - parser.add_argument("--output-dir", type=Path, default=Path(".")) - parser.add_argument("--seed", type=int, default=SEED) - args = parser.parse_args() - generate(args.output_dir, args.seed) - - -if __name__ == "__main__": - main() diff --git a/test/vpto/cases/micro-op/gather-scatter/vscatter-out-of-order-index/kernel.pto b/test/vpto/cases/micro-op/gather-scatter/vscatter-out-of-order-index/kernel.pto deleted file mode 100644 index d38f20f7f..000000000 --- a/test/vpto/cases/micro-op/gather-scatter/vscatter-out-of-order-index/kernel.pto +++ /dev/null @@ -1,62 +0,0 @@ -// ----------------------------------------------------------------------------- -// case: micro-op/gather-scatter/vscatter-out-of-order-index -// family: gather-scatter -// target_ops: pto.vscatter -// scenarios: core-f32, explicit-index-pattern, scatter-store, store-effect-validation, no-alias -// NOTE: bulk-generated coverage skeleton. 
Parser/verifier/lowering failure is -// still a valid test conclusion in the current coverage-first phase. -// ----------------------------------------------------------------------------- -module attributes {pto.target_arch = "a5", pto.kernel_kind = #pto.kernel_kind} { - func.func @vscatter_out_of_order_index_kernel_2d(%arg0: !pto.ptr, %arg1: !pto.ptr, - %arg2: !pto.ptr) attributes {pto.kernel} { - %c0 = arith.constant 0 : index - %c64 = arith.constant 64 : index - %c1024 = arith.constant 1024 : index - %c0_i64 = arith.constant 0 : i64 - %c1_i64 = arith.constant 1 : i64 - %c32_i64 = arith.constant 32 : i64 - %c128_i64 = arith.constant 128 : i64 - %c4096_i64 = arith.constant 4096 : i64 - %c8192_i64 = arith.constant 8192 : i64 - %c8_i32 = arith.constant 8 : i32 - %c1024_i32 = arith.constant 1024 : i32 - - %ub_in = pto.castptr %c0_i64 : i64 -> !pto.ptr - %ub_offsets = pto.castptr %c4096_i64 : i64 -> !pto.ptr - %ub_out = pto.castptr %c8192_i64 : i64 -> !pto.ptr - - %false = arith.constant false - pto.mte_gm_ub %arg0, %ub_in, %c0_i64, %c128_i64 - nburst(%c32_i64, %c128_i64, %c128_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 - pto.mte_gm_ub %arg1, %ub_offsets, %c0_i64, %c128_i64 - nburst(%c32_i64, %c128_i64, %c128_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 - pto.mte_gm_ub %arg2, %ub_out, %c0_i64, %c128_i64 - nburst(%c32_i64, %c128_i64, %c128_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 - - pto.set_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] - pto.wait_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] - - pto.vecscope { - %_:1 = scf.for %offset = %c0 to %c1024 step %c64 iter_args(%remaining = %c1024_i32) -> (i32) { - %prefix_mask, %next_remaining = pto.plt_b32 %c8_i32 : i32 -> !pto.mask, i32 - %full_mask = pto.pset_b32 "PAT_ALL" : !pto.mask - %suffix_mask = pto.pnot %prefix_mask, %full_mask : !pto.mask, !pto.mask -> !pto.mask - %vec = pto.vlds %ub_in[%offset] : !pto.ptr -> !pto.vreg<64xf32> - %offsets = pto.vlds %ub_offsets[%offset] : !pto.ptr -> 
!pto.vreg<64xi32> - pto.vscatter %vec, %ub_out, %offsets, %suffix_mask : !pto.vreg<64xf32>, !pto.ptr, !pto.vreg<64xi32>, !pto.mask - scf.yield %remaining : i32 - } - } - - pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.mte_ub_gm %ub_out, %arg2, %c128_i64 - nburst(%c32_i64, %c128_i64, %c128_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64 - pto.barrier #pto.pipe - return - } -} diff --git a/test/vpto/cases/micro-op/gather-scatter/vscatter-out-of-order-index/launch.cpp b/test/vpto/cases/micro-op/gather-scatter/vscatter-out-of-order-index/launch.cpp deleted file mode 100644 index 87b02ee5d..000000000 --- a/test/vpto/cases/micro-op/gather-scatter/vscatter-out-of-order-index/launch.cpp +++ /dev/null @@ -1,72 +0,0 @@ -// Copyright (c) 2026 Huawei Technologies Co., Ltd. -// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). -// Please refer to the License for details. You may not use this file except in compliance with the License. -// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -// See LICENSE in the root of the software repository for the full text of the License. - -// ----------------------------------------------------------------------------- -// case: micro-op/gather-scatter/vscatter-out-of-order-index -// family: gather-scatter -// target_ops: pto.vscatter -// scenarios: core-f32, explicit-index-pattern, scatter-store, store-effect-validation, no-alias -// NOTE: bulk-generated coverage skeleton. Parser/verifier/lowering failure is -// still a valid test conclusion in the current coverage-first phase. 
-// ----------------------------------------------------------------------------- -// --------------------------------------------------------------------------- -// PTOAS compatibility layer -// -// The upstream pto-isa headers reference some FP8/FP4 types and the -// __VEC_SCOPE__ marker that are not available on every AICore arch/toolchain -// combination (e.g. __NPU_ARCH__==2201). -// -// For our PTOAS-generated kernels we don't rely on these types today, but the -// headers still mention them in templates/static_asserts. Provide minimal -// fallbacks to keep compilation working on dav-c220. -// --------------------------------------------------------------------------- -#ifndef __VEC_SCOPE__ -#define __VEC_SCOPE__ -#endif - -#if defined(__CCE_AICORE__) && defined(__NPU_ARCH__) && (__NPU_ARCH__ == 2201) -typedef struct { unsigned char v; } hifloat8_t; -typedef struct { unsigned char v; } float8_e4m3_t; -typedef struct { unsigned char v; } float8_e5m2_t; -typedef struct { unsigned char v; } float8_e8m0_t; -typedef struct { unsigned char v; } float4_e1m2x2_t; -typedef struct { unsigned char v; } float4_e2m1x2_t; -#endif -#include - -// AICore printf support is gated behind `--cce-enable-print` on some -// toolchains. When enabled, include the CCE print header so `cce::printf` -// resolves in device compilation. -#if defined(__CCE_AICORE__) && defined(PTOAS_ENABLE_CCE_PRINT) -#include -#endif - -// Some PTO-ISA types are only available in the __CCE_AICORE__ compilation -// path, but `bisheng -xcce` still performs a host-side parse pass. -// Provide minimal fallbacks only when the corresponding header wasn't -// pulled in by the selected arch implementation. 
-#if !defined(__CCE_AICORE__) && !defined(TMRGSORT_HPP) -struct MrgSortExecutedNumList { - uint16_t mrgSortList0; - uint16_t mrgSortList1; - uint16_t mrgSortList2; - uint16_t mrgSortList3; -}; -#endif -#ifndef __CPU_SIM -#include "acl/acl.h" -#endif - -extern "C" __global__ [aicore] void vscatter_out_of_order_index_kernel_2d( - __gm__ float *v1, __gm__ int *v2, __gm__ float *v3); - -void LaunchVscatter_out_of_order_index_kernel_2d(float *v1, int *v2, - float *v3, void *stream) { - vscatter_out_of_order_index_kernel_2d<<<1, nullptr, stream>>>( - (__gm__ float *)v1, (__gm__ int *)v2, (__gm__ float *)v3); -} diff --git a/test/vpto/cases/micro-op/gather-scatter/vscatter-out-of-order-index/main.cpp b/test/vpto/cases/micro-op/gather-scatter/vscatter-out-of-order-index/main.cpp deleted file mode 100644 index f762aa293..000000000 --- a/test/vpto/cases/micro-op/gather-scatter/vscatter-out-of-order-index/main.cpp +++ /dev/null @@ -1,141 +0,0 @@ -// Copyright (c) 2026 Huawei Technologies Co., Ltd. -// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). -// Please refer to the License for details. You may not use this file except in compliance with the License. -// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -// See LICENSE in the root of the software repository for the full text of the License. - -// ----------------------------------------------------------------------------- -// case: micro-op/gather-scatter/vscatter-out-of-order-index -// family: gather-scatter -// target_ops: pto.vscatter -// scenarios: core-f32, explicit-index-pattern, scatter-store, store-effect-validation, no-alias -// NOTE: bulk-generated coverage skeleton. 
Parser/verifier/lowering failure is -// still a valid test conclusion in the current coverage-first phase. -// ----------------------------------------------------------------------------- -/** -Copyright (c) 2025 Huawei Technologies Co., Ltd. -This program is free software, you can redistribute it and/or modify it under the terms and conditions of -CANN Open Software License Agreement Version 2.0 (the "License"). -Please refer to the License for details. You may not use this file except in compliance with the License. -THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -See LICENSE in the root of the software repository for the full text of the License. -*/ - -#include "test_common.h" -#include "acl/acl.h" -#include -#include -#include - -using namespace PtoTestCommon; - -#ifndef TMRGSORT_HPP -struct MrgSortExecutedNumList { - uint16_t mrgSortList0; - uint16_t mrgSortList1; - uint16_t mrgSortList2; - uint16_t mrgSortList3; -}; -#endif - -#define ACL_CHECK(expr) \ - do { \ - const aclError _ret = (expr); \ - if (_ret != ACL_SUCCESS) { \ - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", #expr, (int)_ret, __FILE__, __LINE__); \ - const char *_recent = aclGetRecentErrMsg(); \ - if (_recent != nullptr && _recent[0] != '\0') { \ - std::fprintf(stderr, "[ERROR] RecentErrMsg: %s\n", _recent); \ - } \ - rc = 1; \ - goto cleanup; \ - } \ - } while (0) - -void LaunchVscatter_out_of_order_index_kernel_2d(float *v1, int *v2, float *v3, - void *stream); - -int main() { - size_t elemCount_v1 = 1024; - size_t fileSize_v1 = elemCount_v1 * sizeof(float); - size_t elemCount_v2 = 1024; - size_t fileSize_v2 = elemCount_v2 * sizeof(int); - size_t elemCount_v3 = 1024; - size_t fileSize_v3 = elemCount_v3 * sizeof(float); - float *v1Host = nullptr; - float *v1Device = nullptr; - int *v2Host = nullptr; - int *v2Device = 
nullptr; - float *v3Host = nullptr; - float *v3Device = nullptr; - - int rc = 0; - bool aclInited = false; - bool deviceSet = false; - int deviceId = 0; - aclrtStream stream = nullptr; - - ACL_CHECK(aclInit(nullptr)); - aclInited = true; - if (const char *envDevice = std::getenv("ACL_DEVICE_ID")) { - deviceId = std::atoi(envDevice); - } - ACL_CHECK(aclrtSetDevice(deviceId)); - deviceSet = true; - ACL_CHECK(aclrtCreateStream(&stream)); - - ACL_CHECK(aclrtMallocHost((void **)(&v1Host), fileSize_v1)); - ACL_CHECK(aclrtMallocHost((void **)(&v2Host), fileSize_v2)); - ACL_CHECK(aclrtMallocHost((void **)(&v3Host), fileSize_v3)); - ACL_CHECK(aclrtMalloc((void **)&v1Device, fileSize_v1, ACL_MEM_MALLOC_HUGE_FIRST)); - ACL_CHECK(aclrtMalloc((void **)&v2Device, fileSize_v2, ACL_MEM_MALLOC_HUGE_FIRST)); - ACL_CHECK(aclrtMalloc((void **)&v3Device, fileSize_v3, ACL_MEM_MALLOC_HUGE_FIRST)); - - ReadFile("./v1.bin", fileSize_v1, v1Host, fileSize_v1); - ReadFile("./v2.bin", fileSize_v2, v2Host, fileSize_v2); - ReadFile("./v3.bin", fileSize_v3, v3Host, fileSize_v3); - ACL_CHECK(aclrtMemcpy(v1Device, fileSize_v1, v1Host, fileSize_v1, ACL_MEMCPY_HOST_TO_DEVICE)); - ACL_CHECK(aclrtMemcpy(v2Device, fileSize_v2, v2Host, fileSize_v2, ACL_MEMCPY_HOST_TO_DEVICE)); - ACL_CHECK(aclrtMemcpy(v3Device, fileSize_v3, v3Host, fileSize_v3, ACL_MEMCPY_HOST_TO_DEVICE)); - LaunchVscatter_out_of_order_index_kernel_2d(v1Device, v2Device, v3Device, stream); - - ACL_CHECK(aclrtSynchronizeStream(stream)); - ACL_CHECK(aclrtMemcpy(v3Host, fileSize_v3, v3Device, fileSize_v3, ACL_MEMCPY_DEVICE_TO_HOST)); - - WriteFile("./v3.bin", v3Host, fileSize_v3); - -cleanup: - aclrtFree(v1Device); - aclrtFree(v2Device); - aclrtFree(v3Device); - aclrtFreeHost(v1Host); - aclrtFreeHost(v2Host); - aclrtFreeHost(v3Host); - if (stream != nullptr) { - const aclError _ret = aclrtDestroyStream(stream); - if (_ret != ACL_SUCCESS) { - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", - "aclrtDestroyStream(stream)", (int)_ret, 
__FILE__, __LINE__); - } - stream = nullptr; - } - if (deviceSet) { - const aclError _ret = aclrtResetDevice(deviceId); - if (_ret != ACL_SUCCESS) { - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", - "aclrtResetDevice(deviceId)", (int)_ret, __FILE__, __LINE__); - } - } - if (aclInited) { - const aclError _ret = aclFinalize(); - if (_ret != ACL_SUCCESS) { - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", - "aclFinalize()", (int)_ret, __FILE__, __LINE__); - } - } - - return rc; -} diff --git a/test/vpto/cases/micro-op/gather-scatter/vscatter/compare.py b/test/vpto/cases/micro-op/gather-scatter/vscatter/compare.py deleted file mode 100755 index ada19a30e..000000000 --- a/test/vpto/cases/micro-op/gather-scatter/vscatter/compare.py +++ /dev/null @@ -1,209 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) 2026 Huawei Technologies Co., Ltd. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. - -# case: micro-op/gather-scatter/vscatter -# family: gather-scatter -# target_ops: pto.vscatter -# scenarios: core-f32, full-mask, non-contiguous, explicit-index-pattern, scatter-store, store-effect-validation, no-alias -# NOTE: bulk-generated coverage skeleton. 
-# coding=utf-8 - -import os -import sys -import numpy as np - - -REPEAT_BYTES = 256 - - -def _ceil_div(x, y): - return (x + y - 1) // y - - -def _packed_pred_storage_bytes(logical_elems, src_elem_bytes): - logical_elems = int(logical_elems) - src_elem_bytes = int(src_elem_bytes) - if logical_elems <= 0: - raise ValueError(f"logical_elems must be > 0, got {logical_elems}") - if src_elem_bytes not in (1, 2, 4): - raise ValueError(f"unsupported packed predicate source size: {src_elem_bytes}") - - repeat_elems = REPEAT_BYTES // src_elem_bytes - if src_elem_bytes == 4: - repeat_times = _ceil_div(logical_elems, repeat_elems) + 1 - loop_count = repeat_times // 2 - return loop_count * 16 - - repeat_times = _ceil_div(logical_elems, repeat_elems) - return repeat_times * (repeat_elems // 8) - - -def compare_bin(golden_path, output_path, dtype, eps): - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - dtype_np = np.dtype(dtype) - golden = np.fromfile(golden_path, dtype=dtype_np) - output = np.fromfile(output_path, dtype=dtype_np) - if golden.shape != output.shape: - print(f"[ERROR] Shape mismatch: {golden_path} {golden.shape} vs {output_path} {output.shape}") - return False - if not np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True): - if golden.size: - if np.issubdtype(dtype_np, np.floating): - g = golden.astype(np.float64, copy=False) - o = output.astype(np.float64, copy=False) - elif np.issubdtype(dtype_np, np.integer) or np.issubdtype(dtype_np, np.unsignedinteger): - g = golden.astype(np.int64, copy=False) - o = output.astype(np.int64, copy=False) - else: - g = golden.astype(np.float64, copy=False) - o = output.astype(np.float64, copy=False) - abs_diff = np.abs(g - o) - idx = int(np.argmax(abs_diff)) - diff = float(abs_diff[idx]) - print( - f"[ERROR] Mismatch: {golden_path} vs {output_path}, max 
diff={diff} at idx={idx} " - f"(golden={g[idx]}, out={o[idx]}, dtype={dtype_np})" - ) - else: - print(f"[ERROR] Mismatch: {golden_path} vs {output_path}, empty buffers, dtype={dtype_np}") - return False - return True - - -def compare_bin_prefix(golden_path, output_path, dtype, eps, count): - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - try: - count = int(count) - except Exception: - print(f"[ERROR] Invalid prefix count: {count}") - return False - if count <= 0: - print(f"[ERROR] Invalid prefix count: {count}") - return False - - dtype_np = np.dtype(dtype) - golden = np.fromfile(golden_path, dtype=dtype_np, count=count) - output = np.fromfile(output_path, dtype=dtype_np, count=count) - - if golden.size != count or output.size != count: - print( - f"[ERROR] Prefix read too small: need={count} elems, " - f"golden={golden.size}, out={output.size}" - ) - return False - - if not np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True): - if golden.size: - if np.issubdtype(dtype_np, np.floating): - g = golden.astype(np.float64, copy=False) - o = output.astype(np.float64, copy=False) - elif np.issubdtype(dtype_np, np.integer) or np.issubdtype(dtype_np, np.unsignedinteger): - g = golden.astype(np.int64, copy=False) - o = output.astype(np.int64, copy=False) - else: - g = golden.astype(np.float64, copy=False) - o = output.astype(np.float64, copy=False) - abs_diff = np.abs(g - o) - idx = int(np.argmax(abs_diff)) - diff = float(abs_diff[idx]) - print( - f"[ERROR] Mismatch (prefix): {golden_path} vs {output_path}, max diff={diff} at idx={idx} " - f"(golden={g[idx]}, out={o[idx]}, dtype={dtype_np}, count={count})" - ) - else: - print(f"[ERROR] Mismatch (prefix): {golden_path} vs {output_path}, empty buffers, dtype={dtype_np}") - return False - return True - - -def compare_packed_pred_mask(golden_path, output_path, 
logical_elems, src_elem_bytes): - """ - Compare outputs of pto.tcmp / pto.tcmps. - - PTO-ISA stores packed predicate results as a linear PK byte stream via - `psts`, with the exact written prefix length determined by the typed - TCMP/TCMPS repeat schedule. Compare only that semantic prefix. - """ - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - try: - logical_elems = int(logical_elems) - src_elem_bytes = int(src_elem_bytes) - except Exception: - print( - "[ERROR] Invalid packed mask compare arguments: " - f"logical_elems={logical_elems} src_elem_bytes={src_elem_bytes}" - ) - return False - if logical_elems <= 0 or src_elem_bytes <= 0: - print( - "[ERROR] Invalid packed mask compare arguments: " - f"logical_elems={logical_elems} src_elem_bytes={src_elem_bytes}" - ) - return False - - golden = np.fromfile(golden_path, dtype=np.uint8) - output = np.fromfile(output_path, dtype=np.uint8) - try: - prefix_bytes = _packed_pred_storage_bytes(logical_elems, src_elem_bytes) - except ValueError as exc: - print(f"[ERROR] {exc}") - return False - - if golden.size < prefix_bytes or output.size < prefix_bytes: - print( - f"[ERROR] Packed mask buffer too small: need={prefix_bytes} bytes, " - f"golden={golden.size}, out={output.size}" - ) - return False - - golden_sel = golden[:prefix_bytes] - output_sel = output[:prefix_bytes] - - if not np.array_equal(golden_sel, output_sel): - diff = np.nonzero(golden_sel != output_sel)[0] - idx = int(diff[0]) if diff.size else 0 - print( - f"[ERROR] Mismatch (packed mask): {golden_path} vs {output_path}, first diff at idx={idx} " - f"(golden={int(golden_sel[idx])}, out={int(output_sel[idx])}, " - f"logical_elems={logical_elems}, src_elem_bytes={src_elem_bytes}, prefix_bytes={prefix_bytes})" - ) - return False - return True - - -def main(): - strict = os.getenv("COMPARE_STRICT", "1") != "0" 
- ok = True - ok = compare_bin("golden_v3.bin", "v3.bin", np.float32, 0.0001) and ok - if not ok: - if strict: - print("[ERROR] compare failed") - sys.exit(2) - print("[WARN] compare failed (non-gating)") - return - print("[INFO] compare passed") - - -if __name__ == "__main__": - main() diff --git a/test/vpto/cases/micro-op/gather-scatter/vscatter/golden.py b/test/vpto/cases/micro-op/gather-scatter/vscatter/golden.py deleted file mode 100755 index 252356095..000000000 --- a/test/vpto/cases/micro-op/gather-scatter/vscatter/golden.py +++ /dev/null @@ -1,56 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) 2026 Huawei Technologies Co., Ltd. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. - -# case: micro-op/gather-scatter/vscatter -# family: gather-scatter -# target_ops: pto.vscatter -# scenarios: core-f32, full-mask, non-contiguous, explicit-index-pattern, scatter-store, store-effect-validation, no-alias -# NOTE: bulk-generated coverage skeleton. 
-# coding=utf-8 - -import argparse -from pathlib import Path - -import numpy as np - - -ROWS = 32 -COLS = 32 -SEED = 19 - - -def generate(output_dir: Path, seed: int) -> None: - rng = np.random.default_rng(seed) - flat = rng.uniform(-8.0, 8.0, size=(ROWS * COLS,)).astype(np.float32) - offsets = ((np.arange(ROWS * COLS, dtype=np.int32) * 29) + 7) % (ROWS * COLS) - scattered = np.zeros((ROWS * COLS,), dtype=np.float32) - scattered[offsets] = flat - v1 = flat.reshape(ROWS, COLS) - v2 = offsets.reshape(ROWS, COLS) - v3 = np.zeros((ROWS, COLS), dtype=np.float32) - - output_dir.mkdir(parents=True, exist_ok=True) - v1.reshape(-1).tofile(output_dir / "v1.bin") - v2.reshape(-1).tofile(output_dir / "v2.bin") - v3.reshape(-1).tofile(output_dir / "v3.bin") - scattered.reshape(-1).tofile(output_dir / "golden_v3.bin") - - -def main() -> None: - parser = argparse.ArgumentParser( - description="Generate numpy-based inputs/golden for VPTO micro-op vscatter validation." - ) - parser.add_argument("--output-dir", type=Path, default=Path(".")) - parser.add_argument("--seed", type=int, default=SEED) - args = parser.parse_args() - generate(args.output_dir, args.seed) - - -if __name__ == "__main__": - main() diff --git a/test/vpto/cases/micro-op/gather-scatter/vscatter/kernel.pto b/test/vpto/cases/micro-op/gather-scatter/vscatter/kernel.pto deleted file mode 100644 index 7849e4824..000000000 --- a/test/vpto/cases/micro-op/gather-scatter/vscatter/kernel.pto +++ /dev/null @@ -1,78 +0,0 @@ -// ----------------------------------------------------------------------------- -// case: micro-op/gather-scatter/vscatter -// family: gather-scatter -// target_ops: pto.vscatter -// scenarios: core-f32, full-mask, non-contiguous, explicit-index-pattern, scatter-store, store-effect-validation, no-alias -// NOTE: bulk-generated coverage skeleton. Parser/verifier/lowering failure is -// still a valid test conclusion in the current coverage-first phase. 
-// ----------------------------------------------------------------------------- -// ============================================================================= -// abs_kernel_2d: Element-wise absolute value on a 32x32 f32 tile -// ============================================================================= -// This kernel computes abs(input) for a 32x32 float32 matrix (1024 elements). -// -// Memory Layout: -// - Input: arg0 -> GM (Global Memory) -// - Output: arg1 -> GM (Global Memory) -// - UB (Unified Buffer) at offset 0: input tile (4096 bytes = 32*32*4) -// - UB at offset 4096: output tile (4096 bytes = 32*32*4) -// -// Pipeline: -// 1. DMA: GM -> UB (MTE2 pipe) - copy input tile to UB -// 2. Sync: wait for MTE2 -> V pipe handoff -// 3. Compute: vabs on 64-element vectors (V pipe) - 16 iterations for 1024 elements -// 4. Sync: wait for V -> MTE3 pipe handoff -// 5. DMA: UB -> GM (MTE3 pipe) - copy result tile back to GM -// ============================================================================= - -module attributes {pto.target_arch = "a5", pto.kernel_kind = #pto.kernel_kind} { - func.func @vscatter_kernel_2d(%arg0: !pto.ptr, %arg1: !pto.ptr, - %arg2: !pto.ptr) attributes {pto.kernel} { - %c0 = arith.constant 0 : index - %c64 = arith.constant 64 : index - %c1024 = arith.constant 1024 : index - %c0_i64 = arith.constant 0 : i64 - %c1_i64 = arith.constant 1 : i64 - %c32_i64 = arith.constant 32 : i64 - %c128_i64 = arith.constant 128 : i64 - %c4096_i64 = arith.constant 4096 : i64 - %c8192_i64 = arith.constant 8192 : i64 - %c1024_i32 = arith.constant 1024 : i32 - - %ub_in = pto.castptr %c0_i64 : i64 -> !pto.ptr - %ub_offsets = pto.castptr %c4096_i64 : i64 -> !pto.ptr - %ub_out = pto.castptr %c8192_i64 : i64 -> !pto.ptr - - %false = arith.constant false - pto.mte_gm_ub %arg0, %ub_in, %c0_i64, %c128_i64 - nburst(%c32_i64, %c128_i64, %c128_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 - pto.mte_gm_ub %arg1, %ub_offsets, %c0_i64, %c128_i64 - 
nburst(%c32_i64, %c128_i64, %c128_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 - pto.mte_gm_ub %arg2, %ub_out, %c0_i64, %c128_i64 - nburst(%c32_i64, %c128_i64, %c128_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 - - pto.set_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] - pto.wait_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] - - pto.vecscope { - %_:1 = scf.for %offset = %c0 to %c1024 step %c64 iter_args(%remaining = %c1024_i32) -> (i32) { - %mask, %next_remaining = pto.plt_b32 %remaining : i32 -> !pto.mask, i32 - %vec = pto.vlds %ub_in[%offset] : !pto.ptr -> !pto.vreg<64xf32> - %offsets = pto.vlds %ub_offsets[%offset] : !pto.ptr -> !pto.vreg<64xi32> - pto.vscatter %vec, %ub_out, %offsets, %mask : !pto.vreg<64xf32>, !pto.ptr, !pto.vreg<64xi32>, !pto.mask - scf.yield %next_remaining : i32 - } - } - - pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.mte_ub_gm %ub_out, %arg2, %c128_i64 - nburst(%c32_i64, %c128_i64, %c128_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64 - pto.barrier #pto.pipe - return - } -} diff --git a/test/vpto/cases/micro-op/gather-scatter/vscatter/launch.cpp b/test/vpto/cases/micro-op/gather-scatter/vscatter/launch.cpp deleted file mode 100644 index 79296467b..000000000 --- a/test/vpto/cases/micro-op/gather-scatter/vscatter/launch.cpp +++ /dev/null @@ -1,73 +0,0 @@ -// Copyright (c) 2026 Huawei Technologies Co., Ltd. -// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). -// Please refer to the License for details. You may not use this file except in compliance with the License. -// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. 
-// See LICENSE in the root of the software repository for the full text of the License. - -// ----------------------------------------------------------------------------- -// case: micro-op/gather-scatter/vscatter -// family: gather-scatter -// target_ops: pto.vscatter -// scenarios: core-f32, full-mask, non-contiguous, explicit-index-pattern, scatter-store, store-effect-validation, no-alias -// NOTE: bulk-generated coverage skeleton. Parser/verifier/lowering failure is -// still a valid test conclusion in the current coverage-first phase. -// ----------------------------------------------------------------------------- -// --------------------------------------------------------------------------- -// PTOAS compatibility layer -// -// The upstream pto-isa headers reference some FP8/FP4 types and the -// __VEC_SCOPE__ marker that are not available on every AICore arch/toolchain -// combination (e.g. __NPU_ARCH__==2201). -// -// For our PTOAS-generated kernels we don't rely on these types today, but the -// headers still mention them in templates/static_asserts. Provide minimal -// fallbacks to keep compilation working on dav-c220. -// --------------------------------------------------------------------------- -#ifndef __VEC_SCOPE__ -#define __VEC_SCOPE__ -#endif - -#if defined(__CCE_AICORE__) && defined(__NPU_ARCH__) && (__NPU_ARCH__ == 2201) -typedef struct { unsigned char v; } hifloat8_t; -typedef struct { unsigned char v; } float8_e4m3_t; -typedef struct { unsigned char v; } float8_e5m2_t; -typedef struct { unsigned char v; } float8_e8m0_t; -typedef struct { unsigned char v; } float4_e1m2x2_t; -typedef struct { unsigned char v; } float4_e2m1x2_t; -#endif -#include - -// AICore printf support is gated behind `--cce-enable-print` on some -// toolchains. When enabled, include the CCE print header so `cce::printf` -// resolves in device compilation. 
-#if defined(__CCE_AICORE__) && defined(PTOAS_ENABLE_CCE_PRINT) -#include -#endif - -// Some PTO-ISA types are only available in the __CCE_AICORE__ compilation -// path, but `bisheng -xcce` still performs a host-side parse pass. -// Provide minimal fallbacks only when the corresponding header wasn't -// pulled in by the selected arch implementation. -#if !defined(__CCE_AICORE__) && !defined(TMRGSORT_HPP) -struct MrgSortExecutedNumList { - uint16_t mrgSortList0; - uint16_t mrgSortList1; - uint16_t mrgSortList2; - uint16_t mrgSortList3; -}; -#endif -#ifndef __CPU_SIM -#include "acl/acl.h" -#endif - -extern "C" __global__ [aicore] void vscatter_kernel_2d(__gm__ float *v1, - __gm__ int *v2, - __gm__ float *v3); - -void LaunchVscatter_kernel_2d(float *v1, int *v2, float *v3, void *stream) { - vscatter_kernel_2d<<<1, nullptr, stream>>>((__gm__ float *)v1, - (__gm__ int *)v2, - (__gm__ float *)v3); -} diff --git a/test/vpto/cases/micro-op/gather-scatter/vscatter/main.cpp b/test/vpto/cases/micro-op/gather-scatter/vscatter/main.cpp deleted file mode 100644 index 613ee0282..000000000 --- a/test/vpto/cases/micro-op/gather-scatter/vscatter/main.cpp +++ /dev/null @@ -1,140 +0,0 @@ -// Copyright (c) 2026 Huawei Technologies Co., Ltd. -// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). -// Please refer to the License for details. You may not use this file except in compliance with the License. -// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -// See LICENSE in the root of the software repository for the full text of the License. 
- -// ----------------------------------------------------------------------------- -// case: micro-op/gather-scatter/vscatter -// family: gather-scatter -// target_ops: pto.vscatter -// scenarios: core-f32, full-mask, non-contiguous, explicit-index-pattern, scatter-store, store-effect-validation, no-alias -// NOTE: bulk-generated coverage skeleton. Parser/verifier/lowering failure is -// still a valid test conclusion in the current coverage-first phase. -// ----------------------------------------------------------------------------- -/** -Copyright (c) 2025 Huawei Technologies Co., Ltd. -This program is free software, you can redistribute it and/or modify it under the terms and conditions of -CANN Open Software License Agreement Version 2.0 (the "License"). -Please refer to the License for details. You may not use this file except in compliance with the License. -THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -See LICENSE in the root of the software repository for the full text of the License. 
-*/ - -#include "test_common.h" -#include "acl/acl.h" -#include -#include -#include - -using namespace PtoTestCommon; - -#ifndef TMRGSORT_HPP -struct MrgSortExecutedNumList { - uint16_t mrgSortList0; - uint16_t mrgSortList1; - uint16_t mrgSortList2; - uint16_t mrgSortList3; -}; -#endif - -#define ACL_CHECK(expr) \ - do { \ - const aclError _ret = (expr); \ - if (_ret != ACL_SUCCESS) { \ - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", #expr, (int)_ret, __FILE__, __LINE__); \ - const char *_recent = aclGetRecentErrMsg(); \ - if (_recent != nullptr && _recent[0] != '\0') { \ - std::fprintf(stderr, "[ERROR] RecentErrMsg: %s\n", _recent); \ - } \ - rc = 1; \ - goto cleanup; \ - } \ - } while (0) - -void LaunchVscatter_kernel_2d(float *v1, int *v2, float *v3, void *stream); - -int main() { - size_t elemCount_v1 = 1024; - size_t fileSize_v1 = elemCount_v1 * sizeof(float); - size_t elemCount_v2 = 1024; - size_t fileSize_v2 = elemCount_v2 * sizeof(int); - size_t elemCount_v3 = 1024; - size_t fileSize_v3 = elemCount_v3 * sizeof(float); - float *v1Host = nullptr; - float *v1Device = nullptr; - int *v2Host = nullptr; - int *v2Device = nullptr; - float *v3Host = nullptr; - float *v3Device = nullptr; - - int rc = 0; - bool aclInited = false; - bool deviceSet = false; - int deviceId = 0; - aclrtStream stream = nullptr; - - ACL_CHECK(aclInit(nullptr)); - aclInited = true; - if (const char *envDevice = std::getenv("ACL_DEVICE_ID")) { - deviceId = std::atoi(envDevice); - } - ACL_CHECK(aclrtSetDevice(deviceId)); - deviceSet = true; - ACL_CHECK(aclrtCreateStream(&stream)); - - ACL_CHECK(aclrtMallocHost((void **)(&v1Host), fileSize_v1)); - ACL_CHECK(aclrtMallocHost((void **)(&v2Host), fileSize_v2)); - ACL_CHECK(aclrtMallocHost((void **)(&v3Host), fileSize_v3)); - ACL_CHECK(aclrtMalloc((void **)&v1Device, fileSize_v1, ACL_MEM_MALLOC_HUGE_FIRST)); - ACL_CHECK(aclrtMalloc((void **)&v2Device, fileSize_v2, ACL_MEM_MALLOC_HUGE_FIRST)); - ACL_CHECK(aclrtMalloc((void **)&v3Device, 
fileSize_v3, ACL_MEM_MALLOC_HUGE_FIRST)); - - ReadFile("./v1.bin", fileSize_v1, v1Host, fileSize_v1); - ReadFile("./v2.bin", fileSize_v2, v2Host, fileSize_v2); - ReadFile("./v3.bin", fileSize_v3, v3Host, fileSize_v3); - ACL_CHECK(aclrtMemcpy(v1Device, fileSize_v1, v1Host, fileSize_v1, ACL_MEMCPY_HOST_TO_DEVICE)); - ACL_CHECK(aclrtMemcpy(v2Device, fileSize_v2, v2Host, fileSize_v2, ACL_MEMCPY_HOST_TO_DEVICE)); - ACL_CHECK(aclrtMemcpy(v3Device, fileSize_v3, v3Host, fileSize_v3, ACL_MEMCPY_HOST_TO_DEVICE)); - LaunchVscatter_kernel_2d(v1Device, v2Device, v3Device, stream); - - ACL_CHECK(aclrtSynchronizeStream(stream)); - ACL_CHECK(aclrtMemcpy(v3Host, fileSize_v3, v3Device, fileSize_v3, ACL_MEMCPY_DEVICE_TO_HOST)); - - WriteFile("./v3.bin", v3Host, fileSize_v3); - -cleanup: - aclrtFree(v1Device); - aclrtFree(v2Device); - aclrtFree(v3Device); - aclrtFreeHost(v1Host); - aclrtFreeHost(v2Host); - aclrtFreeHost(v3Host); - if (stream != nullptr) { - const aclError _ret = aclrtDestroyStream(stream); - if (_ret != ACL_SUCCESS) { - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", - "aclrtDestroyStream(stream)", (int)_ret, __FILE__, __LINE__); - } - stream = nullptr; - } - if (deviceSet) { - const aclError _ret = aclrtResetDevice(deviceId); - if (_ret != ACL_SUCCESS) { - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", - "aclrtResetDevice(deviceId)", (int)_ret, __FILE__, __LINE__); - } - } - if (aclInited) { - const aclError _ret = aclFinalize(); - if (_ret != ACL_SUCCESS) { - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", - "aclFinalize()", (int)_ret, __FILE__, __LINE__); - } - } - - return rc; -} diff --git a/test/vpto/cases/micro-op/materialization-predicate/pand/kernel.pto b/test/vpto/cases/micro-op/materialization-predicate/pand/kernel.pto index 2b65c8fdd..f702ceffa 100644 --- a/test/vpto/cases/micro-op/materialization-predicate/pand/kernel.pto +++ b/test/vpto/cases/micro-op/materialization-predicate/pand/kernel.pto @@ -51,6 +51,96 @@ module attributes 
{pto.target_arch = "a5", pto.kernel_kind = #pto.kernel_kind, !pto.ptr, i64, i64, i64, i64 pto.barrier #pto.pipe + %__case_merge_guard = arith.constant false + + // inactive cross-case merge from micro-op/materialization-predicate/pnot + scf.if %__case_merge_guard { + + %c0_cmg7_1 = arith.constant 0 : index + %c1_cmg7_1 = arith.constant 1 : index + %c32_cmg7_1 = arith.constant 32 : index + %c0_i64_cmg7_1 = arith.constant 0 : i64 + %c1_i64_cmg7_1 = arith.constant 1 : i64 + %c32_i64_cmg7_1 = arith.constant 32 : i64 + %c13_cmg7_1 = arith.constant 13 : i32 + + %ub_out_cmg7_1 = pto.castptr %c0_i64_cmg7_1 : i64 -> !pto.ptr + %gm_out_cmg7_1 = pto.castptr %arg0 : !pto.ptr -> !pto.ptr + + pto.vecscope { + scf.for %iter_cmg7_1 = %c0_cmg7_1 to %c1_cmg7_1 step %c1_cmg7_1 { + %all_cmg7_1 = pto.pset_b32 "PAT_ALL" : !pto.mask + %half_cmg7_1, %next_cmg7_1 = pto.plt_b32 %c13_cmg7_1 : i32 -> !pto.mask, i32 + %out_cmg7_1 = pto.pnot %half_cmg7_1, %all_cmg7_1 : !pto.mask, !pto.mask -> !pto.mask + pto.psts %out_cmg7_1, %ub_out_cmg7_1[%c0_cmg7_1], "NORM" : !pto.mask, !pto.ptr, index + } + } + + pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] + pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] + pto.mte_ub_gm %ub_out_cmg7_1, %gm_out_cmg7_1, %c32_i64_cmg7_1 + nburst(%c1_i64_cmg7_1, %c32_i64_cmg7_1, %c32_i64_cmg7_1) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64 + pto.barrier #pto.pipe + } + + // inactive cross-case merge from micro-op/materialization-predicate/por + scf.if %__case_merge_guard { + + %c0_cmg7_2 = arith.constant 0 : index + %c0_i64_cmg7_2 = arith.constant 0 : i64 + %c1_i64_cmg7_2 = arith.constant 1 : i64 + %c32_i64_cmg7_2 = arith.constant 32 : i64 + %c7_cmg7_2 = arith.constant 7 : i32 + %c13_cmg7_2 = arith.constant 13 : i32 + + %ub_out_cmg7_2 = pto.castptr %c0_i64_cmg7_2 : i64 -> !pto.ptr + %gm_out_cmg7_2 = pto.castptr %arg0 : !pto.ptr -> !pto.ptr + + pto.vecscope { + %all_cmg7_2 = pto.pset_b32 "PAT_ALL" : !pto.mask + %lhs_cmg7_2, %lhs_next_cmg7_2 = pto.plt_b32 %c13_cmg7_2 : i32 
-> !pto.mask, i32 + %rhs_cmg7_2, %rhs_next_cmg7_2 = pto.plt_b32 %c7_cmg7_2 : i32 -> !pto.mask, i32 + %out_cmg7_2 = pto.por %lhs_cmg7_2, %rhs_cmg7_2, %all_cmg7_2 : !pto.mask, !pto.mask, !pto.mask -> !pto.mask + pto.psts %out_cmg7_2, %ub_out_cmg7_2[%c0_cmg7_2], "NORM" : !pto.mask, !pto.ptr, index + } + + pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] + pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] + pto.mte_ub_gm %ub_out_cmg7_2, %gm_out_cmg7_2, %c32_i64_cmg7_2 + nburst(%c1_i64_cmg7_2, %c32_i64_cmg7_2, %c32_i64_cmg7_2) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64 + pto.barrier #pto.pipe + } + + // inactive cross-case merge from micro-op/materialization-predicate/pxor + scf.if %__case_merge_guard { + + %c0_cmg7_3 = arith.constant 0 : index + %c0_i64_cmg7_3 = arith.constant 0 : i64 + %c1_i64_cmg7_3 = arith.constant 1 : i64 + %c32_i64_cmg7_3 = arith.constant 32 : i64 + %c7_cmg7_3 = arith.constant 7 : i32 + %c13_cmg7_3 = arith.constant 13 : i32 + + %ub_out_cmg7_3 = pto.castptr %c0_i64_cmg7_3 : i64 -> !pto.ptr + %gm_out_cmg7_3 = pto.castptr %arg0 : !pto.ptr -> !pto.ptr + + pto.vecscope { + %all_cmg7_3 = pto.pset_b32 "PAT_ALL" : !pto.mask + %lhs_cmg7_3, %lhs_next_cmg7_3 = pto.plt_b32 %c13_cmg7_3 : i32 -> !pto.mask, i32 + %rhs_cmg7_3, %rhs_next_cmg7_3 = pto.plt_b32 %c7_cmg7_3 : i32 -> !pto.mask, i32 + %out_cmg7_3 = pto.pxor %lhs_cmg7_3, %rhs_cmg7_3, %all_cmg7_3 : !pto.mask, !pto.mask, !pto.mask -> !pto.mask + pto.psts %out_cmg7_3, %ub_out_cmg7_3[%c0_cmg7_3], "NORM" : !pto.mask, !pto.ptr, index + } + + pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] + pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] + pto.mte_ub_gm %ub_out_cmg7_3, %gm_out_cmg7_3, %c32_i64_cmg7_3 + nburst(%c1_i64_cmg7_3, %c32_i64_cmg7_3, %c32_i64_cmg7_3) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64 + pto.barrier #pto.pipe + } return } } diff --git a/test/vpto/cases/micro-op/materialization-predicate/pdintlv_b16-nontrivial/compare.py 
b/test/vpto/cases/micro-op/materialization-predicate/pdintlv_b16-nontrivial/compare.py deleted file mode 100755 index 25e69c8a2..000000000 --- a/test/vpto/cases/micro-op/materialization-predicate/pdintlv_b16-nontrivial/compare.py +++ /dev/null @@ -1,64 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) 2026 Huawei Technologies Co., Ltd. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. - -# case: micro-op/materialization-predicate/pdintlv_b16-nontrivial -# family: materialization-predicate -# target_ops: pto.pdintlv_b16 -# scenarios: predicate-transform, lane-order, representative-logical-elements -# coding=utf-8 - -import os -import sys - -import numpy as np - - -EXPECTED_WORDS = 32 -PREFIX_WORDS = 16 - - -def compare_words(golden_path, output_path): - if not os.path.exists(output_path) or not os.path.exists(golden_path): - print(f"[ERROR] Missing file: golden={golden_path} out={output_path}") - return False - golden = np.fromfile(golden_path, dtype=np.uint32) - output = np.fromfile(output_path, dtype=np.uint32) - if golden.size != EXPECTED_WORDS or output.size != EXPECTED_WORDS: - print( - f"[ERROR] Unexpected word count: golden={golden.size} " - f"out={output.size} expected={EXPECTED_WORDS}" - ) - return False - golden = golden[:PREFIX_WORDS] - output = output[:PREFIX_WORDS] - if not np.array_equal(golden, output): - diff = np.nonzero(golden != output)[0] - idx = int(diff[0]) if diff.size else 0 - print( - f"[ERROR] Mismatch (packed 
predicate words): idx={idx} " - f"golden={int(golden[idx])} out={int(output[idx])}" - ) - return False - return True - - -def main(): - strict = os.getenv("COMPARE_STRICT", "1") != "0" - ok = compare_words("golden_v1.bin", "v1.bin") - if not ok: - if strict: - print("[ERROR] compare failed") - sys.exit(2) - print("[WARN] compare failed (non-gating)") - return - print("[INFO] compare passed") - - -if __name__ == "__main__": - main() diff --git a/test/vpto/cases/micro-op/materialization-predicate/pdintlv_b16-nontrivial/golden.py b/test/vpto/cases/micro-op/materialization-predicate/pdintlv_b16-nontrivial/golden.py deleted file mode 100755 index 814eb34b5..000000000 --- a/test/vpto/cases/micro-op/materialization-predicate/pdintlv_b16-nontrivial/golden.py +++ /dev/null @@ -1,49 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) 2026 Huawei Technologies Co., Ltd. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. 
- -# case: micro-op/materialization-predicate/pdintlv_b16-nontrivial -# family: materialization-predicate -# target_ops: pto.pdintlv_b16 -# scenarios: predicate-transform, lane-order, representative-logical-elements -# coding=utf-8 - -import argparse -from pathlib import Path - -import numpy as np - - -SEED = 19 -OUTPUT_WORDS = 32 -GOLDEN_PREFIX_WORDS = np.array([85, 0, 0, 0, 286331153, 286331153, 286331153, 286331153, 85, 0, 0, 0, 0, 0, 0, 0], dtype=np.uint32) - - -def generate(output_dir: Path, seed: int) -> None: - del seed - output_init = np.zeros((OUTPUT_WORDS,), dtype=np.uint32) - golden = np.zeros((OUTPUT_WORDS,), dtype=np.uint32) - golden[: GOLDEN_PREFIX_WORDS.size] = GOLDEN_PREFIX_WORDS - - output_dir.mkdir(parents=True, exist_ok=True) - output_init.tofile(output_dir / "v1.bin") - golden.tofile(output_dir / "golden_v1.bin") - - -def main() -> None: - parser = argparse.ArgumentParser( - description="Generate packed predicate golden for VPTO micro-op validation." - ) - parser.add_argument("--output-dir", type=Path, default=Path(".")) - parser.add_argument("--seed", type=int, default=SEED) - args = parser.parse_args() - generate(args.output_dir, args.seed) - - -if __name__ == "__main__": - main() diff --git a/test/vpto/cases/micro-op/materialization-predicate/pdintlv_b16-nontrivial/kernel.pto b/test/vpto/cases/micro-op/materialization-predicate/pdintlv_b16-nontrivial/kernel.pto deleted file mode 100644 index 16cc2a84d..000000000 --- a/test/vpto/cases/micro-op/materialization-predicate/pdintlv_b16-nontrivial/kernel.pto +++ /dev/null @@ -1,38 +0,0 @@ -// ----------------------------------------------------------------------------- -// case: micro-op/materialization-predicate/pdintlv_b16-nontrivial -// family: materialization-predicate -// target_ops: pto.pdintlv_b16 -// scenarios: predicate-transform, lane-order, nontrivial-pattern -// ----------------------------------------------------------------------------- -module attributes {pto.target_arch = "a5", 
pto.kernel_kind = #pto.kernel_kind} { - func.func @pdintlv_b16_nontrivial_kernel_2d(%arg0: !pto.ptr) attributes {pto.kernel} { - %c0 = arith.constant 0 : index - %c1 = arith.constant 1 : index - %c32 = arith.constant 32 : index - %c16 = arith.constant 16 : index - %c0_i64 = arith.constant 0 : i64 - %c1_i64 = arith.constant 1 : i64 - %c64_i64 = arith.constant 64 : i64 - - %ub_out = pto.castptr %c0_i64 : i64 -> !pto.ptr - %gm_out = pto.castptr %arg0 : !pto.ptr -> !pto.ptr - - pto.vecscope { - scf.for %iter = %c0 to %c1 step %c1 { - %lhs = pto.pset_b16 "PAT_VL8" : !pto.mask - %rhs = pto.pset_b16 "PAT_M4" : !pto.mask - %low, %high = pto.pdintlv_b16 %lhs, %rhs : !pto.mask, !pto.mask -> !pto.mask, !pto.mask - pto.psts %low, %ub_out[%c0], "NORM" : !pto.mask, !pto.ptr, index - pto.psts %high, %ub_out[%c32], "NORM" : !pto.mask, !pto.ptr, index - } - } - - pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.mte_ub_gm %ub_out, %gm_out, %c64_i64 - nburst(%c1_i64, %c64_i64, %c64_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64 - pto.barrier #pto.pipe - return - } -} diff --git a/test/vpto/cases/micro-op/materialization-predicate/pdintlv_b16-nontrivial/launch.cpp b/test/vpto/cases/micro-op/materialization-predicate/pdintlv_b16-nontrivial/launch.cpp deleted file mode 100644 index 182f92536..000000000 --- a/test/vpto/cases/micro-op/materialization-predicate/pdintlv_b16-nontrivial/launch.cpp +++ /dev/null @@ -1,48 +0,0 @@ -// Copyright (c) 2026 Huawei Technologies Co., Ltd. -// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). -// Please refer to the License for details. You may not use this file except in compliance with the License. 
-// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -// See LICENSE in the root of the software repository for the full text of the License. - -// ----------------------------------------------------------------------------- -// case: micro-op/materialization-predicate/pdintlv_b16-nontrivial -// family: materialization-predicate -// target_ops: pto.pdintlv_b16 -// scenarios: predicate-transform, lane-order, nontrivial-pattern -// --------------------------------------------------------------------------- -// PTOAS compatibility layer -#ifndef __VEC_SCOPE__ -#define __VEC_SCOPE__ -#endif - -#if defined(__CCE_AICORE__) && defined(__NPU_ARCH__) && (__NPU_ARCH__ == 2201) -typedef struct { unsigned char v; } hifloat8_t; -typedef struct { unsigned char v; } float8_e4m3_t; -typedef struct { unsigned char v; } float8_e5m2_t; -typedef struct { unsigned char v; } float8_e8m0_t; -typedef struct { unsigned char v; } float4_e1m2x2_t; -typedef struct { unsigned char v; } float4_e2m1x2_t; -#endif -#include -#if defined(__CCE_AICORE__) && defined(PTOAS_ENABLE_CCE_PRINT) -#include -#endif -#if !defined(__CCE_AICORE__) && !defined(TMRGSORT_HPP) -struct MrgSortExecutedNumList { - uint16_t mrgSortList0; - uint16_t mrgSortList1; - uint16_t mrgSortList2; - uint16_t mrgSortList3; -}; -#endif -#ifndef __CPU_SIM -#include "acl/acl.h" -#endif - -extern "C" __global__ [aicore] void pdintlv_b16_nontrivial_kernel_2d(__gm__ uint32_t *v1); - -void LaunchPdintlvB16Nontrivial(uint32_t *v1, void *stream) { - pdintlv_b16_nontrivial_kernel_2d<<<1, nullptr, stream>>>((__gm__ uint32_t *)v1); -} diff --git a/test/vpto/cases/micro-op/materialization-predicate/pdintlv_b16-nontrivial/main.cpp b/test/vpto/cases/micro-op/materialization-predicate/pdintlv_b16-nontrivial/main.cpp deleted file mode 100644 index 02d8a3875..000000000 --- 
a/test/vpto/cases/micro-op/materialization-predicate/pdintlv_b16-nontrivial/main.cpp +++ /dev/null @@ -1,104 +0,0 @@ -// Copyright (c) 2026 Huawei Technologies Co., Ltd. -// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). -// Please refer to the License for details. You may not use this file except in compliance with the License. -// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -// See LICENSE in the root of the software repository for the full text of the License. - -// ----------------------------------------------------------------------------- -// case: micro-op/materialization-predicate/pdintlv_b16-nontrivial -// family: materialization-predicate -// target_ops: pto.pdintlv_b16 -// scenarios: predicate-transform, lane-order, nontrivial-pattern -/** -Copyright (c) 2025 Huawei Technologies Co., Ltd. -This program is free software, you can redistribute it and/or modify it under the terms and conditions of -CANN Open Software License Agreement Version 2.0 (the "License"). -Please refer to the License for details. You may not use this file except in compliance with the License. -THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -See LICENSE in the root of the software repository for the full text of the License. 
-*/ - -#include "test_common.h" -#include "acl/acl.h" -#include -#include -#include - -using namespace PtoTestCommon; - -#ifndef TMRGSORT_HPP -struct MrgSortExecutedNumList { - uint16_t mrgSortList0; - uint16_t mrgSortList1; - uint16_t mrgSortList2; - uint16_t mrgSortList3; -}; -#endif - -#define ACL_CHECK(expr) do { const aclError _ret = (expr); if (_ret != ACL_SUCCESS) { std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", #expr, (int)_ret, __FILE__, __LINE__); const char *_recent = aclGetRecentErrMsg(); if (_recent != nullptr && _recent[0] != '\0') { std::fprintf(stderr, "[ERROR] RecentErrMsg: %s\n", _recent); } rc = 1; goto cleanup; } } while (0) - -void LaunchPdintlvB16Nontrivial(uint32_t *v1, void *stream); - -int main() { - size_t elemCount_v1 = 32; - size_t fileSize_v1 = elemCount_v1 * sizeof(uint32_t); - uint32_t *v1Host = nullptr; - uint32_t *v1Device = nullptr; - - int rc = 0; - bool aclInited = false; - bool deviceSet = false; - int deviceId = 0; - aclrtStream stream = nullptr; - - ACL_CHECK(aclInit(nullptr)); - aclInited = true; - if (const char *envDevice = std::getenv("ACL_DEVICE_ID")) { - deviceId = std::atoi(envDevice); - } - ACL_CHECK(aclrtSetDevice(deviceId)); - deviceSet = true; - ACL_CHECK(aclrtCreateStream(&stream)); - - ACL_CHECK(aclrtMallocHost((void **)(&v1Host), fileSize_v1)); - ACL_CHECK(aclrtMalloc((void **)&v1Device, fileSize_v1, ACL_MEM_MALLOC_HUGE_FIRST)); - - ReadFile("./v1.bin", fileSize_v1, v1Host, fileSize_v1); - ACL_CHECK(aclrtMemcpy(v1Device, fileSize_v1, v1Host, fileSize_v1, ACL_MEMCPY_HOST_TO_DEVICE)); - LaunchPdintlvB16Nontrivial(v1Device, stream); - - ACL_CHECK(aclrtSynchronizeStream(stream)); - ACL_CHECK(aclrtMemcpy(v1Host, fileSize_v1, v1Device, fileSize_v1, ACL_MEMCPY_DEVICE_TO_HOST)); - WriteFile("./v1.bin", v1Host, fileSize_v1); - -cleanup: - aclrtFree(v1Device); - aclrtFreeHost(v1Host); - if (stream != nullptr) { - const aclError _ret = aclrtDestroyStream(stream); - if (_ret != ACL_SUCCESS) { - std::fprintf(stderr, 
"[ERROR] %s failed: %d (%s:%d)\n", - "aclrtDestroyStream(stream)", (int)_ret, __FILE__, __LINE__); - } - stream = nullptr; - } - if (deviceSet) { - const aclError _ret = aclrtResetDevice(deviceId); - if (_ret != ACL_SUCCESS) { - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", - "aclrtResetDevice(deviceId)", (int)_ret, __FILE__, __LINE__); - } - } - if (aclInited) { - const aclError _ret = aclFinalize(); - if (_ret != ACL_SUCCESS) { - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", - "aclFinalize()", (int)_ret, __FILE__, __LINE__); - } - } - - return rc; -} diff --git a/test/vpto/cases/micro-op/materialization-predicate/pdintlv_b16/kernel.pto b/test/vpto/cases/micro-op/materialization-predicate/pdintlv_b16/kernel.pto index af38280cf..f497ed7fb 100644 --- a/test/vpto/cases/micro-op/materialization-predicate/pdintlv_b16/kernel.pto +++ b/test/vpto/cases/micro-op/materialization-predicate/pdintlv_b16/kernel.pto @@ -1,38 +1,749 @@ -// ----------------------------------------------------------------------------- -// case: micro-op/materialization-predicate/pdintlv_b16 -// family: materialization-predicate -// target_ops: pto.pdintlv_b16 -// scenarios: predicate-transform, lane-order -// ----------------------------------------------------------------------------- module attributes {pto.target_arch = "a5", pto.kernel_kind = #pto.kernel_kind} { - func.func @pdintlv_b16_kernel_2d(%arg0: !pto.ptr) attributes {pto.kernel} { - %c0 = arith.constant 0 : index - %c1 = arith.constant 1 : index - %c32 = arith.constant 32 : index - %c16 = arith.constant 16 : index - %c0_i64 = arith.constant 0 : i64 - %c1_i64 = arith.constant 1 : i64 - %c64_i64 = arith.constant 64 : i64 - - %ub_out = pto.castptr %c0_i64 : i64 -> !pto.ptr - %gm_out = pto.castptr %arg0 : !pto.ptr -> !pto.ptr + func.func @pdintlv_b16_deep_merged_kernel(%arg0: !pto.ptr, %arg1: !pto.ptr) attributes {pto.kernel} { + %__deep_merge_guard = arith.constant false + // inactive merged from 
pdintlv_b16_nontrivial_kernel_2d + scf.if %__deep_merge_guard { + + %c0_m0 = arith.constant 0 : index + %c1_m0 = arith.constant 1 : index + %c32_m0 = arith.constant 32 : index + %c16_m0 = arith.constant 16 : index + %c0_i64_m0 = arith.constant 0 : i64 + %c1_i64_m0 = arith.constant 1 : i64 + %c64_i64_m0 = arith.constant 64 : i64 + + %ub_out_m0 = pto.castptr %c0_i64_m0 : i64 -> !pto.ptr + %gm_out_m0 = pto.castptr %arg0 : !pto.ptr -> !pto.ptr + + pto.vecscope { + scf.for %iter_m0 = %c0_m0 to %c1_m0 step %c1_m0 { + %lhs_m0 = pto.pset_b16 "PAT_VL8" : !pto.mask + %rhs_m0 = pto.pset_b16 "PAT_M4" : !pto.mask + %low_m0, %high_m0 = pto.pdintlv_b16 %lhs_m0, %rhs_m0 : !pto.mask, !pto.mask -> !pto.mask, !pto.mask + pto.psts %low_m0, %ub_out_m0[%c0_m0], "NORM" : !pto.mask, !pto.ptr, index + pto.psts %high_m0, %ub_out_m0[%c32_m0], "NORM" : !pto.mask, !pto.ptr, index + } + } + + pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] + pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] + pto.mte_ub_gm %ub_out_m0, %gm_out_m0, %c64_i64_m0 + nburst(%c1_i64_m0, %c64_i64_m0, %c64_i64_m0) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64 + pto.barrier #pto.pipe + + } + // active merged from pdintlv_b16_kernel_2d + + %c0_m1 = arith.constant 0 : index + %c1_m1 = arith.constant 1 : index + %c32_m1 = arith.constant 32 : index + %c16_m1 = arith.constant 16 : index + %c0_i64_m1 = arith.constant 0 : i64 + %c1_i64_m1 = arith.constant 1 : i64 + %c64_i64_m1 = arith.constant 64 : i64 + + %ub_out_m1 = pto.castptr %c0_i64_m1 : i64 -> !pto.ptr + %gm_out_m1 = pto.castptr %arg1 : !pto.ptr -> !pto.ptr pto.vecscope { - scf.for %iter = %c0 to %c1 step %c1 { - %lhs = pto.pset_b16 "PAT_ALL" : !pto.mask - %rhs = pto.pset_b16 "PAT_ALLF" : !pto.mask - %low, %high = pto.pdintlv_b16 %lhs, %rhs : !pto.mask, !pto.mask -> !pto.mask, !pto.mask - pto.psts %low, %ub_out[%c0], "NORM" : !pto.mask, !pto.ptr, index - pto.psts %high, %ub_out[%c32], "NORM" : !pto.mask, !pto.ptr, index + scf.for %iter_m1 = %c0_m1 to %c1_m1 step %c1_m1 { + 
%lhs_m1 = pto.pset_b16 "PAT_ALL" : !pto.mask + %rhs_m1 = pto.pset_b16 "PAT_ALLF" : !pto.mask + %low_m1, %high_m1 = pto.pdintlv_b16 %lhs_m1, %rhs_m1 : !pto.mask, !pto.mask -> !pto.mask, !pto.mask + pto.psts %low_m1, %ub_out_m1[%c0_m1], "NORM" : !pto.mask, !pto.ptr, index + pto.psts %high_m1, %ub_out_m1[%c32_m1], "NORM" : !pto.mask, !pto.ptr, index } } pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.mte_ub_gm %ub_out, %gm_out, %c64_i64 - nburst(%c1_i64, %c64_i64, %c64_i64) + pto.mte_ub_gm %ub_out_m1, %gm_out_m1, %c64_i64_m1 + nburst(%c1_i64_m1, %c64_i64_m1, %c64_i64_m1) : !pto.ptr, !pto.ptr, i64, i64, i64, i64 pto.barrier #pto.pipe + %__case_merge_guard = arith.constant false + + // inactive cross-case merge from micro-op/materialization-predicate/pdintlv_b32 + scf.if %__case_merge_guard { + + %__deep_merge_guard_cmg8_1 = arith.constant false + // inactive merged from pdintlv_b32_nontrivial_kernel_2d + scf.if %__deep_merge_guard_cmg8_1 { + + %c0_m0_cmg8_1 = arith.constant 0 : index + %c1_m0_cmg8_1 = arith.constant 1 : index + %c32_m0_cmg8_1 = arith.constant 32 : index + %c16_m0_cmg8_1 = arith.constant 16 : index + %c0_i64_m0_cmg8_1 = arith.constant 0 : i64 + %c1_i64_m0_cmg8_1 = arith.constant 1 : i64 + %c64_i64_m0_cmg8_1 = arith.constant 64 : i64 + + %ub_out_m0_cmg8_1 = pto.castptr %c0_i64_m0_cmg8_1 : i64 -> !pto.ptr + %gm_out_m0_cmg8_1 = pto.castptr %arg0 : !pto.ptr -> !pto.ptr + + pto.vecscope { + scf.for %iter_m0_cmg8_1 = %c0_m0_cmg8_1 to %c1_m0_cmg8_1 step %c1_m0_cmg8_1 { + %lhs_m0_cmg8_1 = pto.pset_b32 "PAT_VL8" : !pto.mask + %rhs_m0_cmg8_1 = pto.pset_b32 "PAT_M4" : !pto.mask + %low_m0_cmg8_1, %high_m0_cmg8_1 = pto.pdintlv_b32 %lhs_m0_cmg8_1, %rhs_m0_cmg8_1 : !pto.mask, !pto.mask -> !pto.mask, !pto.mask + pto.psts %low_m0_cmg8_1, %ub_out_m0_cmg8_1[%c0_m0_cmg8_1], "NORM" : !pto.mask, !pto.ptr, index + pto.psts %high_m0_cmg8_1, %ub_out_m0_cmg8_1[%c32_m0_cmg8_1], "NORM" : !pto.mask, !pto.ptr, index + } + } + 
+ pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] + pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] + pto.mte_ub_gm %ub_out_m0_cmg8_1, %gm_out_m0_cmg8_1, %c64_i64_m0_cmg8_1 + nburst(%c1_i64_m0_cmg8_1, %c64_i64_m0_cmg8_1, %c64_i64_m0_cmg8_1) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64 + pto.barrier #pto.pipe + + } + // active merged from pdintlv_b32_kernel_2d + + %c0_m1_cmg8_1 = arith.constant 0 : index + %c1_m1_cmg8_1 = arith.constant 1 : index + %c32_m1_cmg8_1 = arith.constant 32 : index + %c16_m1_cmg8_1 = arith.constant 16 : index + %c0_i64_m1_cmg8_1 = arith.constant 0 : i64 + %c1_i64_m1_cmg8_1 = arith.constant 1 : i64 + %c64_i64_m1_cmg8_1 = arith.constant 64 : i64 + + %ub_out_m1_cmg8_1 = pto.castptr %c0_i64_m1_cmg8_1 : i64 -> !pto.ptr + %gm_out_m1_cmg8_1 = pto.castptr %arg1 : !pto.ptr -> !pto.ptr + + pto.vecscope { + scf.for %iter_m1_cmg8_1 = %c0_m1_cmg8_1 to %c1_m1_cmg8_1 step %c1_m1_cmg8_1 { + %lhs_m1_cmg8_1 = pto.pset_b32 "PAT_ALL" : !pto.mask + %rhs_m1_cmg8_1 = pto.pset_b32 "PAT_ALLF" : !pto.mask + %low_m1_cmg8_1, %high_m1_cmg8_1 = pto.pdintlv_b32 %lhs_m1_cmg8_1, %rhs_m1_cmg8_1 : !pto.mask, !pto.mask -> !pto.mask, !pto.mask + pto.psts %low_m1_cmg8_1, %ub_out_m1_cmg8_1[%c0_m1_cmg8_1], "NORM" : !pto.mask, !pto.ptr, index + pto.psts %high_m1_cmg8_1, %ub_out_m1_cmg8_1[%c32_m1_cmg8_1], "NORM" : !pto.mask, !pto.ptr, index + } + } + + pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] + pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] + pto.mte_ub_gm %ub_out_m1_cmg8_1, %gm_out_m1_cmg8_1, %c64_i64_m1_cmg8_1 + nburst(%c1_i64_m1_cmg8_1, %c64_i64_m1_cmg8_1, %c64_i64_m1_cmg8_1) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64 + pto.barrier #pto.pipe + } + + // inactive cross-case merge from micro-op/materialization-predicate/pdintlv_b8 + scf.if %__case_merge_guard { + + %__deep_merge_guard_cmg8_2 = arith.constant false + // inactive merged from pdintlv_b8_nontrivial_kernel_2d + scf.if %__deep_merge_guard_cmg8_2 { + + %c0_m0_cmg8_2 = arith.constant 0 : index + %c1_m0_cmg8_2 = 
arith.constant 1 : index + %c32_m0_cmg8_2 = arith.constant 32 : index + %c16_m0_cmg8_2 = arith.constant 16 : index + %c0_i64_m0_cmg8_2 = arith.constant 0 : i64 + %c1_i64_m0_cmg8_2 = arith.constant 1 : i64 + %c64_i64_m0_cmg8_2 = arith.constant 64 : i64 + + %ub_out_m0_cmg8_2 = pto.castptr %c0_i64_m0_cmg8_2 : i64 -> !pto.ptr + %gm_out_m0_cmg8_2 = pto.castptr %arg0 : !pto.ptr -> !pto.ptr + + pto.vecscope { + scf.for %iter_m0_cmg8_2 = %c0_m0_cmg8_2 to %c1_m0_cmg8_2 step %c1_m0_cmg8_2 { + %lhs_m0_cmg8_2 = pto.pset_b8 "PAT_VL8" : !pto.mask + %rhs_m0_cmg8_2 = pto.pset_b8 "PAT_M4" : !pto.mask + %low_m0_cmg8_2, %high_m0_cmg8_2 = pto.pdintlv_b8 %lhs_m0_cmg8_2, %rhs_m0_cmg8_2 : !pto.mask, !pto.mask -> !pto.mask, !pto.mask + pto.psts %low_m0_cmg8_2, %ub_out_m0_cmg8_2[%c0_m0_cmg8_2], "NORM" : !pto.mask, !pto.ptr, index + pto.psts %high_m0_cmg8_2, %ub_out_m0_cmg8_2[%c32_m0_cmg8_2], "NORM" : !pto.mask, !pto.ptr, index + } + } + + pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] + pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] + pto.mte_ub_gm %ub_out_m0_cmg8_2, %gm_out_m0_cmg8_2, %c64_i64_m0_cmg8_2 + nburst(%c1_i64_m0_cmg8_2, %c64_i64_m0_cmg8_2, %c64_i64_m0_cmg8_2) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64 + pto.barrier #pto.pipe + + } + // active merged from pdintlv_b8_kernel_2d + + %c0_m1_cmg8_2 = arith.constant 0 : index + %c1_m1_cmg8_2 = arith.constant 1 : index + %c32_m1_cmg8_2 = arith.constant 32 : index + %c16_m1_cmg8_2 = arith.constant 16 : index + %c0_i64_m1_cmg8_2 = arith.constant 0 : i64 + %c1_i64_m1_cmg8_2 = arith.constant 1 : i64 + %c64_i64_m1_cmg8_2 = arith.constant 64 : i64 + + %ub_out_m1_cmg8_2 = pto.castptr %c0_i64_m1_cmg8_2 : i64 -> !pto.ptr + %gm_out_m1_cmg8_2 = pto.castptr %arg1 : !pto.ptr -> !pto.ptr + + pto.vecscope { + scf.for %iter_m1_cmg8_2 = %c0_m1_cmg8_2 to %c1_m1_cmg8_2 step %c1_m1_cmg8_2 { + %lhs_m1_cmg8_2 = pto.pset_b8 "PAT_ALL" : !pto.mask + %rhs_m1_cmg8_2 = pto.pset_b8 "PAT_ALLF" : !pto.mask + %low_m1_cmg8_2, %high_m1_cmg8_2 = pto.pdintlv_b8 
%lhs_m1_cmg8_2, %rhs_m1_cmg8_2 : !pto.mask, !pto.mask -> !pto.mask, !pto.mask + pto.psts %low_m1_cmg8_2, %ub_out_m1_cmg8_2[%c0_m1_cmg8_2], "NORM" : !pto.mask, !pto.ptr, index + pto.psts %high_m1_cmg8_2, %ub_out_m1_cmg8_2[%c32_m1_cmg8_2], "NORM" : !pto.mask, !pto.ptr, index + } + } + + pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] + pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] + pto.mte_ub_gm %ub_out_m1_cmg8_2, %gm_out_m1_cmg8_2, %c64_i64_m1_cmg8_2 + nburst(%c1_i64_m1_cmg8_2, %c64_i64_m1_cmg8_2, %c64_i64_m1_cmg8_2) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64 + pto.barrier #pto.pipe + } + + // inactive cross-case merge from micro-op/materialization-predicate/pge-tail-mask + scf.if %__case_merge_guard { + + %__deep_merge_guard_cmg8_3 = arith.constant false + // inactive merged from pge_tail_mask_boundary_kernel_2d + scf.if %__deep_merge_guard_cmg8_3 { + + %c0_m0_cmg8_3 = arith.constant 0 : index + %c1_m0_cmg8_3 = arith.constant 1 : index + %c32_m0_cmg8_3 = arith.constant 32 : index + %c64_m0_cmg8_3 = arith.constant 64 : index + %c0_i64_m0_cmg8_3 = arith.constant 0 : i64 + %c1_i64_m0_cmg8_3 = arith.constant 1 : i64 + %c96_i64_m0_cmg8_3 = arith.constant 96 : i64 + + %ub_out_m0_cmg8_3 = pto.castptr %c0_i64_m0_cmg8_3 : i64 -> !pto.ptr + %gm_out_m0_cmg8_3 = pto.castptr %arg0 : !pto.ptr -> !pto.ptr + + pto.vecscope { + scf.for %iter_m0_cmg8_3 = %c0_m0_cmg8_3 to %c1_m0_cmg8_3 step %c1_m0_cmg8_3 { + %m0_m0_cmg8_3 = pto.pge_b8 "PAT_VL1" : !pto.mask + %m1_m0_cmg8_3 = pto.pge_b16 "PAT_VL1" : !pto.mask + %m2_m0_cmg8_3 = pto.pge_b32 "PAT_VL1" : !pto.mask + pto.psts %m0_m0_cmg8_3, %ub_out_m0_cmg8_3[%c0_m0_cmg8_3], "NORM" : !pto.mask, !pto.ptr, index + pto.psts %m1_m0_cmg8_3, %ub_out_m0_cmg8_3[%c32_m0_cmg8_3], "NORM" : !pto.mask, !pto.ptr, index + pto.psts %m2_m0_cmg8_3, %ub_out_m0_cmg8_3[%c64_m0_cmg8_3], "NORM" : !pto.mask, !pto.ptr, index + } + } + + pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] + pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] + pto.mte_ub_gm 
%ub_out_m0_cmg8_3, %gm_out_m0_cmg8_3, %c96_i64_m0_cmg8_3 + nburst(%c1_i64_m0_cmg8_3, %c96_i64_m0_cmg8_3, %c96_i64_m0_cmg8_3) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64 + pto.barrier #pto.pipe + + } + // active merged from pge_tail_mask_kernel_2d + + %c0_m1_cmg8_3 = arith.constant 0 : index + %c1_m1_cmg8_3 = arith.constant 1 : index + %c32_m1_cmg8_3 = arith.constant 32 : index + %c64_m1_cmg8_3 = arith.constant 64 : index + %c0_i64_m1_cmg8_3 = arith.constant 0 : i64 + %c1_i64_m1_cmg8_3 = arith.constant 1 : i64 + %c96_i64_m1_cmg8_3 = arith.constant 96 : i64 + + %ub_out_m1_cmg8_3 = pto.castptr %c0_i64_m1_cmg8_3 : i64 -> !pto.ptr + %gm_out_m1_cmg8_3 = pto.castptr %arg1 : !pto.ptr -> !pto.ptr + + pto.vecscope { + scf.for %iter_m1_cmg8_3 = %c0_m1_cmg8_3 to %c1_m1_cmg8_3 step %c1_m1_cmg8_3 { + %m0_m1_cmg8_3 = pto.pge_b8 "PAT_VL8" : !pto.mask + %m1_m1_cmg8_3 = pto.pge_b16 "PAT_VL8" : !pto.mask + %m2_m1_cmg8_3 = pto.pge_b32 "PAT_VL8" : !pto.mask + pto.psts %m0_m1_cmg8_3, %ub_out_m1_cmg8_3[%c0_m1_cmg8_3], "NORM" : !pto.mask, !pto.ptr, index + pto.psts %m1_m1_cmg8_3, %ub_out_m1_cmg8_3[%c32_m1_cmg8_3], "NORM" : !pto.mask, !pto.ptr, index + pto.psts %m2_m1_cmg8_3, %ub_out_m1_cmg8_3[%c64_m1_cmg8_3], "NORM" : !pto.mask, !pto.ptr, index + } + } + + pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] + pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] + pto.mte_ub_gm %ub_out_m1_cmg8_3, %gm_out_m1_cmg8_3, %c96_i64_m1_cmg8_3 + nburst(%c1_i64_m1_cmg8_3, %c96_i64_m1_cmg8_3, %c96_i64_m1_cmg8_3) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64 + pto.barrier #pto.pipe + } + + // inactive cross-case merge from micro-op/materialization-predicate/pintlv_b16 + scf.if %__case_merge_guard { + + %__deep_merge_guard_cmg8_4 = arith.constant false + // inactive merged from pintlv_b16_nontrivial_kernel_2d + scf.if %__deep_merge_guard_cmg8_4 { + + %c0_m0_cmg8_4 = arith.constant 0 : index + %c1_m0_cmg8_4 = arith.constant 1 : index + %c32_m0_cmg8_4 = arith.constant 32 : index + %c16_m0_cmg8_4 = arith.constant 16 
: index + %c0_i64_m0_cmg8_4 = arith.constant 0 : i64 + %c1_i64_m0_cmg8_4 = arith.constant 1 : i64 + %c64_i64_m0_cmg8_4 = arith.constant 64 : i64 + + %ub_out_m0_cmg8_4 = pto.castptr %c0_i64_m0_cmg8_4 : i64 -> !pto.ptr + %gm_out_m0_cmg8_4 = pto.castptr %arg0 : !pto.ptr -> !pto.ptr + + pto.vecscope { + scf.for %iter_m0_cmg8_4 = %c0_m0_cmg8_4 to %c1_m0_cmg8_4 step %c1_m0_cmg8_4 { + %lhs_m0_cmg8_4 = pto.pset_b16 "PAT_VL8" : !pto.mask + %rhs_m0_cmg8_4 = pto.pset_b16 "PAT_M4" : !pto.mask + %low_m0_cmg8_4, %high_m0_cmg8_4 = pto.pintlv_b16 %lhs_m0_cmg8_4, %rhs_m0_cmg8_4 : !pto.mask, !pto.mask -> !pto.mask, !pto.mask + pto.psts %low_m0_cmg8_4, %ub_out_m0_cmg8_4[%c0_m0_cmg8_4], "NORM" : !pto.mask, !pto.ptr, index + pto.psts %high_m0_cmg8_4, %ub_out_m0_cmg8_4[%c32_m0_cmg8_4], "NORM" : !pto.mask, !pto.ptr, index + } + } + + pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] + pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] + pto.mte_ub_gm %ub_out_m0_cmg8_4, %gm_out_m0_cmg8_4, %c64_i64_m0_cmg8_4 + nburst(%c1_i64_m0_cmg8_4, %c64_i64_m0_cmg8_4, %c64_i64_m0_cmg8_4) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64 + pto.barrier #pto.pipe + + } + // active merged from pintlv_b16_kernel_2d + + %c0_m1_cmg8_4 = arith.constant 0 : index + %c1_m1_cmg8_4 = arith.constant 1 : index + %c32_m1_cmg8_4 = arith.constant 32 : index + %c16_m1_cmg8_4 = arith.constant 16 : index + %c0_i64_m1_cmg8_4 = arith.constant 0 : i64 + %c1_i64_m1_cmg8_4 = arith.constant 1 : i64 + %c64_i64_m1_cmg8_4 = arith.constant 64 : i64 + + %ub_out_m1_cmg8_4 = pto.castptr %c0_i64_m1_cmg8_4 : i64 -> !pto.ptr + %gm_out_m1_cmg8_4 = pto.castptr %arg1 : !pto.ptr -> !pto.ptr + + pto.vecscope { + scf.for %iter_m1_cmg8_4 = %c0_m1_cmg8_4 to %c1_m1_cmg8_4 step %c1_m1_cmg8_4 { + %lhs_m1_cmg8_4 = pto.pset_b16 "PAT_ALL" : !pto.mask + %rhs_m1_cmg8_4 = pto.pset_b16 "PAT_ALLF" : !pto.mask + %low_m1_cmg8_4, %high_m1_cmg8_4 = pto.pintlv_b16 %lhs_m1_cmg8_4, %rhs_m1_cmg8_4 : !pto.mask, !pto.mask -> !pto.mask, !pto.mask + pto.psts %low_m1_cmg8_4, 
%ub_out_m1_cmg8_4[%c0_m1_cmg8_4], "NORM" : !pto.mask, !pto.ptr, index + pto.psts %high_m1_cmg8_4, %ub_out_m1_cmg8_4[%c32_m1_cmg8_4], "NORM" : !pto.mask, !pto.ptr, index + } + } + + pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] + pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] + pto.mte_ub_gm %ub_out_m1_cmg8_4, %gm_out_m1_cmg8_4, %c64_i64_m1_cmg8_4 + nburst(%c1_i64_m1_cmg8_4, %c64_i64_m1_cmg8_4, %c64_i64_m1_cmg8_4) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64 + pto.barrier #pto.pipe + } + + // inactive cross-case merge from micro-op/materialization-predicate/pintlv_b32 + scf.if %__case_merge_guard { + + %__deep_merge_guard_cmg8_5 = arith.constant false + // inactive merged from pintlv_b32_nontrivial_kernel_2d + scf.if %__deep_merge_guard_cmg8_5 { + + %c0_m0_cmg8_5 = arith.constant 0 : index + %c1_m0_cmg8_5 = arith.constant 1 : index + %c32_m0_cmg8_5 = arith.constant 32 : index + %c16_m0_cmg8_5 = arith.constant 16 : index + %c0_i64_m0_cmg8_5 = arith.constant 0 : i64 + %c1_i64_m0_cmg8_5 = arith.constant 1 : i64 + %c64_i64_m0_cmg8_5 = arith.constant 64 : i64 + + %ub_out_m0_cmg8_5 = pto.castptr %c0_i64_m0_cmg8_5 : i64 -> !pto.ptr + %gm_out_m0_cmg8_5 = pto.castptr %arg0 : !pto.ptr -> !pto.ptr + + pto.vecscope { + scf.for %iter_m0_cmg8_5 = %c0_m0_cmg8_5 to %c1_m0_cmg8_5 step %c1_m0_cmg8_5 { + %lhs_m0_cmg8_5 = pto.pset_b32 "PAT_VL8" : !pto.mask + %rhs_m0_cmg8_5 = pto.pset_b32 "PAT_M4" : !pto.mask + %low_m0_cmg8_5, %high_m0_cmg8_5 = pto.pintlv_b32 %lhs_m0_cmg8_5, %rhs_m0_cmg8_5 : !pto.mask, !pto.mask -> !pto.mask, !pto.mask + pto.psts %low_m0_cmg8_5, %ub_out_m0_cmg8_5[%c0_m0_cmg8_5], "NORM" : !pto.mask, !pto.ptr, index + pto.psts %high_m0_cmg8_5, %ub_out_m0_cmg8_5[%c32_m0_cmg8_5], "NORM" : !pto.mask, !pto.ptr, index + } + } + + pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] + pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] + pto.mte_ub_gm %ub_out_m0_cmg8_5, %gm_out_m0_cmg8_5, %c64_i64_m0_cmg8_5 + nburst(%c1_i64_m0_cmg8_5, %c64_i64_m0_cmg8_5, %c64_i64_m0_cmg8_5) + : 
!pto.ptr, !pto.ptr, i64, i64, i64, i64 + pto.barrier #pto.pipe + + } + // active merged from pintlv_b32_kernel_2d + + %c0_m1_cmg8_5 = arith.constant 0 : index + %c1_m1_cmg8_5 = arith.constant 1 : index + %c32_m1_cmg8_5 = arith.constant 32 : index + %c16_m1_cmg8_5 = arith.constant 16 : index + %c0_i64_m1_cmg8_5 = arith.constant 0 : i64 + %c1_i64_m1_cmg8_5 = arith.constant 1 : i64 + %c64_i64_m1_cmg8_5 = arith.constant 64 : i64 + + %ub_out_m1_cmg8_5 = pto.castptr %c0_i64_m1_cmg8_5 : i64 -> !pto.ptr + %gm_out_m1_cmg8_5 = pto.castptr %arg1 : !pto.ptr -> !pto.ptr + + pto.vecscope { + scf.for %iter_m1_cmg8_5 = %c0_m1_cmg8_5 to %c1_m1_cmg8_5 step %c1_m1_cmg8_5 { + %lhs_m1_cmg8_5 = pto.pset_b32 "PAT_ALL" : !pto.mask + %rhs_m1_cmg8_5 = pto.pset_b32 "PAT_ALLF" : !pto.mask + %low_m1_cmg8_5, %high_m1_cmg8_5 = pto.pintlv_b32 %lhs_m1_cmg8_5, %rhs_m1_cmg8_5 : !pto.mask, !pto.mask -> !pto.mask, !pto.mask + pto.psts %low_m1_cmg8_5, %ub_out_m1_cmg8_5[%c0_m1_cmg8_5], "NORM" : !pto.mask, !pto.ptr, index + pto.psts %high_m1_cmg8_5, %ub_out_m1_cmg8_5[%c32_m1_cmg8_5], "NORM" : !pto.mask, !pto.ptr, index + } + } + + pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] + pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] + pto.mte_ub_gm %ub_out_m1_cmg8_5, %gm_out_m1_cmg8_5, %c64_i64_m1_cmg8_5 + nburst(%c1_i64_m1_cmg8_5, %c64_i64_m1_cmg8_5, %c64_i64_m1_cmg8_5) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64 + pto.barrier #pto.pipe + } + + // inactive cross-case merge from micro-op/materialization-predicate/pintlv_b8 + scf.if %__case_merge_guard { + + %__deep_merge_guard_cmg8_6 = arith.constant false + // inactive merged from pintlv_b8_nontrivial_kernel_2d + scf.if %__deep_merge_guard_cmg8_6 { + + %c0_m0_cmg8_6 = arith.constant 0 : index + %c1_m0_cmg8_6 = arith.constant 1 : index + %c32_m0_cmg8_6 = arith.constant 32 : index + %c16_m0_cmg8_6 = arith.constant 16 : index + %c0_i64_m0_cmg8_6 = arith.constant 0 : i64 + %c1_i64_m0_cmg8_6 = arith.constant 1 : i64 + %c64_i64_m0_cmg8_6 = arith.constant 64 : i64 
+ + %ub_out_m0_cmg8_6 = pto.castptr %c0_i64_m0_cmg8_6 : i64 -> !pto.ptr + %gm_out_m0_cmg8_6 = pto.castptr %arg0 : !pto.ptr -> !pto.ptr + + pto.vecscope { + scf.for %iter_m0_cmg8_6 = %c0_m0_cmg8_6 to %c1_m0_cmg8_6 step %c1_m0_cmg8_6 { + %lhs_m0_cmg8_6 = pto.pset_b8 "PAT_VL8" : !pto.mask + %rhs_m0_cmg8_6 = pto.pset_b8 "PAT_M4" : !pto.mask + %low_m0_cmg8_6, %high_m0_cmg8_6 = pto.pintlv_b8 %lhs_m0_cmg8_6, %rhs_m0_cmg8_6 : !pto.mask, !pto.mask -> !pto.mask, !pto.mask + pto.psts %low_m0_cmg8_6, %ub_out_m0_cmg8_6[%c0_m0_cmg8_6], "NORM" : !pto.mask, !pto.ptr, index + pto.psts %high_m0_cmg8_6, %ub_out_m0_cmg8_6[%c32_m0_cmg8_6], "NORM" : !pto.mask, !pto.ptr, index + } + } + + pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] + pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] + pto.mte_ub_gm %ub_out_m0_cmg8_6, %gm_out_m0_cmg8_6, %c64_i64_m0_cmg8_6 + nburst(%c1_i64_m0_cmg8_6, %c64_i64_m0_cmg8_6, %c64_i64_m0_cmg8_6) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64 + pto.barrier #pto.pipe + + } + // active merged from pintlv_b8_kernel_2d + + %c0_m1_cmg8_6 = arith.constant 0 : index + %c1_m1_cmg8_6 = arith.constant 1 : index + %c32_m1_cmg8_6 = arith.constant 32 : index + %c16_m1_cmg8_6 = arith.constant 16 : index + %c0_i64_m1_cmg8_6 = arith.constant 0 : i64 + %c1_i64_m1_cmg8_6 = arith.constant 1 : i64 + %c64_i64_m1_cmg8_6 = arith.constant 64 : i64 + + %ub_out_m1_cmg8_6 = pto.castptr %c0_i64_m1_cmg8_6 : i64 -> !pto.ptr + %gm_out_m1_cmg8_6 = pto.castptr %arg1 : !pto.ptr -> !pto.ptr + + pto.vecscope { + scf.for %iter_m1_cmg8_6 = %c0_m1_cmg8_6 to %c1_m1_cmg8_6 step %c1_m1_cmg8_6 { + %lhs_m1_cmg8_6 = pto.pset_b8 "PAT_ALL" : !pto.mask + %rhs_m1_cmg8_6 = pto.pset_b8 "PAT_ALLF" : !pto.mask + %low_m1_cmg8_6, %high_m1_cmg8_6 = pto.pintlv_b8 %lhs_m1_cmg8_6, %rhs_m1_cmg8_6 : !pto.mask, !pto.mask -> !pto.mask, !pto.mask + pto.psts %low_m1_cmg8_6, %ub_out_m1_cmg8_6[%c0_m1_cmg8_6], "NORM" : !pto.mask, !pto.ptr, index + pto.psts %high_m1_cmg8_6, %ub_out_m1_cmg8_6[%c32_m1_cmg8_6], "NORM" : !pto.mask, 
!pto.ptr, index + } + } + + pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] + pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] + pto.mte_ub_gm %ub_out_m1_cmg8_6, %gm_out_m1_cmg8_6, %c64_i64_m1_cmg8_6 + nburst(%c1_i64_m1_cmg8_6, %c64_i64_m1_cmg8_6, %c64_i64_m1_cmg8_6) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64 + pto.barrier #pto.pipe + } + + // inactive cross-case merge from micro-op/materialization-predicate/plt-tail-mask + scf.if %__case_merge_guard { + + %__deep_merge_guard_cmg8_7 = arith.constant false + // inactive merged from plt_tail_mask_boundary_kernel_2d + scf.if %__deep_merge_guard_cmg8_7 { + + %c0_m0_cmg8_7 = arith.constant 0 : index + %c1_m0_cmg8_7 = arith.constant 1 : index + %c32_m0_cmg8_7 = arith.constant 32 : index + %c64_m0_cmg8_7 = arith.constant 64 : index + %c0_i64_m0_cmg8_7 = arith.constant 0 : i64 + %c1_i64_m0_cmg8_7 = arith.constant 1 : i64 + %c96_i64_m0_cmg8_7 = arith.constant 96 : i64 + %c1_i32_m0_cmg8_7 = arith.constant 1 : i32 + + %ub_out_m0_cmg8_7 = pto.castptr %c0_i64_m0_cmg8_7 : i64 -> !pto.ptr + %gm_out_m0_cmg8_7 = pto.castptr %arg0 : !pto.ptr -> !pto.ptr + + pto.vecscope { + scf.for %iter_m0_cmg8_7 = %c0_m0_cmg8_7 to %c1_m0_cmg8_7 step %c1_m0_cmg8_7 { + %m0_m0_cmg8_7, %s0_m0_cmg8_7 = pto.plt_b8 %c1_i32_m0_cmg8_7 : i32 -> !pto.mask, i32 + %m1_m0_cmg8_7, %s1_m0_cmg8_7 = pto.plt_b16 %c1_i32_m0_cmg8_7 : i32 -> !pto.mask, i32 + %m2_m0_cmg8_7, %s2_m0_cmg8_7 = pto.plt_b32 %c1_i32_m0_cmg8_7 : i32 -> !pto.mask, i32 + pto.psts %m0_m0_cmg8_7, %ub_out_m0_cmg8_7[%c0_m0_cmg8_7], "NORM" : !pto.mask, !pto.ptr, index + pto.psts %m1_m0_cmg8_7, %ub_out_m0_cmg8_7[%c32_m0_cmg8_7], "NORM" : !pto.mask, !pto.ptr, index + pto.psts %m2_m0_cmg8_7, %ub_out_m0_cmg8_7[%c64_m0_cmg8_7], "NORM" : !pto.mask, !pto.ptr, index + } + } + + pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] + pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] + pto.mte_ub_gm %ub_out_m0_cmg8_7, %gm_out_m0_cmg8_7, %c96_i64_m0_cmg8_7 + nburst(%c1_i64_m0_cmg8_7, %c96_i64_m0_cmg8_7, 
%c96_i64_m0_cmg8_7) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64 + pto.barrier #pto.pipe + + } + // active merged from plt_tail_mask_kernel_2d + + %c0_m1_cmg8_7 = arith.constant 0 : index + %c1_m1_cmg8_7 = arith.constant 1 : index + %c32_m1_cmg8_7 = arith.constant 32 : index + %c64_m1_cmg8_7 = arith.constant 64 : index + %c0_i64_m1_cmg8_7 = arith.constant 0 : i64 + %c1_i64_m1_cmg8_7 = arith.constant 1 : i64 + %c96_i64_m1_cmg8_7 = arith.constant 96 : i64 + %c13_m1_cmg8_7 = arith.constant 13 : i32 + %c7_m1_cmg8_7 = arith.constant 7 : i32 + %c3_m1_cmg8_7 = arith.constant 3 : i32 + + %ub_out_m1_cmg8_7 = pto.castptr %c0_i64_m1_cmg8_7 : i64 -> !pto.ptr + %gm_out_m1_cmg8_7 = pto.castptr %arg1 : !pto.ptr -> !pto.ptr + + pto.vecscope { + scf.for %iter_m1_cmg8_7 = %c0_m1_cmg8_7 to %c1_m1_cmg8_7 step %c1_m1_cmg8_7 { + %m0_m1_cmg8_7, %s0_m1_cmg8_7 = pto.plt_b8 %c13_m1_cmg8_7 : i32 -> !pto.mask, i32 + %m1_m1_cmg8_7, %s1_m1_cmg8_7 = pto.plt_b16 %c7_m1_cmg8_7 : i32 -> !pto.mask, i32 + %m2_m1_cmg8_7, %s2_m1_cmg8_7 = pto.plt_b32 %c3_m1_cmg8_7 : i32 -> !pto.mask, i32 + pto.psts %m0_m1_cmg8_7, %ub_out_m1_cmg8_7[%c0_m1_cmg8_7], "NORM" : !pto.mask, !pto.ptr, index + pto.psts %m1_m1_cmg8_7, %ub_out_m1_cmg8_7[%c32_m1_cmg8_7], "NORM" : !pto.mask, !pto.ptr, index + pto.psts %m2_m1_cmg8_7, %ub_out_m1_cmg8_7[%c64_m1_cmg8_7], "NORM" : !pto.mask, !pto.ptr, index + } + } + + pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] + pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] + pto.mte_ub_gm %ub_out_m1_cmg8_7, %gm_out_m1_cmg8_7, %c96_i64_m1_cmg8_7 + nburst(%c1_i64_m1_cmg8_7, %c96_i64_m1_cmg8_7, %c96_i64_m1_cmg8_7) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64 + pto.barrier #pto.pipe + } + + // inactive cross-case merge from micro-op/materialization-predicate/ppack-punpack + scf.if %__case_merge_guard { + + %__deep_merge_guard_cmg8_8 = arith.constant false + // inactive merged from ppack_punpack_nontrivial_kernel_2d + scf.if %__deep_merge_guard_cmg8_8 { + + %c0_m0_cmg8_8 = arith.constant 0 : index + 
%c1_m0_cmg8_8 = arith.constant 1 : index + %c32_m0_cmg8_8 = arith.constant 32 : index + %c0_i64_m0_cmg8_8 = arith.constant 0 : i64 + %c1_i64_m0_cmg8_8 = arith.constant 1 : i64 + %c64_i64_m0_cmg8_8 = arith.constant 64 : i64 + + %ub_out_m0_cmg8_8 = pto.castptr %c0_i64_m0_cmg8_8 : i64 -> !pto.ptr + %gm_out_m0_cmg8_8 = pto.castptr %arg0 : !pto.ptr -> !pto.ptr + + pto.vecscope { + scf.for %iter_m0_cmg8_8 = %c0_m0_cmg8_8 to %c1_m0_cmg8_8 step %c1_m0_cmg8_8 { + %src_m0_cmg8_8 = pto.pset_b32 "PAT_M4" : !pto.mask + %packed_m0_cmg8_8 = pto.ppack %src_m0_cmg8_8, "LOWER" : !pto.mask -> !pto.mask + %roundtrip_m0_cmg8_8 = pto.punpack %packed_m0_cmg8_8, "LOWER" : !pto.mask -> !pto.mask + pto.psts %packed_m0_cmg8_8, %ub_out_m0_cmg8_8[%c0_m0_cmg8_8], "NORM" : !pto.mask, !pto.ptr, index + pto.psts %roundtrip_m0_cmg8_8, %ub_out_m0_cmg8_8[%c32_m0_cmg8_8], "NORM" : !pto.mask, !pto.ptr, index + } + } + + pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] + pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] + pto.mte_ub_gm %ub_out_m0_cmg8_8, %gm_out_m0_cmg8_8, %c64_i64_m0_cmg8_8 + nburst(%c1_i64_m0_cmg8_8, %c64_i64_m0_cmg8_8, %c64_i64_m0_cmg8_8) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64 + pto.barrier #pto.pipe + + } + // active merged from ppack_punpack_kernel_2d + + %c0_m1_cmg8_8 = arith.constant 0 : index + %c1_m1_cmg8_8 = arith.constant 1 : index + %c32_m1_cmg8_8 = arith.constant 32 : index + %c0_i64_m1_cmg8_8 = arith.constant 0 : i64 + %c1_i64_m1_cmg8_8 = arith.constant 1 : i64 + %c64_i64_m1_cmg8_8 = arith.constant 64 : i64 + + %ub_out_m1_cmg8_8 = pto.castptr %c0_i64_m1_cmg8_8 : i64 -> !pto.ptr + %gm_out_m1_cmg8_8 = pto.castptr %arg1 : !pto.ptr -> !pto.ptr + + pto.vecscope { + scf.for %iter_m1_cmg8_8 = %c0_m1_cmg8_8 to %c1_m1_cmg8_8 step %c1_m1_cmg8_8 { + %src_m1_cmg8_8 = pto.pset_b32 "PAT_ALL" : !pto.mask + %packed_m1_cmg8_8 = pto.ppack %src_m1_cmg8_8, "LOWER" : !pto.mask -> !pto.mask + %roundtrip_m1_cmg8_8 = pto.punpack %packed_m1_cmg8_8, "LOWER" : !pto.mask -> !pto.mask + pto.psts 
%packed_m1_cmg8_8, %ub_out_m1_cmg8_8[%c0_m1_cmg8_8], "NORM" : !pto.mask, !pto.ptr, index + pto.psts %roundtrip_m1_cmg8_8, %ub_out_m1_cmg8_8[%c32_m1_cmg8_8], "NORM" : !pto.mask, !pto.ptr, index + } + } + + pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] + pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] + pto.mte_ub_gm %ub_out_m1_cmg8_8, %gm_out_m1_cmg8_8, %c64_i64_m1_cmg8_8 + nburst(%c1_i64_m1_cmg8_8, %c64_i64_m1_cmg8_8, %c64_i64_m1_cmg8_8) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64 + pto.barrier #pto.pipe + } + + // inactive cross-case merge from micro-op/materialization-predicate/psel + scf.if %__case_merge_guard { + + %__deep_merge_guard_cmg8_9 = arith.constant false + // inactive merged from psel_tail_kernel_2d + scf.if %__deep_merge_guard_cmg8_9 { + + %c0_m0_cmg8_9 = arith.constant 0 : index + %c1_m0_cmg8_9 = arith.constant 1 : index + %c32_m0_cmg8_9 = arith.constant 32 : index + %c64_m0_cmg8_9 = arith.constant 64 : index + %c0_i64_m0_cmg8_9 = arith.constant 0 : i64 + %c1_i64_m0_cmg8_9 = arith.constant 1 : i64 + %c32_i64_m0_cmg8_9 = arith.constant 32 : i64 + %c64_i64_m0_cmg8_9 = arith.constant 64 : i64 + %c13_m0_cmg8_9 = arith.constant 13 : i32 + + %ub_out_m0_cmg8_9 = pto.castptr %c0_i64_m0_cmg8_9 : i64 -> !pto.ptr + %gm_out_m0_cmg8_9 = pto.castptr %arg0 : !pto.ptr -> !pto.ptr + + pto.vecscope { + scf.for %iter_m0_cmg8_9 = %c0_m0_cmg8_9 to %c1_m0_cmg8_9 step %c1_m0_cmg8_9 { + %src0_m0_cmg8_9 = pto.pset_b32 "PAT_ALL" : !pto.mask + %sel_m0_cmg8_9, %next_m0_cmg8_9 = pto.plt_b32 %c13_m0_cmg8_9 : i32 -> !pto.mask, i32 + %out_m0_cmg8_9 = pto.psel %src0_m0_cmg8_9, %sel_m0_cmg8_9, %sel_m0_cmg8_9 : !pto.mask, !pto.mask, !pto.mask -> !pto.mask + pto.psts %out_m0_cmg8_9, %ub_out_m0_cmg8_9[%c0_m0_cmg8_9], "NORM" : !pto.mask, !pto.ptr, index + %out_next_m0_cmg8_9 = pto.psel %sel_m0_cmg8_9, %src0_m0_cmg8_9, %sel_m0_cmg8_9 : !pto.mask, !pto.mask, !pto.mask -> !pto.mask + pto.psts %out_next_m0_cmg8_9, %ub_out_m0_cmg8_9[%c32_m0_cmg8_9], "NORM" : !pto.mask, !pto.ptr, index + 
} + } + + pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] + pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] + pto.mte_ub_gm %ub_out_m0_cmg8_9, %gm_out_m0_cmg8_9, %c64_i64_m0_cmg8_9 + nburst(%c1_i64_m0_cmg8_9, %c64_i64_m0_cmg8_9, %c64_i64_m0_cmg8_9) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64 + pto.barrier #pto.pipe + + } + // active merged from psel_kernel_2d + + %c0_m1_cmg8_9 = arith.constant 0 : index + %c1_m1_cmg8_9 = arith.constant 1 : index + %c32_m1_cmg8_9 = arith.constant 32 : index + %c0_i64_m1_cmg8_9 = arith.constant 0 : i64 + %c1_i64_m1_cmg8_9 = arith.constant 1 : i64 + %c32_i64_m1_cmg8_9 = arith.constant 32 : i64 + %c13_m1_cmg8_9 = arith.constant 13 : i32 + + %ub_out_m1_cmg8_9 = pto.castptr %c0_i64_m1_cmg8_9 : i64 -> !pto.ptr + %gm_out_m1_cmg8_9 = pto.castptr %arg1 : !pto.ptr -> !pto.ptr + + pto.vecscope { + scf.for %iter_m1_cmg8_9 = %c0_m1_cmg8_9 to %c1_m1_cmg8_9 step %c1_m1_cmg8_9 { + %src0_m1_cmg8_9 = pto.pset_b32 "PAT_ALL" : !pto.mask + %sel_m1_cmg8_9, %next_m1_cmg8_9 = pto.plt_b32 %c13_m1_cmg8_9 : i32 -> !pto.mask, i32 + %out_m1_cmg8_9 = pto.psel %src0_m1_cmg8_9, %sel_m1_cmg8_9, %sel_m1_cmg8_9 : !pto.mask, !pto.mask, !pto.mask -> !pto.mask + pto.psts %out_m1_cmg8_9, %ub_out_m1_cmg8_9[%c0_m1_cmg8_9], "NORM" : !pto.mask, !pto.ptr, index + } + } + + pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] + pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] + pto.mte_ub_gm %ub_out_m1_cmg8_9, %gm_out_m1_cmg8_9, %c32_i64_m1_cmg8_9 + nburst(%c1_i64_m1_cmg8_9, %c32_i64_m1_cmg8_9, %c32_i64_m1_cmg8_9) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64 + pto.barrier #pto.pipe + } + + // inactive cross-case merge from micro-op/materialization-predicate/pset-pattern + scf.if %__case_merge_guard { + + %__deep_merge_guard_cmg8_10 = arith.constant false + // inactive merged from pset_pattern_fragment_kernel_2d + scf.if %__deep_merge_guard_cmg8_10 { + + %c0_m0_cmg8_10 = arith.constant 0 : index + %c1_m0_cmg8_10 = arith.constant 1 : index + %c32_m0_cmg8_10 = arith.constant 32 : 
index + %c64_m0_cmg8_10 = arith.constant 64 : index + %c0_i64_m0_cmg8_10 = arith.constant 0 : i64 + %c1_i64_m0_cmg8_10 = arith.constant 1 : i64 + %c96_i64_m0_cmg8_10 = arith.constant 96 : i64 + + %ub_out_m0_cmg8_10 = pto.castptr %c0_i64_m0_cmg8_10 : i64 -> !pto.ptr + %gm_out_m0_cmg8_10 = pto.castptr %arg0 : !pto.ptr -> !pto.ptr + + pto.vecscope { + scf.for %iter_m0_cmg8_10 = %c0_m0_cmg8_10 to %c1_m0_cmg8_10 step %c1_m0_cmg8_10 { + %m0_m0_cmg8_10 = pto.pset_b8 "PAT_M3" : !pto.mask + %m1_m0_cmg8_10 = pto.pset_b16 "PAT_H" : !pto.mask + %m2_m0_cmg8_10 = pto.pset_b32 "PAT_Q" : !pto.mask + pto.psts %m0_m0_cmg8_10, %ub_out_m0_cmg8_10[%c0_m0_cmg8_10], "NORM" : !pto.mask, !pto.ptr, index + pto.psts %m1_m0_cmg8_10, %ub_out_m0_cmg8_10[%c32_m0_cmg8_10], "NORM" : !pto.mask, !pto.ptr, index + pto.psts %m2_m0_cmg8_10, %ub_out_m0_cmg8_10[%c64_m0_cmg8_10], "NORM" : !pto.mask, !pto.ptr, index + } + } + + pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] + pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] + pto.mte_ub_gm %ub_out_m0_cmg8_10, %gm_out_m0_cmg8_10, %c96_i64_m0_cmg8_10 + nburst(%c1_i64_m0_cmg8_10, %c96_i64_m0_cmg8_10, %c96_i64_m0_cmg8_10) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64 + pto.barrier #pto.pipe + + } + // active merged from pset_pattern_kernel_2d + + %c0_m1_cmg8_10 = arith.constant 0 : index + %c1_m1_cmg8_10 = arith.constant 1 : index + %c32_m1_cmg8_10 = arith.constant 32 : index + %c64_m1_cmg8_10 = arith.constant 64 : index + %c0_i64_m1_cmg8_10 = arith.constant 0 : i64 + %c1_i64_m1_cmg8_10 = arith.constant 1 : i64 + %c96_i64_m1_cmg8_10 = arith.constant 96 : i64 + + %ub_out_m1_cmg8_10 = pto.castptr %c0_i64_m1_cmg8_10 : i64 -> !pto.ptr + %gm_out_m1_cmg8_10 = pto.castptr %arg1 : !pto.ptr -> !pto.ptr + + pto.vecscope { + scf.for %iter_m1_cmg8_10 = %c0_m1_cmg8_10 to %c1_m1_cmg8_10 step %c1_m1_cmg8_10 { + %m0_m1_cmg8_10 = pto.pset_b8 "PAT_ALL" : !pto.mask + %m1_m1_cmg8_10 = pto.pset_b16 "PAT_VL8" : !pto.mask + %m2_m1_cmg8_10 = pto.pset_b32 "PAT_VL16" : !pto.mask + 
pto.psts %m0_m1_cmg8_10, %ub_out_m1_cmg8_10[%c0_m1_cmg8_10], "NORM" : !pto.mask, !pto.ptr, index + pto.psts %m1_m1_cmg8_10, %ub_out_m1_cmg8_10[%c32_m1_cmg8_10], "NORM" : !pto.mask, !pto.ptr, index + pto.psts %m2_m1_cmg8_10, %ub_out_m1_cmg8_10[%c64_m1_cmg8_10], "NORM" : !pto.mask, !pto.ptr, index + } + } + + pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] + pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] + pto.mte_ub_gm %ub_out_m1_cmg8_10, %gm_out_m1_cmg8_10, %c96_i64_m1_cmg8_10 + nburst(%c1_i64_m1_cmg8_10, %c96_i64_m1_cmg8_10, %c96_i64_m1_cmg8_10) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64 + pto.barrier #pto.pipe + } + return } } diff --git a/test/vpto/cases/micro-op/materialization-predicate/pdintlv_b16/launch.cpp b/test/vpto/cases/micro-op/materialization-predicate/pdintlv_b16/launch.cpp index 519e90a51..ea30934e5 100644 --- a/test/vpto/cases/micro-op/materialization-predicate/pdintlv_b16/launch.cpp +++ b/test/vpto/cases/micro-op/materialization-predicate/pdintlv_b16/launch.cpp @@ -5,18 +5,9 @@ // THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, // INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. // See LICENSE in the root of the software repository for the full text of the License. 
- -// ----------------------------------------------------------------------------- -// case: micro-op/materialization-predicate/pdintlv_b16 -// family: materialization-predicate -// target_ops: pto.pdintlv_b16 -// scenarios: predicate-transform, lane-order -// --------------------------------------------------------------------------- -// PTOAS compatibility layer #ifndef __VEC_SCOPE__ #define __VEC_SCOPE__ #endif - #if defined(__CCE_AICORE__) && defined(__NPU_ARCH__) && (__NPU_ARCH__ == 2201) typedef struct { unsigned char v; } hifloat8_t; typedef struct { unsigned char v; } float8_e4m3_t; @@ -30,19 +21,18 @@ typedef struct { unsigned char v; } float4_e2m1x2_t; #include #endif #if !defined(__CCE_AICORE__) && !defined(TMRGSORT_HPP) -struct MrgSortExecutedNumList { - uint16_t mrgSortList0; - uint16_t mrgSortList1; - uint16_t mrgSortList2; - uint16_t mrgSortList3; -}; +struct MrgSortExecutedNumList { uint16_t mrgSortList0,mrgSortList1,mrgSortList2,mrgSortList3; }; #endif #ifndef __CPU_SIM #include "acl/acl.h" #endif -extern "C" __global__ [aicore] void pdintlv_b16_kernel_2d(__gm__ uint32_t *v1); +extern "C" __global__ [aicore] void pdintlv_b16_deep_merged_kernel( + __gm__ uint32_t * arg0, + __gm__ uint32_t * arg1); -void LaunchPdintlvB16(uint32_t *v1, void *stream) { - pdintlv_b16_kernel_2d<<<1, nullptr, stream>>>((__gm__ uint32_t *)v1); +void LaunchPdintlvB16DeepMerged(uint32_t * p0, void *stream) { + pdintlv_b16_deep_merged_kernel<<<1, nullptr, stream>>>( + (__gm__ uint32_t *)p0, + (__gm__ uint32_t *)p0); } diff --git a/test/vpto/cases/micro-op/materialization-predicate/pdintlv_b16/main.cpp b/test/vpto/cases/micro-op/materialization-predicate/pdintlv_b16/main.cpp index e2491af41..810bacbd6 100644 --- a/test/vpto/cases/micro-op/materialization-predicate/pdintlv_b16/main.cpp +++ b/test/vpto/cases/micro-op/materialization-predicate/pdintlv_b16/main.cpp @@ -40,8 +40,8 @@ struct MrgSortExecutedNumList { #define ACL_CHECK(expr) do { const aclError _ret = (expr); if 
(_ret != ACL_SUCCESS) { std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", #expr, (int)_ret, __FILE__, __LINE__); const char *_recent = aclGetRecentErrMsg(); if (_recent != nullptr && _recent[0] != '\0') { std::fprintf(stderr, "[ERROR] RecentErrMsg: %s\n", _recent); } rc = 1; goto cleanup; } } while (0) -void LaunchPdintlvB16(uint32_t *v1, void *stream); +void LaunchPdintlvB16DeepMerged(uint32_t * p0, void *stream); int main() { size_t elemCount_v1 = 32; size_t fileSize_v1 = elemCount_v1 * sizeof(uint32_t); @@ -68,7 +68,7 @@ int main() { ReadFile("./v1.bin", fileSize_v1, v1Host, fileSize_v1); ACL_CHECK(aclrtMemcpy(v1Device, fileSize_v1, v1Host, fileSize_v1, ACL_MEMCPY_HOST_TO_DEVICE)); - LaunchPdintlvB16(v1Device, stream); + LaunchPdintlvB16DeepMerged(v1Device, stream); ACL_CHECK(aclrtSynchronizeStream(stream)); ACL_CHECK(aclrtMemcpy(v1Host, fileSize_v1, v1Device, fileSize_v1, ACL_MEMCPY_DEVICE_TO_HOST)); diff --git a/test/vpto/cases/micro-op/materialization-predicate/pdintlv_b32-nontrivial/compare.py b/test/vpto/cases/micro-op/materialization-predicate/pdintlv_b32-nontrivial/compare.py deleted file mode 100755 index 13e93d501..000000000 --- a/test/vpto/cases/micro-op/materialization-predicate/pdintlv_b32-nontrivial/compare.py +++ /dev/null @@ -1,64 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) 2026 Huawei Technologies Co., Ltd. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. 
- -# case: micro-op/materialization-predicate/pdintlv_b32-nontrivial -# family: materialization-predicate -# target_ops: pto.pdintlv_b32 -# scenarios: predicate-transform, lane-order, representative-logical-elements -# coding=utf-8 - -import os -import sys - -import numpy as np - - -EXPECTED_WORDS = 32 -PREFIX_WORDS = 16 - - -def compare_words(golden_path, output_path): - if not os.path.exists(output_path) or not os.path.exists(golden_path): - print(f"[ERROR] Missing file: golden={golden_path} out={output_path}") - return False - golden = np.fromfile(golden_path, dtype=np.uint32) - output = np.fromfile(output_path, dtype=np.uint32) - if golden.size != EXPECTED_WORDS or output.size != EXPECTED_WORDS: - print( - f"[ERROR] Unexpected word count: golden={golden.size} " - f"out={output.size} expected={EXPECTED_WORDS}" - ) - return False - golden = golden[:PREFIX_WORDS] - output = output[:PREFIX_WORDS] - if not np.array_equal(golden, output): - diff = np.nonzero(golden != output)[0] - idx = int(diff[0]) if diff.size else 0 - print( - f"[ERROR] Mismatch (packed predicate words): idx={idx} " - f"golden={int(golden[idx])} out={int(output[idx])}" - ) - return False - return True - - -def main(): - strict = os.getenv("COMPARE_STRICT", "1") != "0" - ok = compare_words("golden_v1.bin", "v1.bin") - if not ok: - if strict: - print("[ERROR] compare failed") - sys.exit(2) - print("[WARN] compare failed (non-gating)") - return - print("[INFO] compare passed") - - -if __name__ == "__main__": - main() diff --git a/test/vpto/cases/micro-op/materialization-predicate/pdintlv_b32-nontrivial/golden.py b/test/vpto/cases/micro-op/materialization-predicate/pdintlv_b32-nontrivial/golden.py deleted file mode 100755 index 013f7751e..000000000 --- a/test/vpto/cases/micro-op/materialization-predicate/pdintlv_b32-nontrivial/golden.py +++ /dev/null @@ -1,49 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) 2026 Huawei Technologies Co., Ltd. 
-# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. - -# case: micro-op/materialization-predicate/pdintlv_b32-nontrivial -# family: materialization-predicate -# target_ops: pto.pdintlv_b32 -# scenarios: predicate-transform, lane-order, representative-logical-elements -# coding=utf-8 - -import argparse -from pathlib import Path - -import numpy as np - - -SEED = 19 -OUTPUT_WORDS = 32 -GOLDEN_PREFIX_WORDS = np.array([4369, 0, 0, 0, 16843009, 16843009, 16843009, 16843009, 4369, 0, 0, 0, 0, 0, 0, 0], dtype=np.uint32) - - -def generate(output_dir: Path, seed: int) -> None: - del seed - output_init = np.zeros((OUTPUT_WORDS,), dtype=np.uint32) - golden = np.zeros((OUTPUT_WORDS,), dtype=np.uint32) - golden[: GOLDEN_PREFIX_WORDS.size] = GOLDEN_PREFIX_WORDS - - output_dir.mkdir(parents=True, exist_ok=True) - output_init.tofile(output_dir / "v1.bin") - golden.tofile(output_dir / "golden_v1.bin") - - -def main() -> None: - parser = argparse.ArgumentParser( - description="Generate packed predicate golden for VPTO micro-op validation." 
- ) - parser.add_argument("--output-dir", type=Path, default=Path(".")) - parser.add_argument("--seed", type=int, default=SEED) - args = parser.parse_args() - generate(args.output_dir, args.seed) - - -if __name__ == "__main__": - main() diff --git a/test/vpto/cases/micro-op/materialization-predicate/pdintlv_b32-nontrivial/kernel.pto b/test/vpto/cases/micro-op/materialization-predicate/pdintlv_b32-nontrivial/kernel.pto deleted file mode 100644 index 815b3b373..000000000 --- a/test/vpto/cases/micro-op/materialization-predicate/pdintlv_b32-nontrivial/kernel.pto +++ /dev/null @@ -1,38 +0,0 @@ -// ----------------------------------------------------------------------------- -// case: micro-op/materialization-predicate/pdintlv_b32-nontrivial -// family: materialization-predicate -// target_ops: pto.pdintlv_b32 -// scenarios: predicate-transform, lane-order, nontrivial-pattern -// ----------------------------------------------------------------------------- -module attributes {pto.target_arch = "a5", pto.kernel_kind = #pto.kernel_kind} { - func.func @pdintlv_b32_nontrivial_kernel_2d(%arg0: !pto.ptr) attributes {pto.kernel} { - %c0 = arith.constant 0 : index - %c1 = arith.constant 1 : index - %c32 = arith.constant 32 : index - %c16 = arith.constant 16 : index - %c0_i64 = arith.constant 0 : i64 - %c1_i64 = arith.constant 1 : i64 - %c64_i64 = arith.constant 64 : i64 - - %ub_out = pto.castptr %c0_i64 : i64 -> !pto.ptr - %gm_out = pto.castptr %arg0 : !pto.ptr -> !pto.ptr - - pto.vecscope { - scf.for %iter = %c0 to %c1 step %c1 { - %lhs = pto.pset_b32 "PAT_VL8" : !pto.mask - %rhs = pto.pset_b32 "PAT_M4" : !pto.mask - %low, %high = pto.pdintlv_b32 %lhs, %rhs : !pto.mask, !pto.mask -> !pto.mask, !pto.mask - pto.psts %low, %ub_out[%c0], "NORM" : !pto.mask, !pto.ptr, index - pto.psts %high, %ub_out[%c32], "NORM" : !pto.mask, !pto.ptr, index - } - } - - pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.mte_ub_gm %ub_out, 
%gm_out, %c64_i64 - nburst(%c1_i64, %c64_i64, %c64_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64 - pto.barrier #pto.pipe - return - } -} diff --git a/test/vpto/cases/micro-op/materialization-predicate/pdintlv_b32-nontrivial/launch.cpp b/test/vpto/cases/micro-op/materialization-predicate/pdintlv_b32-nontrivial/launch.cpp deleted file mode 100644 index 9a6cd6a5e..000000000 --- a/test/vpto/cases/micro-op/materialization-predicate/pdintlv_b32-nontrivial/launch.cpp +++ /dev/null @@ -1,48 +0,0 @@ -// Copyright (c) 2026 Huawei Technologies Co., Ltd. -// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). -// Please refer to the License for details. You may not use this file except in compliance with the License. -// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -// See LICENSE in the root of the software repository for the full text of the License. 
- -// ----------------------------------------------------------------------------- -// case: micro-op/materialization-predicate/pdintlv_b32-nontrivial -// family: materialization-predicate -// target_ops: pto.pdintlv_b32 -// scenarios: predicate-transform, lane-order, nontrivial-pattern -// --------------------------------------------------------------------------- -// PTOAS compatibility layer -#ifndef __VEC_SCOPE__ -#define __VEC_SCOPE__ -#endif - -#if defined(__CCE_AICORE__) && defined(__NPU_ARCH__) && (__NPU_ARCH__ == 2201) -typedef struct { unsigned char v; } hifloat8_t; -typedef struct { unsigned char v; } float8_e4m3_t; -typedef struct { unsigned char v; } float8_e5m2_t; -typedef struct { unsigned char v; } float8_e8m0_t; -typedef struct { unsigned char v; } float4_e1m2x2_t; -typedef struct { unsigned char v; } float4_e2m1x2_t; -#endif -#include -#if defined(__CCE_AICORE__) && defined(PTOAS_ENABLE_CCE_PRINT) -#include -#endif -#if !defined(__CCE_AICORE__) && !defined(TMRGSORT_HPP) -struct MrgSortExecutedNumList { - uint16_t mrgSortList0; - uint16_t mrgSortList1; - uint16_t mrgSortList2; - uint16_t mrgSortList3; -}; -#endif -#ifndef __CPU_SIM -#include "acl/acl.h" -#endif - -extern "C" __global__ [aicore] void pdintlv_b32_nontrivial_kernel_2d(__gm__ uint32_t *v1); - -void LaunchPdintlvB32Nontrivial(uint32_t *v1, void *stream) { - pdintlv_b32_nontrivial_kernel_2d<<<1, nullptr, stream>>>((__gm__ uint32_t *)v1); -} diff --git a/test/vpto/cases/micro-op/materialization-predicate/pdintlv_b32-nontrivial/main.cpp b/test/vpto/cases/micro-op/materialization-predicate/pdintlv_b32-nontrivial/main.cpp deleted file mode 100644 index 97d56c906..000000000 --- a/test/vpto/cases/micro-op/materialization-predicate/pdintlv_b32-nontrivial/main.cpp +++ /dev/null @@ -1,104 +0,0 @@ -// Copyright (c) 2026 Huawei Technologies Co., Ltd. 
-// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). -// Please refer to the License for details. You may not use this file except in compliance with the License. -// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -// See LICENSE in the root of the software repository for the full text of the License. - -// ----------------------------------------------------------------------------- -// case: micro-op/materialization-predicate/pdintlv_b32-nontrivial -// family: materialization-predicate -// target_ops: pto.pdintlv_b32 -// scenarios: predicate-transform, lane-order, nontrivial-pattern -/** -Copyright (c) 2025 Huawei Technologies Co., Ltd. -This program is free software, you can redistribute it and/or modify it under the terms and conditions of -CANN Open Software License Agreement Version 2.0 (the "License"). -Please refer to the License for details. You may not use this file except in compliance with the License. -THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -See LICENSE in the root of the software repository for the full text of the License. 
-*/ - -#include "test_common.h" -#include "acl/acl.h" -#include -#include -#include - -using namespace PtoTestCommon; - -#ifndef TMRGSORT_HPP -struct MrgSortExecutedNumList { - uint16_t mrgSortList0; - uint16_t mrgSortList1; - uint16_t mrgSortList2; - uint16_t mrgSortList3; -}; -#endif - -#define ACL_CHECK(expr) do { const aclError _ret = (expr); if (_ret != ACL_SUCCESS) { std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", #expr, (int)_ret, __FILE__, __LINE__); const char *_recent = aclGetRecentErrMsg(); if (_recent != nullptr && _recent[0] != '\0') { std::fprintf(stderr, "[ERROR] RecentErrMsg: %s\n", _recent); } rc = 1; goto cleanup; } } while (0) - -void LaunchPdintlvB32Nontrivial(uint32_t *v1, void *stream); - -int main() { - size_t elemCount_v1 = 32; - size_t fileSize_v1 = elemCount_v1 * sizeof(uint32_t); - uint32_t *v1Host = nullptr; - uint32_t *v1Device = nullptr; - - int rc = 0; - bool aclInited = false; - bool deviceSet = false; - int deviceId = 0; - aclrtStream stream = nullptr; - - ACL_CHECK(aclInit(nullptr)); - aclInited = true; - if (const char *envDevice = std::getenv("ACL_DEVICE_ID")) { - deviceId = std::atoi(envDevice); - } - ACL_CHECK(aclrtSetDevice(deviceId)); - deviceSet = true; - ACL_CHECK(aclrtCreateStream(&stream)); - - ACL_CHECK(aclrtMallocHost((void **)(&v1Host), fileSize_v1)); - ACL_CHECK(aclrtMalloc((void **)&v1Device, fileSize_v1, ACL_MEM_MALLOC_HUGE_FIRST)); - - ReadFile("./v1.bin", fileSize_v1, v1Host, fileSize_v1); - ACL_CHECK(aclrtMemcpy(v1Device, fileSize_v1, v1Host, fileSize_v1, ACL_MEMCPY_HOST_TO_DEVICE)); - LaunchPdintlvB32Nontrivial(v1Device, stream); - - ACL_CHECK(aclrtSynchronizeStream(stream)); - ACL_CHECK(aclrtMemcpy(v1Host, fileSize_v1, v1Device, fileSize_v1, ACL_MEMCPY_DEVICE_TO_HOST)); - WriteFile("./v1.bin", v1Host, fileSize_v1); - -cleanup: - aclrtFree(v1Device); - aclrtFreeHost(v1Host); - if (stream != nullptr) { - const aclError _ret = aclrtDestroyStream(stream); - if (_ret != ACL_SUCCESS) { - std::fprintf(stderr, 
"[ERROR] %s failed: %d (%s:%d)\n", - "aclrtDestroyStream(stream)", (int)_ret, __FILE__, __LINE__); - } - stream = nullptr; - } - if (deviceSet) { - const aclError _ret = aclrtResetDevice(deviceId); - if (_ret != ACL_SUCCESS) { - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", - "aclrtResetDevice(deviceId)", (int)_ret, __FILE__, __LINE__); - } - } - if (aclInited) { - const aclError _ret = aclFinalize(); - if (_ret != ACL_SUCCESS) { - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", - "aclFinalize()", (int)_ret, __FILE__, __LINE__); - } - } - - return rc; -} diff --git a/test/vpto/cases/micro-op/materialization-predicate/pdintlv_b32/compare.py b/test/vpto/cases/micro-op/materialization-predicate/pdintlv_b32/compare.py deleted file mode 100755 index fab797df6..000000000 --- a/test/vpto/cases/micro-op/materialization-predicate/pdintlv_b32/compare.py +++ /dev/null @@ -1,64 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) 2026 Huawei Technologies Co., Ltd. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. 
- -# case: micro-op/materialization-predicate/pdintlv_b32 -# family: materialization-predicate -# target_ops: pto.pdintlv_b32 -# scenarios: predicate-transform, lane-order -# coding=utf-8 - -import os -import sys - -import numpy as np - - -EXPECTED_WORDS = 32 -PREFIX_WORDS = 16 - - -def compare_words(golden_path, output_path): - if not os.path.exists(output_path) or not os.path.exists(golden_path): - print(f"[ERROR] Missing file: golden={golden_path} out={output_path}") - return False - golden = np.fromfile(golden_path, dtype=np.uint32) - output = np.fromfile(output_path, dtype=np.uint32) - if golden.size != EXPECTED_WORDS or output.size != EXPECTED_WORDS: - print( - f"[ERROR] Unexpected word count: golden={golden.size} " - f"out={output.size} expected={EXPECTED_WORDS}" - ) - return False - golden = golden[:PREFIX_WORDS] - output = output[:PREFIX_WORDS] - if not np.array_equal(golden, output): - diff = np.nonzero(golden != output)[0] - idx = int(diff[0]) if diff.size else 0 - print( - f"[ERROR] Mismatch (packed predicate words): idx={idx} " - f"golden={int(golden[idx])} out={int(output[idx])}" - ) - return False - return True - - -def main(): - strict = os.getenv("COMPARE_STRICT", "1") != "0" - ok = compare_words("golden_v1.bin", "v1.bin") - if not ok: - if strict: - print("[ERROR] compare failed") - sys.exit(2) - print("[WARN] compare failed (non-gating)") - return - print("[INFO] compare passed") - - -if __name__ == "__main__": - main() diff --git a/test/vpto/cases/micro-op/materialization-predicate/pdintlv_b32/golden.py b/test/vpto/cases/micro-op/materialization-predicate/pdintlv_b32/golden.py deleted file mode 100755 index cd1487eef..000000000 --- a/test/vpto/cases/micro-op/materialization-predicate/pdintlv_b32/golden.py +++ /dev/null @@ -1,49 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) 2026 Huawei Technologies Co., Ltd. 
-# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. - -# case: micro-op/materialization-predicate/pdintlv_b32 -# family: materialization-predicate -# target_ops: pto.pdintlv_b32 -# scenarios: predicate-transform, lane-order -# coding=utf-8 - -import argparse -from pathlib import Path - -import numpy as np - - -SEED = 19 -OUTPUT_WORDS = 32 -GOLDEN_PREFIX_WORDS = np.array([286331153, 286331153, 286331153, 286331153, 0, 0, 0, 0, 286331153, 286331153, 286331153, 286331153, 0, 0, 0, 0], dtype=np.uint32) - - -def generate(output_dir: Path, seed: int) -> None: - del seed - output_init = np.zeros((OUTPUT_WORDS,), dtype=np.uint32) - golden = np.zeros((OUTPUT_WORDS,), dtype=np.uint32) - golden[: GOLDEN_PREFIX_WORDS.size] = GOLDEN_PREFIX_WORDS - - output_dir.mkdir(parents=True, exist_ok=True) - output_init.tofile(output_dir / "v1.bin") - golden.tofile(output_dir / "golden_v1.bin") - - -def main() -> None: - parser = argparse.ArgumentParser( - description="Generate packed predicate golden for VPTO micro-op validation." 
- ) - parser.add_argument("--output-dir", type=Path, default=Path(".")) - parser.add_argument("--seed", type=int, default=SEED) - args = parser.parse_args() - generate(args.output_dir, args.seed) - - -if __name__ == "__main__": - main() diff --git a/test/vpto/cases/micro-op/materialization-predicate/pdintlv_b32/kernel.pto b/test/vpto/cases/micro-op/materialization-predicate/pdintlv_b32/kernel.pto deleted file mode 100644 index eb6ee504e..000000000 --- a/test/vpto/cases/micro-op/materialization-predicate/pdintlv_b32/kernel.pto +++ /dev/null @@ -1,38 +0,0 @@ -// ----------------------------------------------------------------------------- -// case: micro-op/materialization-predicate/pdintlv_b32 -// family: materialization-predicate -// target_ops: pto.pdintlv_b32 -// scenarios: predicate-transform, lane-order -// ----------------------------------------------------------------------------- -module attributes {pto.target_arch = "a5", pto.kernel_kind = #pto.kernel_kind} { - func.func @pdintlv_b32_kernel_2d(%arg0: !pto.ptr) attributes {pto.kernel} { - %c0 = arith.constant 0 : index - %c1 = arith.constant 1 : index - %c32 = arith.constant 32 : index - %c16 = arith.constant 16 : index - %c0_i64 = arith.constant 0 : i64 - %c1_i64 = arith.constant 1 : i64 - %c64_i64 = arith.constant 64 : i64 - - %ub_out = pto.castptr %c0_i64 : i64 -> !pto.ptr - %gm_out = pto.castptr %arg0 : !pto.ptr -> !pto.ptr - - pto.vecscope { - scf.for %iter = %c0 to %c1 step %c1 { - %lhs = pto.pset_b32 "PAT_ALL" : !pto.mask - %rhs = pto.pset_b32 "PAT_ALLF" : !pto.mask - %low, %high = pto.pdintlv_b32 %lhs, %rhs : !pto.mask, !pto.mask -> !pto.mask, !pto.mask - pto.psts %low, %ub_out[%c0], "NORM" : !pto.mask, !pto.ptr, index - pto.psts %high, %ub_out[%c32], "NORM" : !pto.mask, !pto.ptr, index - } - } - - pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.mte_ub_gm %ub_out, %gm_out, %c64_i64 - nburst(%c1_i64, %c64_i64, %c64_i64) - : !pto.ptr, 
!pto.ptr, i64, i64, i64, i64 - pto.barrier #pto.pipe - return - } -} diff --git a/test/vpto/cases/micro-op/materialization-predicate/pdintlv_b32/launch.cpp b/test/vpto/cases/micro-op/materialization-predicate/pdintlv_b32/launch.cpp deleted file mode 100644 index 316cfc086..000000000 --- a/test/vpto/cases/micro-op/materialization-predicate/pdintlv_b32/launch.cpp +++ /dev/null @@ -1,48 +0,0 @@ -// Copyright (c) 2026 Huawei Technologies Co., Ltd. -// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). -// Please refer to the License for details. You may not use this file except in compliance with the License. -// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -// See LICENSE in the root of the software repository for the full text of the License. 
- -// ----------------------------------------------------------------------------- -// case: micro-op/materialization-predicate/pdintlv_b32 -// family: materialization-predicate -// target_ops: pto.pdintlv_b32 -// scenarios: predicate-transform, lane-order -// --------------------------------------------------------------------------- -// PTOAS compatibility layer -#ifndef __VEC_SCOPE__ -#define __VEC_SCOPE__ -#endif - -#if defined(__CCE_AICORE__) && defined(__NPU_ARCH__) && (__NPU_ARCH__ == 2201) -typedef struct { unsigned char v; } hifloat8_t; -typedef struct { unsigned char v; } float8_e4m3_t; -typedef struct { unsigned char v; } float8_e5m2_t; -typedef struct { unsigned char v; } float8_e8m0_t; -typedef struct { unsigned char v; } float4_e1m2x2_t; -typedef struct { unsigned char v; } float4_e2m1x2_t; -#endif -#include -#if defined(__CCE_AICORE__) && defined(PTOAS_ENABLE_CCE_PRINT) -#include -#endif -#if !defined(__CCE_AICORE__) && !defined(TMRGSORT_HPP) -struct MrgSortExecutedNumList { - uint16_t mrgSortList0; - uint16_t mrgSortList1; - uint16_t mrgSortList2; - uint16_t mrgSortList3; -}; -#endif -#ifndef __CPU_SIM -#include "acl/acl.h" -#endif - -extern "C" __global__ [aicore] void pdintlv_b32_kernel_2d(__gm__ uint32_t *v1); - -void LaunchPdintlvB32(uint32_t *v1, void *stream) { - pdintlv_b32_kernel_2d<<<1, nullptr, stream>>>((__gm__ uint32_t *)v1); -} diff --git a/test/vpto/cases/micro-op/materialization-predicate/pdintlv_b32/main.cpp b/test/vpto/cases/micro-op/materialization-predicate/pdintlv_b32/main.cpp deleted file mode 100644 index 7af8a309d..000000000 --- a/test/vpto/cases/micro-op/materialization-predicate/pdintlv_b32/main.cpp +++ /dev/null @@ -1,104 +0,0 @@ -// Copyright (c) 2026 Huawei Technologies Co., Ltd. -// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). -// Please refer to the License for details. 
You may not use this file except in compliance with the License. -// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -// See LICENSE in the root of the software repository for the full text of the License. - -// ----------------------------------------------------------------------------- -// case: micro-op/materialization-predicate/pdintlv_b32 -// family: materialization-predicate -// target_ops: pto.pdintlv_b32 -// scenarios: predicate-transform, lane-order -/** -Copyright (c) 2025 Huawei Technologies Co., Ltd. -This program is free software, you can redistribute it and/or modify it under the terms and conditions of -CANN Open Software License Agreement Version 2.0 (the "License"). -Please refer to the License for details. You may not use this file except in compliance with the License. -THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -See LICENSE in the root of the software repository for the full text of the License. 
-*/ - -#include "test_common.h" -#include "acl/acl.h" -#include -#include -#include - -using namespace PtoTestCommon; - -#ifndef TMRGSORT_HPP -struct MrgSortExecutedNumList { - uint16_t mrgSortList0; - uint16_t mrgSortList1; - uint16_t mrgSortList2; - uint16_t mrgSortList3; -}; -#endif - -#define ACL_CHECK(expr) do { const aclError _ret = (expr); if (_ret != ACL_SUCCESS) { std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", #expr, (int)_ret, __FILE__, __LINE__); const char *_recent = aclGetRecentErrMsg(); if (_recent != nullptr && _recent[0] != '\0') { std::fprintf(stderr, "[ERROR] RecentErrMsg: %s\n", _recent); } rc = 1; goto cleanup; } } while (0) - -void LaunchPdintlvB32(uint32_t *v1, void *stream); - -int main() { - size_t elemCount_v1 = 32; - size_t fileSize_v1 = elemCount_v1 * sizeof(uint32_t); - uint32_t *v1Host = nullptr; - uint32_t *v1Device = nullptr; - - int rc = 0; - bool aclInited = false; - bool deviceSet = false; - int deviceId = 0; - aclrtStream stream = nullptr; - - ACL_CHECK(aclInit(nullptr)); - aclInited = true; - if (const char *envDevice = std::getenv("ACL_DEVICE_ID")) { - deviceId = std::atoi(envDevice); - } - ACL_CHECK(aclrtSetDevice(deviceId)); - deviceSet = true; - ACL_CHECK(aclrtCreateStream(&stream)); - - ACL_CHECK(aclrtMallocHost((void **)(&v1Host), fileSize_v1)); - ACL_CHECK(aclrtMalloc((void **)&v1Device, fileSize_v1, ACL_MEM_MALLOC_HUGE_FIRST)); - - ReadFile("./v1.bin", fileSize_v1, v1Host, fileSize_v1); - ACL_CHECK(aclrtMemcpy(v1Device, fileSize_v1, v1Host, fileSize_v1, ACL_MEMCPY_HOST_TO_DEVICE)); - LaunchPdintlvB32(v1Device, stream); - - ACL_CHECK(aclrtSynchronizeStream(stream)); - ACL_CHECK(aclrtMemcpy(v1Host, fileSize_v1, v1Device, fileSize_v1, ACL_MEMCPY_DEVICE_TO_HOST)); - WriteFile("./v1.bin", v1Host, fileSize_v1); - -cleanup: - aclrtFree(v1Device); - aclrtFreeHost(v1Host); - if (stream != nullptr) { - const aclError _ret = aclrtDestroyStream(stream); - if (_ret != ACL_SUCCESS) { - std::fprintf(stderr, "[ERROR] %s failed: 
%d (%s:%d)\n", - "aclrtDestroyStream(stream)", (int)_ret, __FILE__, __LINE__); - } - stream = nullptr; - } - if (deviceSet) { - const aclError _ret = aclrtResetDevice(deviceId); - if (_ret != ACL_SUCCESS) { - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", - "aclrtResetDevice(deviceId)", (int)_ret, __FILE__, __LINE__); - } - } - if (aclInited) { - const aclError _ret = aclFinalize(); - if (_ret != ACL_SUCCESS) { - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", - "aclFinalize()", (int)_ret, __FILE__, __LINE__); - } - } - - return rc; -} diff --git a/test/vpto/cases/micro-op/materialization-predicate/pdintlv_b8-nontrivial/compare.py b/test/vpto/cases/micro-op/materialization-predicate/pdintlv_b8-nontrivial/compare.py deleted file mode 100755 index 12db124bd..000000000 --- a/test/vpto/cases/micro-op/materialization-predicate/pdintlv_b8-nontrivial/compare.py +++ /dev/null @@ -1,64 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) 2026 Huawei Technologies Co., Ltd. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. 
- -# case: micro-op/materialization-predicate/pdintlv_b8-nontrivial -# family: materialization-predicate -# target_ops: pto.pdintlv_b8 -# scenarios: predicate-transform, lane-order, representative-logical-elements -# coding=utf-8 - -import os -import sys - -import numpy as np - - -EXPECTED_WORDS = 32 -PREFIX_WORDS = 16 - - -def compare_words(golden_path, output_path): - if not os.path.exists(output_path) or not os.path.exists(golden_path): - print(f"[ERROR] Missing file: golden={golden_path} out={output_path}") - return False - golden = np.fromfile(golden_path, dtype=np.uint32) - output = np.fromfile(output_path, dtype=np.uint32) - if golden.size != EXPECTED_WORDS or output.size != EXPECTED_WORDS: - print( - f"[ERROR] Unexpected word count: golden={golden.size} " - f"out={output.size} expected={EXPECTED_WORDS}" - ) - return False - golden = golden[:PREFIX_WORDS] - output = output[:PREFIX_WORDS] - if not np.array_equal(golden, output): - diff = np.nonzero(golden != output)[0] - idx = int(diff[0]) if diff.size else 0 - print( - f"[ERROR] Mismatch (packed predicate words): idx={idx} " - f"golden={int(golden[idx])} out={int(output[idx])}" - ) - return False - return True - - -def main(): - strict = os.getenv("COMPARE_STRICT", "1") != "0" - ok = compare_words("golden_v1.bin", "v1.bin") - if not ok: - if strict: - print("[ERROR] compare failed") - sys.exit(2) - print("[WARN] compare failed (non-gating)") - return - print("[INFO] compare passed") - - -if __name__ == "__main__": - main() diff --git a/test/vpto/cases/micro-op/materialization-predicate/pdintlv_b8-nontrivial/golden.py b/test/vpto/cases/micro-op/materialization-predicate/pdintlv_b8-nontrivial/golden.py deleted file mode 100755 index 1c58d3b87..000000000 --- a/test/vpto/cases/micro-op/materialization-predicate/pdintlv_b8-nontrivial/golden.py +++ /dev/null @@ -1,49 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) 2026 Huawei Technologies Co., Ltd. 
-# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. - -# case: micro-op/materialization-predicate/pdintlv_b8-nontrivial -# family: materialization-predicate -# target_ops: pto.pdintlv_b8 -# scenarios: predicate-transform, lane-order, representative-logical-elements -# coding=utf-8 - -import argparse -from pathlib import Path - -import numpy as np - - -SEED = 19 -OUTPUT_WORDS = 32 -GOLDEN_PREFIX_WORDS = np.array([15, 0, 0, 0, 1431655765, 1431655765, 1431655765, 1431655765, 15, 0, 0, 0, 0, 0, 0, 0], dtype=np.uint32) - - -def generate(output_dir: Path, seed: int) -> None: - del seed - output_init = np.zeros((OUTPUT_WORDS,), dtype=np.uint32) - golden = np.zeros((OUTPUT_WORDS,), dtype=np.uint32) - golden[: GOLDEN_PREFIX_WORDS.size] = GOLDEN_PREFIX_WORDS - - output_dir.mkdir(parents=True, exist_ok=True) - output_init.tofile(output_dir / "v1.bin") - golden.tofile(output_dir / "golden_v1.bin") - - -def main() -> None: - parser = argparse.ArgumentParser( - description="Generate packed predicate golden for VPTO micro-op validation." 
- ) - parser.add_argument("--output-dir", type=Path, default=Path(".")) - parser.add_argument("--seed", type=int, default=SEED) - args = parser.parse_args() - generate(args.output_dir, args.seed) - - -if __name__ == "__main__": - main() diff --git a/test/vpto/cases/micro-op/materialization-predicate/pdintlv_b8-nontrivial/kernel.pto b/test/vpto/cases/micro-op/materialization-predicate/pdintlv_b8-nontrivial/kernel.pto deleted file mode 100644 index a65d6bec5..000000000 --- a/test/vpto/cases/micro-op/materialization-predicate/pdintlv_b8-nontrivial/kernel.pto +++ /dev/null @@ -1,38 +0,0 @@ -// ----------------------------------------------------------------------------- -// case: micro-op/materialization-predicate/pdintlv_b8-nontrivial -// family: materialization-predicate -// target_ops: pto.pdintlv_b8 -// scenarios: predicate-transform, lane-order, nontrivial-pattern -// ----------------------------------------------------------------------------- -module attributes {pto.target_arch = "a5", pto.kernel_kind = #pto.kernel_kind} { - func.func @pdintlv_b8_nontrivial_kernel_2d(%arg0: !pto.ptr) attributes {pto.kernel} { - %c0 = arith.constant 0 : index - %c1 = arith.constant 1 : index - %c32 = arith.constant 32 : index - %c16 = arith.constant 16 : index - %c0_i64 = arith.constant 0 : i64 - %c1_i64 = arith.constant 1 : i64 - %c64_i64 = arith.constant 64 : i64 - - %ub_out = pto.castptr %c0_i64 : i64 -> !pto.ptr - %gm_out = pto.castptr %arg0 : !pto.ptr -> !pto.ptr - - pto.vecscope { - scf.for %iter = %c0 to %c1 step %c1 { - %lhs = pto.pset_b8 "PAT_VL8" : !pto.mask - %rhs = pto.pset_b8 "PAT_M4" : !pto.mask - %low, %high = pto.pdintlv_b8 %lhs, %rhs : !pto.mask, !pto.mask -> !pto.mask, !pto.mask - pto.psts %low, %ub_out[%c0], "NORM" : !pto.mask, !pto.ptr, index - pto.psts %high, %ub_out[%c32], "NORM" : !pto.mask, !pto.ptr, index - } - } - - pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.mte_ub_gm %ub_out, %gm_out, 
%c64_i64 - nburst(%c1_i64, %c64_i64, %c64_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64 - pto.barrier #pto.pipe - return - } -} diff --git a/test/vpto/cases/micro-op/materialization-predicate/pdintlv_b8-nontrivial/launch.cpp b/test/vpto/cases/micro-op/materialization-predicate/pdintlv_b8-nontrivial/launch.cpp deleted file mode 100644 index e6e2949f5..000000000 --- a/test/vpto/cases/micro-op/materialization-predicate/pdintlv_b8-nontrivial/launch.cpp +++ /dev/null @@ -1,48 +0,0 @@ -// Copyright (c) 2026 Huawei Technologies Co., Ltd. -// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). -// Please refer to the License for details. You may not use this file except in compliance with the License. -// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -// See LICENSE in the root of the software repository for the full text of the License. 
- -// ----------------------------------------------------------------------------- -// case: micro-op/materialization-predicate/pdintlv_b8-nontrivial -// family: materialization-predicate -// target_ops: pto.pdintlv_b8 -// scenarios: predicate-transform, lane-order, nontrivial-pattern -// --------------------------------------------------------------------------- -// PTOAS compatibility layer -#ifndef __VEC_SCOPE__ -#define __VEC_SCOPE__ -#endif - -#if defined(__CCE_AICORE__) && defined(__NPU_ARCH__) && (__NPU_ARCH__ == 2201) -typedef struct { unsigned char v; } hifloat8_t; -typedef struct { unsigned char v; } float8_e4m3_t; -typedef struct { unsigned char v; } float8_e5m2_t; -typedef struct { unsigned char v; } float8_e8m0_t; -typedef struct { unsigned char v; } float4_e1m2x2_t; -typedef struct { unsigned char v; } float4_e2m1x2_t; -#endif -#include -#if defined(__CCE_AICORE__) && defined(PTOAS_ENABLE_CCE_PRINT) -#include -#endif -#if !defined(__CCE_AICORE__) && !defined(TMRGSORT_HPP) -struct MrgSortExecutedNumList { - uint16_t mrgSortList0; - uint16_t mrgSortList1; - uint16_t mrgSortList2; - uint16_t mrgSortList3; -}; -#endif -#ifndef __CPU_SIM -#include "acl/acl.h" -#endif - -extern "C" __global__ [aicore] void pdintlv_b8_nontrivial_kernel_2d(__gm__ uint32_t *v1); - -void LaunchPdintlvB8Nontrivial(uint32_t *v1, void *stream) { - pdintlv_b8_nontrivial_kernel_2d<<<1, nullptr, stream>>>((__gm__ uint32_t *)v1); -} diff --git a/test/vpto/cases/micro-op/materialization-predicate/pdintlv_b8-nontrivial/main.cpp b/test/vpto/cases/micro-op/materialization-predicate/pdintlv_b8-nontrivial/main.cpp deleted file mode 100644 index 71e67e085..000000000 --- a/test/vpto/cases/micro-op/materialization-predicate/pdintlv_b8-nontrivial/main.cpp +++ /dev/null @@ -1,104 +0,0 @@ -// Copyright (c) 2026 Huawei Technologies Co., Ltd. 
-// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). -// Please refer to the License for details. You may not use this file except in compliance with the License. -// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -// See LICENSE in the root of the software repository for the full text of the License. - -// ----------------------------------------------------------------------------- -// case: micro-op/materialization-predicate/pdintlv_b8-nontrivial -// family: materialization-predicate -// target_ops: pto.pdintlv_b8 -// scenarios: predicate-transform, lane-order, nontrivial-pattern -/** -Copyright (c) 2025 Huawei Technologies Co., Ltd. -This program is free software, you can redistribute it and/or modify it under the terms and conditions of -CANN Open Software License Agreement Version 2.0 (the "License"). -Please refer to the License for details. You may not use this file except in compliance with the License. -THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -See LICENSE in the root of the software repository for the full text of the License. 
-*/ - -#include "test_common.h" -#include "acl/acl.h" -#include -#include -#include - -using namespace PtoTestCommon; - -#ifndef TMRGSORT_HPP -struct MrgSortExecutedNumList { - uint16_t mrgSortList0; - uint16_t mrgSortList1; - uint16_t mrgSortList2; - uint16_t mrgSortList3; -}; -#endif - -#define ACL_CHECK(expr) do { const aclError _ret = (expr); if (_ret != ACL_SUCCESS) { std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", #expr, (int)_ret, __FILE__, __LINE__); const char *_recent = aclGetRecentErrMsg(); if (_recent != nullptr && _recent[0] != '\0') { std::fprintf(stderr, "[ERROR] RecentErrMsg: %s\n", _recent); } rc = 1; goto cleanup; } } while (0) - -void LaunchPdintlvB8Nontrivial(uint32_t *v1, void *stream); - -int main() { - size_t elemCount_v1 = 32; - size_t fileSize_v1 = elemCount_v1 * sizeof(uint32_t); - uint32_t *v1Host = nullptr; - uint32_t *v1Device = nullptr; - - int rc = 0; - bool aclInited = false; - bool deviceSet = false; - int deviceId = 0; - aclrtStream stream = nullptr; - - ACL_CHECK(aclInit(nullptr)); - aclInited = true; - if (const char *envDevice = std::getenv("ACL_DEVICE_ID")) { - deviceId = std::atoi(envDevice); - } - ACL_CHECK(aclrtSetDevice(deviceId)); - deviceSet = true; - ACL_CHECK(aclrtCreateStream(&stream)); - - ACL_CHECK(aclrtMallocHost((void **)(&v1Host), fileSize_v1)); - ACL_CHECK(aclrtMalloc((void **)&v1Device, fileSize_v1, ACL_MEM_MALLOC_HUGE_FIRST)); - - ReadFile("./v1.bin", fileSize_v1, v1Host, fileSize_v1); - ACL_CHECK(aclrtMemcpy(v1Device, fileSize_v1, v1Host, fileSize_v1, ACL_MEMCPY_HOST_TO_DEVICE)); - LaunchPdintlvB8Nontrivial(v1Device, stream); - - ACL_CHECK(aclrtSynchronizeStream(stream)); - ACL_CHECK(aclrtMemcpy(v1Host, fileSize_v1, v1Device, fileSize_v1, ACL_MEMCPY_DEVICE_TO_HOST)); - WriteFile("./v1.bin", v1Host, fileSize_v1); - -cleanup: - aclrtFree(v1Device); - aclrtFreeHost(v1Host); - if (stream != nullptr) { - const aclError _ret = aclrtDestroyStream(stream); - if (_ret != ACL_SUCCESS) { - std::fprintf(stderr, 
"[ERROR] %s failed: %d (%s:%d)\n", - "aclrtDestroyStream(stream)", (int)_ret, __FILE__, __LINE__); - } - stream = nullptr; - } - if (deviceSet) { - const aclError _ret = aclrtResetDevice(deviceId); - if (_ret != ACL_SUCCESS) { - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", - "aclrtResetDevice(deviceId)", (int)_ret, __FILE__, __LINE__); - } - } - if (aclInited) { - const aclError _ret = aclFinalize(); - if (_ret != ACL_SUCCESS) { - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", - "aclFinalize()", (int)_ret, __FILE__, __LINE__); - } - } - - return rc; -} diff --git a/test/vpto/cases/micro-op/materialization-predicate/pdintlv_b8/compare.py b/test/vpto/cases/micro-op/materialization-predicate/pdintlv_b8/compare.py deleted file mode 100755 index 305f17e97..000000000 --- a/test/vpto/cases/micro-op/materialization-predicate/pdintlv_b8/compare.py +++ /dev/null @@ -1,64 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) 2026 Huawei Technologies Co., Ltd. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. 
- -# case: micro-op/materialization-predicate/pdintlv_b8 -# family: materialization-predicate -# target_ops: pto.pdintlv_b8 -# scenarios: predicate-transform, lane-order -# coding=utf-8 - -import os -import sys - -import numpy as np - - -EXPECTED_WORDS = 32 -PREFIX_WORDS = 16 - - -def compare_words(golden_path, output_path): - if not os.path.exists(output_path) or not os.path.exists(golden_path): - print(f"[ERROR] Missing file: golden={golden_path} out={output_path}") - return False - golden = np.fromfile(golden_path, dtype=np.uint32) - output = np.fromfile(output_path, dtype=np.uint32) - if golden.size != EXPECTED_WORDS or output.size != EXPECTED_WORDS: - print( - f"[ERROR] Unexpected word count: golden={golden.size} " - f"out={output.size} expected={EXPECTED_WORDS}" - ) - return False - golden = golden[:PREFIX_WORDS] - output = output[:PREFIX_WORDS] - if not np.array_equal(golden, output): - diff = np.nonzero(golden != output)[0] - idx = int(diff[0]) if diff.size else 0 - print( - f"[ERROR] Mismatch (packed predicate words): idx={idx} " - f"golden={int(golden[idx])} out={int(output[idx])}" - ) - return False - return True - - -def main(): - strict = os.getenv("COMPARE_STRICT", "1") != "0" - ok = compare_words("golden_v1.bin", "v1.bin") - if not ok: - if strict: - print("[ERROR] compare failed") - sys.exit(2) - print("[WARN] compare failed (non-gating)") - return - print("[INFO] compare passed") - - -if __name__ == "__main__": - main() diff --git a/test/vpto/cases/micro-op/materialization-predicate/pdintlv_b8/golden.py b/test/vpto/cases/micro-op/materialization-predicate/pdintlv_b8/golden.py deleted file mode 100755 index e0ed75f95..000000000 --- a/test/vpto/cases/micro-op/materialization-predicate/pdintlv_b8/golden.py +++ /dev/null @@ -1,49 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) 2026 Huawei Technologies Co., Ltd. 
-# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. - -# case: micro-op/materialization-predicate/pdintlv_b8 -# family: materialization-predicate -# target_ops: pto.pdintlv_b8 -# scenarios: predicate-transform, lane-order -# coding=utf-8 - -import argparse -from pathlib import Path - -import numpy as np - - -SEED = 19 -OUTPUT_WORDS = 32 -GOLDEN_PREFIX_WORDS = np.array([4294967295, 4294967295, 4294967295, 4294967295, 0, 0, 0, 0, 4294967295, 4294967295, 4294967295, 4294967295, 0, 0, 0, 0], dtype=np.uint32) - - -def generate(output_dir: Path, seed: int) -> None: - del seed - output_init = np.zeros((OUTPUT_WORDS,), dtype=np.uint32) - golden = np.zeros((OUTPUT_WORDS,), dtype=np.uint32) - golden[: GOLDEN_PREFIX_WORDS.size] = GOLDEN_PREFIX_WORDS - - output_dir.mkdir(parents=True, exist_ok=True) - output_init.tofile(output_dir / "v1.bin") - golden.tofile(output_dir / "golden_v1.bin") - - -def main() -> None: - parser = argparse.ArgumentParser( - description="Generate packed predicate golden for VPTO micro-op validation." 
- ) - parser.add_argument("--output-dir", type=Path, default=Path(".")) - parser.add_argument("--seed", type=int, default=SEED) - args = parser.parse_args() - generate(args.output_dir, args.seed) - - -if __name__ == "__main__": - main() diff --git a/test/vpto/cases/micro-op/materialization-predicate/pdintlv_b8/kernel.pto b/test/vpto/cases/micro-op/materialization-predicate/pdintlv_b8/kernel.pto deleted file mode 100644 index 29170023f..000000000 --- a/test/vpto/cases/micro-op/materialization-predicate/pdintlv_b8/kernel.pto +++ /dev/null @@ -1,38 +0,0 @@ -// ----------------------------------------------------------------------------- -// case: micro-op/materialization-predicate/pdintlv_b8 -// family: materialization-predicate -// target_ops: pto.pdintlv_b8 -// scenarios: predicate-transform, lane-order -// ----------------------------------------------------------------------------- -module attributes {pto.target_arch = "a5", pto.kernel_kind = #pto.kernel_kind} { - func.func @pdintlv_b8_kernel_2d(%arg0: !pto.ptr) attributes {pto.kernel} { - %c0 = arith.constant 0 : index - %c1 = arith.constant 1 : index - %c32 = arith.constant 32 : index - %c16 = arith.constant 16 : index - %c0_i64 = arith.constant 0 : i64 - %c1_i64 = arith.constant 1 : i64 - %c64_i64 = arith.constant 64 : i64 - - %ub_out = pto.castptr %c0_i64 : i64 -> !pto.ptr - %gm_out = pto.castptr %arg0 : !pto.ptr -> !pto.ptr - - pto.vecscope { - scf.for %iter = %c0 to %c1 step %c1 { - %lhs = pto.pset_b8 "PAT_ALL" : !pto.mask - %rhs = pto.pset_b8 "PAT_ALLF" : !pto.mask - %low, %high = pto.pdintlv_b8 %lhs, %rhs : !pto.mask, !pto.mask -> !pto.mask, !pto.mask - pto.psts %low, %ub_out[%c0], "NORM" : !pto.mask, !pto.ptr, index - pto.psts %high, %ub_out[%c32], "NORM" : !pto.mask, !pto.ptr, index - } - } - - pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.mte_ub_gm %ub_out, %gm_out, %c64_i64 - nburst(%c1_i64, %c64_i64, %c64_i64) - : !pto.ptr, !pto.ptr, i64, 
i64, i64, i64 - pto.barrier #pto.pipe - return - } -} diff --git a/test/vpto/cases/micro-op/materialization-predicate/pdintlv_b8/launch.cpp b/test/vpto/cases/micro-op/materialization-predicate/pdintlv_b8/launch.cpp deleted file mode 100644 index a8a45dada..000000000 --- a/test/vpto/cases/micro-op/materialization-predicate/pdintlv_b8/launch.cpp +++ /dev/null @@ -1,48 +0,0 @@ -// Copyright (c) 2026 Huawei Technologies Co., Ltd. -// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). -// Please refer to the License for details. You may not use this file except in compliance with the License. -// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -// See LICENSE in the root of the software repository for the full text of the License. 
- -// ----------------------------------------------------------------------------- -// case: micro-op/materialization-predicate/pdintlv_b8 -// family: materialization-predicate -// target_ops: pto.pdintlv_b8 -// scenarios: predicate-transform, lane-order -// --------------------------------------------------------------------------- -// PTOAS compatibility layer -#ifndef __VEC_SCOPE__ -#define __VEC_SCOPE__ -#endif - -#if defined(__CCE_AICORE__) && defined(__NPU_ARCH__) && (__NPU_ARCH__ == 2201) -typedef struct { unsigned char v; } hifloat8_t; -typedef struct { unsigned char v; } float8_e4m3_t; -typedef struct { unsigned char v; } float8_e5m2_t; -typedef struct { unsigned char v; } float8_e8m0_t; -typedef struct { unsigned char v; } float4_e1m2x2_t; -typedef struct { unsigned char v; } float4_e2m1x2_t; -#endif -#include -#if defined(__CCE_AICORE__) && defined(PTOAS_ENABLE_CCE_PRINT) -#include -#endif -#if !defined(__CCE_AICORE__) && !defined(TMRGSORT_HPP) -struct MrgSortExecutedNumList { - uint16_t mrgSortList0; - uint16_t mrgSortList1; - uint16_t mrgSortList2; - uint16_t mrgSortList3; -}; -#endif -#ifndef __CPU_SIM -#include "acl/acl.h" -#endif - -extern "C" __global__ [aicore] void pdintlv_b8_kernel_2d(__gm__ uint32_t *v1); - -void LaunchPdintlvB8(uint32_t *v1, void *stream) { - pdintlv_b8_kernel_2d<<<1, nullptr, stream>>>((__gm__ uint32_t *)v1); -} diff --git a/test/vpto/cases/micro-op/materialization-predicate/pdintlv_b8/main.cpp b/test/vpto/cases/micro-op/materialization-predicate/pdintlv_b8/main.cpp deleted file mode 100644 index d0edfba37..000000000 --- a/test/vpto/cases/micro-op/materialization-predicate/pdintlv_b8/main.cpp +++ /dev/null @@ -1,104 +0,0 @@ -// Copyright (c) 2026 Huawei Technologies Co., Ltd. -// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). -// Please refer to the License for details. 
You may not use this file except in compliance with the License. -// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -// See LICENSE in the root of the software repository for the full text of the License. - -// ----------------------------------------------------------------------------- -// case: micro-op/materialization-predicate/pdintlv_b8 -// family: materialization-predicate -// target_ops: pto.pdintlv_b8 -// scenarios: predicate-transform, lane-order -/** -Copyright (c) 2025 Huawei Technologies Co., Ltd. -This program is free software, you can redistribute it and/or modify it under the terms and conditions of -CANN Open Software License Agreement Version 2.0 (the "License"). -Please refer to the License for details. You may not use this file except in compliance with the License. -THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -See LICENSE in the root of the software repository for the full text of the License. 
-*/ - -#include "test_common.h" -#include "acl/acl.h" -#include -#include -#include - -using namespace PtoTestCommon; - -#ifndef TMRGSORT_HPP -struct MrgSortExecutedNumList { - uint16_t mrgSortList0; - uint16_t mrgSortList1; - uint16_t mrgSortList2; - uint16_t mrgSortList3; -}; -#endif - -#define ACL_CHECK(expr) do { const aclError _ret = (expr); if (_ret != ACL_SUCCESS) { std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", #expr, (int)_ret, __FILE__, __LINE__); const char *_recent = aclGetRecentErrMsg(); if (_recent != nullptr && _recent[0] != '\0') { std::fprintf(stderr, "[ERROR] RecentErrMsg: %s\n", _recent); } rc = 1; goto cleanup; } } while (0) - -void LaunchPdintlvB8(uint32_t *v1, void *stream); - -int main() { - size_t elemCount_v1 = 32; - size_t fileSize_v1 = elemCount_v1 * sizeof(uint32_t); - uint32_t *v1Host = nullptr; - uint32_t *v1Device = nullptr; - - int rc = 0; - bool aclInited = false; - bool deviceSet = false; - int deviceId = 0; - aclrtStream stream = nullptr; - - ACL_CHECK(aclInit(nullptr)); - aclInited = true; - if (const char *envDevice = std::getenv("ACL_DEVICE_ID")) { - deviceId = std::atoi(envDevice); - } - ACL_CHECK(aclrtSetDevice(deviceId)); - deviceSet = true; - ACL_CHECK(aclrtCreateStream(&stream)); - - ACL_CHECK(aclrtMallocHost((void **)(&v1Host), fileSize_v1)); - ACL_CHECK(aclrtMalloc((void **)&v1Device, fileSize_v1, ACL_MEM_MALLOC_HUGE_FIRST)); - - ReadFile("./v1.bin", fileSize_v1, v1Host, fileSize_v1); - ACL_CHECK(aclrtMemcpy(v1Device, fileSize_v1, v1Host, fileSize_v1, ACL_MEMCPY_HOST_TO_DEVICE)); - LaunchPdintlvB8(v1Device, stream); - - ACL_CHECK(aclrtSynchronizeStream(stream)); - ACL_CHECK(aclrtMemcpy(v1Host, fileSize_v1, v1Device, fileSize_v1, ACL_MEMCPY_DEVICE_TO_HOST)); - WriteFile("./v1.bin", v1Host, fileSize_v1); - -cleanup: - aclrtFree(v1Device); - aclrtFreeHost(v1Host); - if (stream != nullptr) { - const aclError _ret = aclrtDestroyStream(stream); - if (_ret != ACL_SUCCESS) { - std::fprintf(stderr, "[ERROR] %s failed: %d 
(%s:%d)\n", - "aclrtDestroyStream(stream)", (int)_ret, __FILE__, __LINE__); - } - stream = nullptr; - } - if (deviceSet) { - const aclError _ret = aclrtResetDevice(deviceId); - if (_ret != ACL_SUCCESS) { - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", - "aclrtResetDevice(deviceId)", (int)_ret, __FILE__, __LINE__); - } - } - if (aclInited) { - const aclError _ret = aclFinalize(); - if (_ret != ACL_SUCCESS) { - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", - "aclFinalize()", (int)_ret, __FILE__, __LINE__); - } - } - - return rc; -} diff --git a/test/vpto/cases/micro-op/materialization-predicate/pge-tail-mask-boundary/compare.py b/test/vpto/cases/micro-op/materialization-predicate/pge-tail-mask-boundary/compare.py deleted file mode 100755 index 8700b0d56..000000000 --- a/test/vpto/cases/micro-op/materialization-predicate/pge-tail-mask-boundary/compare.py +++ /dev/null @@ -1,53 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) 2026 Huawei Technologies Co., Ltd. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. 
- -# case: micro-op/materialization-predicate/pge-tail-mask-boundary -# family: materialization-predicate -# target_ops: pto.pge_b16, pto.pge_b32, pto.pge_b8 -# scenarios: tail-mask, boundary -# coding=utf-8 - -import os -import sys -import numpy as np - -EXPECTED_WORDS = 32 - - -def compare_words(golden_path, output_path): - if not os.path.exists(output_path) or not os.path.exists(golden_path): - print(f"[ERROR] Missing file: golden={golden_path} out={output_path}") - return False - golden = np.fromfile(golden_path, dtype=np.uint32) - output = np.fromfile(output_path, dtype=np.uint32) - if golden.size != EXPECTED_WORDS or output.size != EXPECTED_WORDS: - print(f"[ERROR] Unexpected word count: golden={golden.size} out={output.size}") - return False - if not np.array_equal(golden, output): - diff = np.nonzero(golden != output)[0] - idx = int(diff[0]) if diff.size else 0 - print(f"[ERROR] Mismatch (packed predicate words): idx={idx} golden={int(golden[idx])} out={int(output[idx])}") - return False - return True - - -def main(): - strict = os.getenv("COMPARE_STRICT", "1") != "0" - ok = compare_words("golden_v1.bin", "v1.bin") - if not ok: - if strict: - print("[ERROR] compare failed") - sys.exit(2) - print("[WARN] compare failed (non-gating)") - return - print("[INFO] compare passed") - - -if __name__ == "__main__": - main() diff --git a/test/vpto/cases/micro-op/materialization-predicate/pge-tail-mask-boundary/golden.py b/test/vpto/cases/micro-op/materialization-predicate/pge-tail-mask-boundary/golden.py deleted file mode 100755 index 67f3b65df..000000000 --- a/test/vpto/cases/micro-op/materialization-predicate/pge-tail-mask-boundary/golden.py +++ /dev/null @@ -1,55 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) 2026 Huawei Technologies Co., Ltd. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. 
You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. - -# case: micro-op/materialization-predicate/pge-tail-mask-boundary -# family: materialization-predicate -# target_ops: pto.pge_b16, pto.pge_b32, pto.pge_b8 -# scenarios: tail-mask, boundary -# coding=utf-8 - -import argparse -from pathlib import Path - -import numpy as np - - -SEED = 19 -OUTPUT_WORDS = 32 - - -def _pack_prefix(active_lanes: int, bit_stride: int, store_bytes: int) -> np.ndarray: - out = np.zeros((store_bytes,), dtype=np.uint8) - for lane in range(active_lanes): - bit_index = lane * bit_stride - out[bit_index // 8] |= np.uint8(1 << (bit_index % 8)) - return out - - -def generate(output_dir: Path, seed: int) -> None: - del seed - v1 = np.zeros((OUTPUT_WORDS,), dtype=np.uint32) - golden = np.zeros((OUTPUT_WORDS * 4,), dtype=np.uint8) - golden[0:32] = _pack_prefix(active_lanes=1, bit_stride=1, store_bytes=32) - golden[32:64] = _pack_prefix(active_lanes=1, bit_stride=2, store_bytes=32) - golden[64:96] = _pack_prefix(active_lanes=1, bit_stride=4, store_bytes=32) - output_dir.mkdir(parents=True, exist_ok=True) - v1.tofile(output_dir / "v1.bin") - golden.view(np.uint32).tofile(output_dir / "golden_v1.bin") - - -def main() -> None: - parser = argparse.ArgumentParser() - parser.add_argument("--output-dir", type=Path, default=Path(".")) - parser.add_argument("--seed", type=int, default=SEED) - args = parser.parse_args() - generate(args.output_dir, args.seed) - - -if __name__ == "__main__": - main() diff --git a/test/vpto/cases/micro-op/materialization-predicate/pge-tail-mask-boundary/kernel.pto b/test/vpto/cases/micro-op/materialization-predicate/pge-tail-mask-boundary/kernel.pto deleted file mode 
100644 index f906d01ce..000000000 --- a/test/vpto/cases/micro-op/materialization-predicate/pge-tail-mask-boundary/kernel.pto +++ /dev/null @@ -1,39 +0,0 @@ -// ----------------------------------------------------------------------------- -// case: micro-op/materialization-predicate/pge-tail-mask-boundary -// family: materialization-predicate -// target_ops: pto.pge_b16, pto.pge_b32, pto.pge_b8 -// scenarios: tail-mask, boundary -// ----------------------------------------------------------------------------- -module attributes {pto.target_arch = "a5", pto.kernel_kind = #pto.kernel_kind} { - func.func @pge_tail_mask_boundary_kernel_2d(%arg0: !pto.ptr) attributes {pto.kernel} { - %c0 = arith.constant 0 : index - %c1 = arith.constant 1 : index - %c32 = arith.constant 32 : index - %c64 = arith.constant 64 : index - %c0_i64 = arith.constant 0 : i64 - %c1_i64 = arith.constant 1 : i64 - %c96_i64 = arith.constant 96 : i64 - - %ub_out = pto.castptr %c0_i64 : i64 -> !pto.ptr - %gm_out = pto.castptr %arg0 : !pto.ptr -> !pto.ptr - - pto.vecscope { - scf.for %iter = %c0 to %c1 step %c1 { - %m0 = pto.pge_b8 "PAT_VL1" : !pto.mask - %m1 = pto.pge_b16 "PAT_VL1" : !pto.mask - %m2 = pto.pge_b32 "PAT_VL1" : !pto.mask - pto.psts %m0, %ub_out[%c0], "NORM" : !pto.mask, !pto.ptr, index - pto.psts %m1, %ub_out[%c32], "NORM" : !pto.mask, !pto.ptr, index - pto.psts %m2, %ub_out[%c64], "NORM" : !pto.mask, !pto.ptr, index - } - } - - pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.mte_ub_gm %ub_out, %gm_out, %c96_i64 - nburst(%c1_i64, %c96_i64, %c96_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64 - pto.barrier #pto.pipe - return - } -} diff --git a/test/vpto/cases/micro-op/materialization-predicate/pge-tail-mask-boundary/launch.cpp b/test/vpto/cases/micro-op/materialization-predicate/pge-tail-mask-boundary/launch.cpp deleted file mode 100644 index 6a52d74b3..000000000 --- 
a/test/vpto/cases/micro-op/materialization-predicate/pge-tail-mask-boundary/launch.cpp +++ /dev/null @@ -1,48 +0,0 @@ -// Copyright (c) 2026 Huawei Technologies Co., Ltd. -// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). -// Please refer to the License for details. You may not use this file except in compliance with the License. -// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -// See LICENSE in the root of the software repository for the full text of the License. - -// ----------------------------------------------------------------------------- -// case: micro-op/materialization-predicate/pge-tail-mask-boundary -// family: materialization-predicate -// target_ops: pto.pge_b16, pto.pge_b32, pto.pge_b8 -// scenarios: tail-mask, boundary -// --------------------------------------------------------------------------- -// PTOAS compatibility layer -#ifndef __VEC_SCOPE__ -#define __VEC_SCOPE__ -#endif - -#if defined(__CCE_AICORE__) && defined(__NPU_ARCH__) && (__NPU_ARCH__ == 2201) -typedef struct { unsigned char v; } hifloat8_t; -typedef struct { unsigned char v; } float8_e4m3_t; -typedef struct { unsigned char v; } float8_e5m2_t; -typedef struct { unsigned char v; } float8_e8m0_t; -typedef struct { unsigned char v; } float4_e1m2x2_t; -typedef struct { unsigned char v; } float4_e2m1x2_t; -#endif -#include -#if defined(__CCE_AICORE__) && defined(PTOAS_ENABLE_CCE_PRINT) -#include -#endif -#if !defined(__CCE_AICORE__) && !defined(TMRGSORT_HPP) -struct MrgSortExecutedNumList { - uint16_t mrgSortList0; - uint16_t mrgSortList1; - uint16_t mrgSortList2; - uint16_t mrgSortList3; -}; -#endif -#ifndef __CPU_SIM -#include "acl/acl.h" -#endif - -extern "C" __global__ [aicore] void 
pge_tail_mask_boundary_kernel_2d(__gm__ uint32_t *v1); - -void LaunchPgeTailMaskBoundary(uint32_t *v1, void *stream) { - pge_tail_mask_boundary_kernel_2d<<<1, nullptr, stream>>>((__gm__ uint32_t *)v1); -} diff --git a/test/vpto/cases/micro-op/materialization-predicate/pge-tail-mask-boundary/main.cpp b/test/vpto/cases/micro-op/materialization-predicate/pge-tail-mask-boundary/main.cpp deleted file mode 100644 index c0ca90a8c..000000000 --- a/test/vpto/cases/micro-op/materialization-predicate/pge-tail-mask-boundary/main.cpp +++ /dev/null @@ -1,104 +0,0 @@ -// Copyright (c) 2026 Huawei Technologies Co., Ltd. -// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). -// Please refer to the License for details. You may not use this file except in compliance with the License. -// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -// See LICENSE in the root of the software repository for the full text of the License. - -// ----------------------------------------------------------------------------- -// case: micro-op/materialization-predicate/pge-tail-mask-boundary -// family: materialization-predicate -// target_ops: pto.pge_b16, pto.pge_b32, pto.pge_b8 -// scenarios: tail-mask, boundary -/** -Copyright (c) 2025 Huawei Technologies Co., Ltd. -This program is free software, you can redistribute it and/or modify it under the terms and conditions of -CANN Open Software License Agreement Version 2.0 (the "License"). -Please refer to the License for details. You may not use this file except in compliance with the License. 
-THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -See LICENSE in the root of the software repository for the full text of the License. -*/ - -#include "test_common.h" -#include "acl/acl.h" -#include -#include -#include - -using namespace PtoTestCommon; - -#ifndef TMRGSORT_HPP -struct MrgSortExecutedNumList { - uint16_t mrgSortList0; - uint16_t mrgSortList1; - uint16_t mrgSortList2; - uint16_t mrgSortList3; -}; -#endif - -#define ACL_CHECK(expr) do { const aclError _ret = (expr); if (_ret != ACL_SUCCESS) { std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", #expr, (int)_ret, __FILE__, __LINE__); const char *_recent = aclGetRecentErrMsg(); if (_recent != nullptr && _recent[0] != '\0') { std::fprintf(stderr, "[ERROR] RecentErrMsg: %s\n", _recent); } rc = 1; goto cleanup; } } while (0) - -void LaunchPgeTailMaskBoundary(uint32_t *v1, void *stream); - -int main() { - size_t elemCount_v1 = 32; - size_t fileSize_v1 = elemCount_v1 * sizeof(uint32_t); - uint32_t *v1Host = nullptr; - uint32_t *v1Device = nullptr; - - int rc = 0; - bool aclInited = false; - bool deviceSet = false; - int deviceId = 0; - aclrtStream stream = nullptr; - - ACL_CHECK(aclInit(nullptr)); - aclInited = true; - if (const char *envDevice = std::getenv("ACL_DEVICE_ID")) { - deviceId = std::atoi(envDevice); - } - ACL_CHECK(aclrtSetDevice(deviceId)); - deviceSet = true; - ACL_CHECK(aclrtCreateStream(&stream)); - - ACL_CHECK(aclrtMallocHost((void **)(&v1Host), fileSize_v1)); - ACL_CHECK(aclrtMalloc((void **)&v1Device, fileSize_v1, ACL_MEM_MALLOC_HUGE_FIRST)); - - ReadFile("./v1.bin", fileSize_v1, v1Host, fileSize_v1); - ACL_CHECK(aclrtMemcpy(v1Device, fileSize_v1, v1Host, fileSize_v1, ACL_MEMCPY_HOST_TO_DEVICE)); - LaunchPgeTailMaskBoundary(v1Device, stream); - - ACL_CHECK(aclrtSynchronizeStream(stream)); - ACL_CHECK(aclrtMemcpy(v1Host, 
fileSize_v1, v1Device, fileSize_v1, ACL_MEMCPY_DEVICE_TO_HOST)); - WriteFile("./v1.bin", v1Host, fileSize_v1); - -cleanup: - aclrtFree(v1Device); - aclrtFreeHost(v1Host); - if (stream != nullptr) { - const aclError _ret = aclrtDestroyStream(stream); - if (_ret != ACL_SUCCESS) { - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", - "aclrtDestroyStream(stream)", (int)_ret, __FILE__, __LINE__); - } - stream = nullptr; - } - if (deviceSet) { - const aclError _ret = aclrtResetDevice(deviceId); - if (_ret != ACL_SUCCESS) { - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", - "aclrtResetDevice(deviceId)", (int)_ret, __FILE__, __LINE__); - } - } - if (aclInited) { - const aclError _ret = aclFinalize(); - if (_ret != ACL_SUCCESS) { - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", - "aclFinalize()", (int)_ret, __FILE__, __LINE__); - } - } - - return rc; -} diff --git a/test/vpto/cases/micro-op/materialization-predicate/pge-tail-mask/compare.py b/test/vpto/cases/micro-op/materialization-predicate/pge-tail-mask/compare.py deleted file mode 100755 index 2e598e7ae..000000000 --- a/test/vpto/cases/micro-op/materialization-predicate/pge-tail-mask/compare.py +++ /dev/null @@ -1,54 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) 2026 Huawei Technologies Co., Ltd. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. 
- -# case: micro-op/materialization-predicate/pge-tail-mask -# family: materialization-predicate -# target_ops: pto.pge_b16, pto.pge_b32, pto.pge_b8 -# scenarios: tail-mask -# NOTE: bulk-generated coverage skeleton. -# coding=utf-8 - -import os -import sys -import numpy as np - -EXPECTED_WORDS = 32 - - -def compare_words(golden_path, output_path): - if not os.path.exists(output_path) or not os.path.exists(golden_path): - print(f"[ERROR] Missing file: golden={golden_path} out={output_path}") - return False - golden = np.fromfile(golden_path, dtype=np.uint32) - output = np.fromfile(output_path, dtype=np.uint32) - if golden.size != EXPECTED_WORDS or output.size != EXPECTED_WORDS: - print(f"[ERROR] Unexpected word count: golden={golden.size} out={output.size}") - return False - if not np.array_equal(golden, output): - diff = np.nonzero(golden != output)[0] - idx = int(diff[0]) if diff.size else 0 - print(f"[ERROR] Mismatch (packed predicate words): idx={idx} golden={int(golden[idx])} out={int(output[idx])}") - return False - return True - - -def main(): - strict = os.getenv("COMPARE_STRICT", "1") != "0" - ok = compare_words("golden_v1.bin", "v1.bin") - if not ok: - if strict: - print("[ERROR] compare failed") - sys.exit(2) - print("[WARN] compare failed (non-gating)") - return - print("[INFO] compare passed") - - -if __name__ == "__main__": - main() diff --git a/test/vpto/cases/micro-op/materialization-predicate/pge-tail-mask/golden.py b/test/vpto/cases/micro-op/materialization-predicate/pge-tail-mask/golden.py deleted file mode 100755 index 823fc8889..000000000 --- a/test/vpto/cases/micro-op/materialization-predicate/pge-tail-mask/golden.py +++ /dev/null @@ -1,59 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) 2026 Huawei Technologies Co., Ltd. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. 
You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. - -# case: micro-op/materialization-predicate/pge-tail-mask -# family: materialization-predicate -# target_ops: pto.pge_b16, pto.pge_b32, pto.pge_b8 -# scenarios: tail-mask -# NOTE: bulk-generated coverage skeleton. -# coding=utf-8 - -import argparse -from pathlib import Path - -import numpy as np - - -SEED = 19 -OUTPUT_WORDS = 32 - - -def _pack_prefix(active_lanes: int, bit_stride: int, store_bytes: int) -> np.ndarray: - out = np.zeros((store_bytes,), dtype=np.uint8) - for lane in range(active_lanes): - bit_index = lane * bit_stride - out[bit_index // 8] |= np.uint8(1 << (bit_index % 8)) - return out - - -def generate(output_dir: Path, seed: int) -> None: - del seed - v1 = np.zeros((OUTPUT_WORDS,), dtype=np.uint32) - golden = np.zeros((OUTPUT_WORDS * 4,), dtype=np.uint8) - golden[0:32] = _pack_prefix(active_lanes=8, bit_stride=1, store_bytes=32) - golden[32:64] = _pack_prefix(active_lanes=8, bit_stride=2, store_bytes=32) - golden[64:96] = _pack_prefix(active_lanes=8, bit_stride=4, store_bytes=32) - - output_dir.mkdir(parents=True, exist_ok=True) - v1.tofile(output_dir / "v1.bin") - golden.view(np.uint32).tofile(output_dir / "golden_v1.bin") - - -def main() -> None: - parser = argparse.ArgumentParser( - description="Generate packed predicate golden for VPTO micro-op pge-tail-mask validation." 
- ) - parser.add_argument("--output-dir", type=Path, default=Path(".")) - parser.add_argument("--seed", type=int, default=SEED) - args = parser.parse_args() - generate(args.output_dir, args.seed) - - -if __name__ == "__main__": - main() diff --git a/test/vpto/cases/micro-op/materialization-predicate/pge-tail-mask/kernel.pto b/test/vpto/cases/micro-op/materialization-predicate/pge-tail-mask/kernel.pto deleted file mode 100644 index 4aa80ce57..000000000 --- a/test/vpto/cases/micro-op/materialization-predicate/pge-tail-mask/kernel.pto +++ /dev/null @@ -1,39 +0,0 @@ -// ----------------------------------------------------------------------------- -// case: micro-op/materialization-predicate/pge-tail-mask -// family: materialization-predicate -// target_ops: pto.pge_b16, pto.pge_b32, pto.pge_b8 -// scenarios: tail-mask -// ----------------------------------------------------------------------------- -module attributes {pto.target_arch = "a5", pto.kernel_kind = #pto.kernel_kind} { - func.func @pge_tail_mask_kernel_2d(%arg0: !pto.ptr) attributes {pto.kernel} { - %c0 = arith.constant 0 : index - %c1 = arith.constant 1 : index - %c32 = arith.constant 32 : index - %c64 = arith.constant 64 : index - %c0_i64 = arith.constant 0 : i64 - %c1_i64 = arith.constant 1 : i64 - %c96_i64 = arith.constant 96 : i64 - - %ub_out = pto.castptr %c0_i64 : i64 -> !pto.ptr - %gm_out = pto.castptr %arg0 : !pto.ptr -> !pto.ptr - - pto.vecscope { - scf.for %iter = %c0 to %c1 step %c1 { - %m0 = pto.pge_b8 "PAT_VL8" : !pto.mask - %m1 = pto.pge_b16 "PAT_VL8" : !pto.mask - %m2 = pto.pge_b32 "PAT_VL8" : !pto.mask - pto.psts %m0, %ub_out[%c0], "NORM" : !pto.mask, !pto.ptr, index - pto.psts %m1, %ub_out[%c32], "NORM" : !pto.mask, !pto.ptr, index - pto.psts %m2, %ub_out[%c64], "NORM" : !pto.mask, !pto.ptr, index - } - } - - pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.mte_ub_gm %ub_out, %gm_out, %c96_i64 - nburst(%c1_i64, %c96_i64, %c96_i64) - : 
!pto.ptr, !pto.ptr, i64, i64, i64, i64 - pto.barrier #pto.pipe - return - } -} diff --git a/test/vpto/cases/micro-op/materialization-predicate/pge-tail-mask/launch.cpp b/test/vpto/cases/micro-op/materialization-predicate/pge-tail-mask/launch.cpp deleted file mode 100644 index c38434d88..000000000 --- a/test/vpto/cases/micro-op/materialization-predicate/pge-tail-mask/launch.cpp +++ /dev/null @@ -1,48 +0,0 @@ -// Copyright (c) 2026 Huawei Technologies Co., Ltd. -// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). -// Please refer to the License for details. You may not use this file except in compliance with the License. -// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -// See LICENSE in the root of the software repository for the full text of the License. 
- -// ----------------------------------------------------------------------------- -// case: micro-op/materialization-predicate/pge-tail-mask -// family: materialization-predicate -// target_ops: pto.pge_b16, pto.pge_b32, pto.pge_b8 -// scenarios: tail-mask -// --------------------------------------------------------------------------- -// PTOAS compatibility layer -#ifndef __VEC_SCOPE__ -#define __VEC_SCOPE__ -#endif - -#if defined(__CCE_AICORE__) && defined(__NPU_ARCH__) && (__NPU_ARCH__ == 2201) -typedef struct { unsigned char v; } hifloat8_t; -typedef struct { unsigned char v; } float8_e4m3_t; -typedef struct { unsigned char v; } float8_e5m2_t; -typedef struct { unsigned char v; } float8_e8m0_t; -typedef struct { unsigned char v; } float4_e1m2x2_t; -typedef struct { unsigned char v; } float4_e2m1x2_t; -#endif -#include -#if defined(__CCE_AICORE__) && defined(PTOAS_ENABLE_CCE_PRINT) -#include -#endif -#if !defined(__CCE_AICORE__) && !defined(TMRGSORT_HPP) -struct MrgSortExecutedNumList { - uint16_t mrgSortList0; - uint16_t mrgSortList1; - uint16_t mrgSortList2; - uint16_t mrgSortList3; -}; -#endif -#ifndef __CPU_SIM -#include "acl/acl.h" -#endif - -extern "C" __global__ [aicore] void pge_tail_mask_kernel_2d(__gm__ uint32_t *v1); - -void LaunchPgeTailMask(uint32_t *v1, void *stream) { - pge_tail_mask_kernel_2d<<<1, nullptr, stream>>>((__gm__ uint32_t *)v1); -} diff --git a/test/vpto/cases/micro-op/materialization-predicate/pge-tail-mask/main.cpp b/test/vpto/cases/micro-op/materialization-predicate/pge-tail-mask/main.cpp deleted file mode 100644 index dea4fa6c5..000000000 --- a/test/vpto/cases/micro-op/materialization-predicate/pge-tail-mask/main.cpp +++ /dev/null @@ -1,104 +0,0 @@ -// Copyright (c) 2026 Huawei Technologies Co., Ltd. -// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). -// Please refer to the License for details. 
You may not use this file except in compliance with the License. -// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -// See LICENSE in the root of the software repository for the full text of the License. - -// ----------------------------------------------------------------------------- -// case: micro-op/materialization-predicate/pge-tail-mask -// family: materialization-predicate -// target_ops: pto.pge_b16, pto.pge_b32, pto.pge_b8 -// scenarios: tail-mask -/** -Copyright (c) 2025 Huawei Technologies Co., Ltd. -This program is free software, you can redistribute it and/or modify it under the terms and conditions of -CANN Open Software License Agreement Version 2.0 (the "License"). -Please refer to the License for details. You may not use this file except in compliance with the License. -THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -See LICENSE in the root of the software repository for the full text of the License. 
-*/ - -#include "test_common.h" -#include "acl/acl.h" -#include -#include -#include - -using namespace PtoTestCommon; - -#ifndef TMRGSORT_HPP -struct MrgSortExecutedNumList { - uint16_t mrgSortList0; - uint16_t mrgSortList1; - uint16_t mrgSortList2; - uint16_t mrgSortList3; -}; -#endif - -#define ACL_CHECK(expr) do { const aclError _ret = (expr); if (_ret != ACL_SUCCESS) { std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", #expr, (int)_ret, __FILE__, __LINE__); const char *_recent = aclGetRecentErrMsg(); if (_recent != nullptr && _recent[0] != '\0') { std::fprintf(stderr, "[ERROR] RecentErrMsg: %s\n", _recent); } rc = 1; goto cleanup; } } while (0) - -void LaunchPgeTailMask(uint32_t *v1, void *stream); - -int main() { - size_t elemCount_v1 = 32; - size_t fileSize_v1 = elemCount_v1 * sizeof(uint32_t); - uint32_t *v1Host = nullptr; - uint32_t *v1Device = nullptr; - - int rc = 0; - bool aclInited = false; - bool deviceSet = false; - int deviceId = 0; - aclrtStream stream = nullptr; - - ACL_CHECK(aclInit(nullptr)); - aclInited = true; - if (const char *envDevice = std::getenv("ACL_DEVICE_ID")) { - deviceId = std::atoi(envDevice); - } - ACL_CHECK(aclrtSetDevice(deviceId)); - deviceSet = true; - ACL_CHECK(aclrtCreateStream(&stream)); - - ACL_CHECK(aclrtMallocHost((void **)(&v1Host), fileSize_v1)); - ACL_CHECK(aclrtMalloc((void **)&v1Device, fileSize_v1, ACL_MEM_MALLOC_HUGE_FIRST)); - - ReadFile("./v1.bin", fileSize_v1, v1Host, fileSize_v1); - ACL_CHECK(aclrtMemcpy(v1Device, fileSize_v1, v1Host, fileSize_v1, ACL_MEMCPY_HOST_TO_DEVICE)); - LaunchPgeTailMask(v1Device, stream); - - ACL_CHECK(aclrtSynchronizeStream(stream)); - ACL_CHECK(aclrtMemcpy(v1Host, fileSize_v1, v1Device, fileSize_v1, ACL_MEMCPY_DEVICE_TO_HOST)); - WriteFile("./v1.bin", v1Host, fileSize_v1); - -cleanup: - aclrtFree(v1Device); - aclrtFreeHost(v1Host); - if (stream != nullptr) { - const aclError _ret = aclrtDestroyStream(stream); - if (_ret != ACL_SUCCESS) { - std::fprintf(stderr, "[ERROR] %s 
failed: %d (%s:%d)\n", - "aclrtDestroyStream(stream)", (int)_ret, __FILE__, __LINE__); - } - stream = nullptr; - } - if (deviceSet) { - const aclError _ret = aclrtResetDevice(deviceId); - if (_ret != ACL_SUCCESS) { - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", - "aclrtResetDevice(deviceId)", (int)_ret, __FILE__, __LINE__); - } - } - if (aclInited) { - const aclError _ret = aclFinalize(); - if (_ret != ACL_SUCCESS) { - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", - "aclFinalize()", (int)_ret, __FILE__, __LINE__); - } - } - - return rc; -} diff --git a/test/vpto/cases/micro-op/materialization-predicate/pintlv_b16-nontrivial/compare.py b/test/vpto/cases/micro-op/materialization-predicate/pintlv_b16-nontrivial/compare.py deleted file mode 100755 index 7704bbfb7..000000000 --- a/test/vpto/cases/micro-op/materialization-predicate/pintlv_b16-nontrivial/compare.py +++ /dev/null @@ -1,64 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) 2026 Huawei Technologies Co., Ltd. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. 
- -# case: micro-op/materialization-predicate/pintlv_b16-nontrivial -# family: materialization-predicate -# target_ops: pto.pintlv_b16 -# scenarios: predicate-transform, lane-order, representative-logical-elements -# coding=utf-8 - -import os -import sys - -import numpy as np - - -EXPECTED_WORDS = 32 -PREFIX_WORDS = 16 - - -def compare_words(golden_path, output_path): - if not os.path.exists(output_path) or not os.path.exists(golden_path): - print(f"[ERROR] Missing file: golden={golden_path} out={output_path}") - return False - golden = np.fromfile(golden_path, dtype=np.uint32) - output = np.fromfile(output_path, dtype=np.uint32) - if golden.size != EXPECTED_WORDS or output.size != EXPECTED_WORDS: - print( - f"[ERROR] Unexpected word count: golden={golden.size} " - f"out={output.size} expected={EXPECTED_WORDS}" - ) - return False - golden = golden[:PREFIX_WORDS] - output = output[:PREFIX_WORDS] - if not np.array_equal(golden, output): - diff = np.nonzero(golden != output)[0] - idx = int(diff[0]) if diff.size else 0 - print( - f"[ERROR] Mismatch (packed predicate words): idx={idx} " - f"golden={int(golden[idx])} out={int(output[idx])}" - ) - return False - return True - - -def main(): - strict = os.getenv("COMPARE_STRICT", "1") != "0" - ok = compare_words("golden_v1.bin", "v1.bin") - if not ok: - if strict: - print("[ERROR] compare failed") - sys.exit(2) - print("[WARN] compare failed (non-gating)") - return - print("[INFO] compare passed") - - -if __name__ == "__main__": - main() diff --git a/test/vpto/cases/micro-op/materialization-predicate/pintlv_b16-nontrivial/golden.py b/test/vpto/cases/micro-op/materialization-predicate/pintlv_b16-nontrivial/golden.py deleted file mode 100755 index f1729a845..000000000 --- a/test/vpto/cases/micro-op/materialization-predicate/pintlv_b16-nontrivial/golden.py +++ /dev/null @@ -1,49 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) 2026 Huawei Technologies Co., Ltd. 
-# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. - -# case: micro-op/materialization-predicate/pintlv_b16-nontrivial -# family: materialization-predicate -# target_ops: pto.pintlv_b16 -# scenarios: predicate-transform, lane-order, representative-logical-elements -# coding=utf-8 - -import argparse -from pathlib import Path - -import numpy as np - - -SEED = 19 -OUTPUT_WORDS = 32 -GOLDEN_PREFIX_WORDS = np.array([286593301, 262148, 262148, 262148, 262148, 262148, 262148, 262148, 262148, 262148, 262148, 262148, 262148, 262148, 262148, 262148], dtype=np.uint32) - - -def generate(output_dir: Path, seed: int) -> None: - del seed - output_init = np.zeros((OUTPUT_WORDS,), dtype=np.uint32) - golden = np.zeros((OUTPUT_WORDS,), dtype=np.uint32) - golden[: GOLDEN_PREFIX_WORDS.size] = GOLDEN_PREFIX_WORDS - - output_dir.mkdir(parents=True, exist_ok=True) - output_init.tofile(output_dir / "v1.bin") - golden.tofile(output_dir / "golden_v1.bin") - - -def main() -> None: - parser = argparse.ArgumentParser( - description="Generate packed predicate golden for VPTO micro-op validation." 
- ) - parser.add_argument("--output-dir", type=Path, default=Path(".")) - parser.add_argument("--seed", type=int, default=SEED) - args = parser.parse_args() - generate(args.output_dir, args.seed) - - -if __name__ == "__main__": - main() diff --git a/test/vpto/cases/micro-op/materialization-predicate/pintlv_b16-nontrivial/kernel.pto b/test/vpto/cases/micro-op/materialization-predicate/pintlv_b16-nontrivial/kernel.pto deleted file mode 100644 index f72ab5de7..000000000 --- a/test/vpto/cases/micro-op/materialization-predicate/pintlv_b16-nontrivial/kernel.pto +++ /dev/null @@ -1,38 +0,0 @@ -// ----------------------------------------------------------------------------- -// case: micro-op/materialization-predicate/pintlv_b16-nontrivial -// family: materialization-predicate -// target_ops: pto.pintlv_b16 -// scenarios: predicate-transform, lane-order, nontrivial-pattern -// ----------------------------------------------------------------------------- -module attributes {pto.target_arch = "a5", pto.kernel_kind = #pto.kernel_kind} { - func.func @pintlv_b16_nontrivial_kernel_2d(%arg0: !pto.ptr) attributes {pto.kernel} { - %c0 = arith.constant 0 : index - %c1 = arith.constant 1 : index - %c32 = arith.constant 32 : index - %c16 = arith.constant 16 : index - %c0_i64 = arith.constant 0 : i64 - %c1_i64 = arith.constant 1 : i64 - %c64_i64 = arith.constant 64 : i64 - - %ub_out = pto.castptr %c0_i64 : i64 -> !pto.ptr - %gm_out = pto.castptr %arg0 : !pto.ptr -> !pto.ptr - - pto.vecscope { - scf.for %iter = %c0 to %c1 step %c1 { - %lhs = pto.pset_b16 "PAT_VL8" : !pto.mask - %rhs = pto.pset_b16 "PAT_M4" : !pto.mask - %low, %high = pto.pintlv_b16 %lhs, %rhs : !pto.mask, !pto.mask -> !pto.mask, !pto.mask - pto.psts %low, %ub_out[%c0], "NORM" : !pto.mask, !pto.ptr, index - pto.psts %high, %ub_out[%c32], "NORM" : !pto.mask, !pto.ptr, index - } - } - - pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.mte_ub_gm %ub_out, %gm_out, 
%c64_i64 - nburst(%c1_i64, %c64_i64, %c64_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64 - pto.barrier #pto.pipe - return - } -} diff --git a/test/vpto/cases/micro-op/materialization-predicate/pintlv_b16-nontrivial/launch.cpp b/test/vpto/cases/micro-op/materialization-predicate/pintlv_b16-nontrivial/launch.cpp deleted file mode 100644 index 57939cac6..000000000 --- a/test/vpto/cases/micro-op/materialization-predicate/pintlv_b16-nontrivial/launch.cpp +++ /dev/null @@ -1,48 +0,0 @@ -// Copyright (c) 2026 Huawei Technologies Co., Ltd. -// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). -// Please refer to the License for details. You may not use this file except in compliance with the License. -// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -// See LICENSE in the root of the software repository for the full text of the License. 
- -// ----------------------------------------------------------------------------- -// case: micro-op/materialization-predicate/pintlv_b16-nontrivial -// family: materialization-predicate -// target_ops: pto.pintlv_b16 -// scenarios: predicate-transform, lane-order, nontrivial-pattern -// --------------------------------------------------------------------------- -// PTOAS compatibility layer -#ifndef __VEC_SCOPE__ -#define __VEC_SCOPE__ -#endif - -#if defined(__CCE_AICORE__) && defined(__NPU_ARCH__) && (__NPU_ARCH__ == 2201) -typedef struct { unsigned char v; } hifloat8_t; -typedef struct { unsigned char v; } float8_e4m3_t; -typedef struct { unsigned char v; } float8_e5m2_t; -typedef struct { unsigned char v; } float8_e8m0_t; -typedef struct { unsigned char v; } float4_e1m2x2_t; -typedef struct { unsigned char v; } float4_e2m1x2_t; -#endif -#include -#if defined(__CCE_AICORE__) && defined(PTOAS_ENABLE_CCE_PRINT) -#include -#endif -#if !defined(__CCE_AICORE__) && !defined(TMRGSORT_HPP) -struct MrgSortExecutedNumList { - uint16_t mrgSortList0; - uint16_t mrgSortList1; - uint16_t mrgSortList2; - uint16_t mrgSortList3; -}; -#endif -#ifndef __CPU_SIM -#include "acl/acl.h" -#endif - -extern "C" __global__ [aicore] void pintlv_b16_nontrivial_kernel_2d(__gm__ uint32_t *v1); - -void LaunchPintlvB16Nontrivial(uint32_t *v1, void *stream) { - pintlv_b16_nontrivial_kernel_2d<<<1, nullptr, stream>>>((__gm__ uint32_t *)v1); -} diff --git a/test/vpto/cases/micro-op/materialization-predicate/pintlv_b16-nontrivial/main.cpp b/test/vpto/cases/micro-op/materialization-predicate/pintlv_b16-nontrivial/main.cpp deleted file mode 100644 index aca9caf7a..000000000 --- a/test/vpto/cases/micro-op/materialization-predicate/pintlv_b16-nontrivial/main.cpp +++ /dev/null @@ -1,104 +0,0 @@ -// Copyright (c) 2026 Huawei Technologies Co., Ltd. 
-// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). -// Please refer to the License for details. You may not use this file except in compliance with the License. -// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -// See LICENSE in the root of the software repository for the full text of the License. - -// ----------------------------------------------------------------------------- -// case: micro-op/materialization-predicate/pintlv_b16-nontrivial -// family: materialization-predicate -// target_ops: pto.pintlv_b16 -// scenarios: predicate-transform, lane-order, nontrivial-pattern -/** -Copyright (c) 2025 Huawei Technologies Co., Ltd. -This program is free software, you can redistribute it and/or modify it under the terms and conditions of -CANN Open Software License Agreement Version 2.0 (the "License"). -Please refer to the License for details. You may not use this file except in compliance with the License. -THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -See LICENSE in the root of the software repository for the full text of the License. 
-*/ - -#include "test_common.h" -#include "acl/acl.h" -#include -#include -#include - -using namespace PtoTestCommon; - -#ifndef TMRGSORT_HPP -struct MrgSortExecutedNumList { - uint16_t mrgSortList0; - uint16_t mrgSortList1; - uint16_t mrgSortList2; - uint16_t mrgSortList3; -}; -#endif - -#define ACL_CHECK(expr) do { const aclError _ret = (expr); if (_ret != ACL_SUCCESS) { std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", #expr, (int)_ret, __FILE__, __LINE__); const char *_recent = aclGetRecentErrMsg(); if (_recent != nullptr && _recent[0] != '\0') { std::fprintf(stderr, "[ERROR] RecentErrMsg: %s\n", _recent); } rc = 1; goto cleanup; } } while (0) - -void LaunchPintlvB16Nontrivial(uint32_t *v1, void *stream); - -int main() { - size_t elemCount_v1 = 32; - size_t fileSize_v1 = elemCount_v1 * sizeof(uint32_t); - uint32_t *v1Host = nullptr; - uint32_t *v1Device = nullptr; - - int rc = 0; - bool aclInited = false; - bool deviceSet = false; - int deviceId = 0; - aclrtStream stream = nullptr; - - ACL_CHECK(aclInit(nullptr)); - aclInited = true; - if (const char *envDevice = std::getenv("ACL_DEVICE_ID")) { - deviceId = std::atoi(envDevice); - } - ACL_CHECK(aclrtSetDevice(deviceId)); - deviceSet = true; - ACL_CHECK(aclrtCreateStream(&stream)); - - ACL_CHECK(aclrtMallocHost((void **)(&v1Host), fileSize_v1)); - ACL_CHECK(aclrtMalloc((void **)&v1Device, fileSize_v1, ACL_MEM_MALLOC_HUGE_FIRST)); - - ReadFile("./v1.bin", fileSize_v1, v1Host, fileSize_v1); - ACL_CHECK(aclrtMemcpy(v1Device, fileSize_v1, v1Host, fileSize_v1, ACL_MEMCPY_HOST_TO_DEVICE)); - LaunchPintlvB16Nontrivial(v1Device, stream); - - ACL_CHECK(aclrtSynchronizeStream(stream)); - ACL_CHECK(aclrtMemcpy(v1Host, fileSize_v1, v1Device, fileSize_v1, ACL_MEMCPY_DEVICE_TO_HOST)); - WriteFile("./v1.bin", v1Host, fileSize_v1); - -cleanup: - aclrtFree(v1Device); - aclrtFreeHost(v1Host); - if (stream != nullptr) { - const aclError _ret = aclrtDestroyStream(stream); - if (_ret != ACL_SUCCESS) { - std::fprintf(stderr, 
"[ERROR] %s failed: %d (%s:%d)\n", - "aclrtDestroyStream(stream)", (int)_ret, __FILE__, __LINE__); - } - stream = nullptr; - } - if (deviceSet) { - const aclError _ret = aclrtResetDevice(deviceId); - if (_ret != ACL_SUCCESS) { - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", - "aclrtResetDevice(deviceId)", (int)_ret, __FILE__, __LINE__); - } - } - if (aclInited) { - const aclError _ret = aclFinalize(); - if (_ret != ACL_SUCCESS) { - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", - "aclFinalize()", (int)_ret, __FILE__, __LINE__); - } - } - - return rc; -} diff --git a/test/vpto/cases/micro-op/materialization-predicate/pintlv_b16/compare.py b/test/vpto/cases/micro-op/materialization-predicate/pintlv_b16/compare.py deleted file mode 100755 index 9c1deb9c8..000000000 --- a/test/vpto/cases/micro-op/materialization-predicate/pintlv_b16/compare.py +++ /dev/null @@ -1,64 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) 2026 Huawei Technologies Co., Ltd. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. 
- -# case: micro-op/materialization-predicate/pintlv_b16 -# family: materialization-predicate -# target_ops: pto.pintlv_b16 -# scenarios: predicate-transform, lane-order -# coding=utf-8 - -import os -import sys - -import numpy as np - - -EXPECTED_WORDS = 32 -PREFIX_WORDS = 16 - - -def compare_words(golden_path, output_path): - if not os.path.exists(output_path) or not os.path.exists(golden_path): - print(f"[ERROR] Missing file: golden={golden_path} out={output_path}") - return False - golden = np.fromfile(golden_path, dtype=np.uint32) - output = np.fromfile(output_path, dtype=np.uint32) - if golden.size != EXPECTED_WORDS or output.size != EXPECTED_WORDS: - print( - f"[ERROR] Unexpected word count: golden={golden.size} " - f"out={output.size} expected={EXPECTED_WORDS}" - ) - return False - golden = golden[:PREFIX_WORDS] - output = output[:PREFIX_WORDS] - if not np.array_equal(golden, output): - diff = np.nonzero(golden != output)[0] - idx = int(diff[0]) if diff.size else 0 - print( - f"[ERROR] Mismatch (packed predicate words): idx={idx} " - f"golden={int(golden[idx])} out={int(output[idx])}" - ) - return False - return True - - -def main(): - strict = os.getenv("COMPARE_STRICT", "1") != "0" - ok = compare_words("golden_v1.bin", "v1.bin") - if not ok: - if strict: - print("[ERROR] compare failed") - sys.exit(2) - print("[WARN] compare failed (non-gating)") - return - print("[INFO] compare passed") - - -if __name__ == "__main__": - main() diff --git a/test/vpto/cases/micro-op/materialization-predicate/pintlv_b16/golden.py b/test/vpto/cases/micro-op/materialization-predicate/pintlv_b16/golden.py deleted file mode 100755 index a52cba6a2..000000000 --- a/test/vpto/cases/micro-op/materialization-predicate/pintlv_b16/golden.py +++ /dev/null @@ -1,49 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) 2026 Huawei Technologies Co., Ltd. 
-# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. - -# case: micro-op/materialization-predicate/pintlv_b16 -# family: materialization-predicate -# target_ops: pto.pintlv_b16 -# scenarios: predicate-transform, lane-order -# coding=utf-8 - -import argparse -from pathlib import Path - -import numpy as np - - -SEED = 19 -OUTPUT_WORDS = 32 -GOLDEN_PREFIX_WORDS = np.array([286331153, 286331153, 286331153, 286331153, 286331153, 286331153, 286331153, 286331153, 286331153, 286331153, 286331153, 286331153, 286331153, 286331153, 286331153, 286331153], dtype=np.uint32) - - -def generate(output_dir: Path, seed: int) -> None: - del seed - output_init = np.zeros((OUTPUT_WORDS,), dtype=np.uint32) - golden = np.zeros((OUTPUT_WORDS,), dtype=np.uint32) - golden[: GOLDEN_PREFIX_WORDS.size] = GOLDEN_PREFIX_WORDS - - output_dir.mkdir(parents=True, exist_ok=True) - output_init.tofile(output_dir / "v1.bin") - golden.tofile(output_dir / "golden_v1.bin") - - -def main() -> None: - parser = argparse.ArgumentParser( - description="Generate packed predicate golden for VPTO micro-op validation." 
- ) - parser.add_argument("--output-dir", type=Path, default=Path(".")) - parser.add_argument("--seed", type=int, default=SEED) - args = parser.parse_args() - generate(args.output_dir, args.seed) - - -if __name__ == "__main__": - main() diff --git a/test/vpto/cases/micro-op/materialization-predicate/pintlv_b16/kernel.pto b/test/vpto/cases/micro-op/materialization-predicate/pintlv_b16/kernel.pto deleted file mode 100644 index 1bbce0777..000000000 --- a/test/vpto/cases/micro-op/materialization-predicate/pintlv_b16/kernel.pto +++ /dev/null @@ -1,38 +0,0 @@ -// ----------------------------------------------------------------------------- -// case: micro-op/materialization-predicate/pintlv_b16 -// family: materialization-predicate -// target_ops: pto.pintlv_b16 -// scenarios: predicate-transform, lane-order -// ----------------------------------------------------------------------------- -module attributes {pto.target_arch = "a5", pto.kernel_kind = #pto.kernel_kind} { - func.func @pintlv_b16_kernel_2d(%arg0: !pto.ptr) attributes {pto.kernel} { - %c0 = arith.constant 0 : index - %c1 = arith.constant 1 : index - %c32 = arith.constant 32 : index - %c16 = arith.constant 16 : index - %c0_i64 = arith.constant 0 : i64 - %c1_i64 = arith.constant 1 : i64 - %c64_i64 = arith.constant 64 : i64 - - %ub_out = pto.castptr %c0_i64 : i64 -> !pto.ptr - %gm_out = pto.castptr %arg0 : !pto.ptr -> !pto.ptr - - pto.vecscope { - scf.for %iter = %c0 to %c1 step %c1 { - %lhs = pto.pset_b16 "PAT_ALL" : !pto.mask - %rhs = pto.pset_b16 "PAT_ALLF" : !pto.mask - %low, %high = pto.pintlv_b16 %lhs, %rhs : !pto.mask, !pto.mask -> !pto.mask, !pto.mask - pto.psts %low, %ub_out[%c0], "NORM" : !pto.mask, !pto.ptr, index - pto.psts %high, %ub_out[%c32], "NORM" : !pto.mask, !pto.ptr, index - } - } - - pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.mte_ub_gm %ub_out, %gm_out, %c64_i64 - nburst(%c1_i64, %c64_i64, %c64_i64) - : !pto.ptr, !pto.ptr, i64, 
i64, i64, i64 - pto.barrier #pto.pipe - return - } -} diff --git a/test/vpto/cases/micro-op/materialization-predicate/pintlv_b16/launch.cpp b/test/vpto/cases/micro-op/materialization-predicate/pintlv_b16/launch.cpp deleted file mode 100644 index 262d87427..000000000 --- a/test/vpto/cases/micro-op/materialization-predicate/pintlv_b16/launch.cpp +++ /dev/null @@ -1,48 +0,0 @@ -// Copyright (c) 2026 Huawei Technologies Co., Ltd. -// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). -// Please refer to the License for details. You may not use this file except in compliance with the License. -// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -// See LICENSE in the root of the software repository for the full text of the License. 
- -// ----------------------------------------------------------------------------- -// case: micro-op/materialization-predicate/pintlv_b16 -// family: materialization-predicate -// target_ops: pto.pintlv_b16 -// scenarios: predicate-transform, lane-order -// --------------------------------------------------------------------------- -// PTOAS compatibility layer -#ifndef __VEC_SCOPE__ -#define __VEC_SCOPE__ -#endif - -#if defined(__CCE_AICORE__) && defined(__NPU_ARCH__) && (__NPU_ARCH__ == 2201) -typedef struct { unsigned char v; } hifloat8_t; -typedef struct { unsigned char v; } float8_e4m3_t; -typedef struct { unsigned char v; } float8_e5m2_t; -typedef struct { unsigned char v; } float8_e8m0_t; -typedef struct { unsigned char v; } float4_e1m2x2_t; -typedef struct { unsigned char v; } float4_e2m1x2_t; -#endif -#include -#if defined(__CCE_AICORE__) && defined(PTOAS_ENABLE_CCE_PRINT) -#include -#endif -#if !defined(__CCE_AICORE__) && !defined(TMRGSORT_HPP) -struct MrgSortExecutedNumList { - uint16_t mrgSortList0; - uint16_t mrgSortList1; - uint16_t mrgSortList2; - uint16_t mrgSortList3; -}; -#endif -#ifndef __CPU_SIM -#include "acl/acl.h" -#endif - -extern "C" __global__ [aicore] void pintlv_b16_kernel_2d(__gm__ uint32_t *v1); - -void LaunchPintlvB16(uint32_t *v1, void *stream) { - pintlv_b16_kernel_2d<<<1, nullptr, stream>>>((__gm__ uint32_t *)v1); -} diff --git a/test/vpto/cases/micro-op/materialization-predicate/pintlv_b16/main.cpp b/test/vpto/cases/micro-op/materialization-predicate/pintlv_b16/main.cpp deleted file mode 100644 index 29156f86a..000000000 --- a/test/vpto/cases/micro-op/materialization-predicate/pintlv_b16/main.cpp +++ /dev/null @@ -1,104 +0,0 @@ -// Copyright (c) 2026 Huawei Technologies Co., Ltd. -// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). -// Please refer to the License for details. 
You may not use this file except in compliance with the License. -// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -// See LICENSE in the root of the software repository for the full text of the License. - -// ----------------------------------------------------------------------------- -// case: micro-op/materialization-predicate/pintlv_b16 -// family: materialization-predicate -// target_ops: pto.pintlv_b16 -// scenarios: predicate-transform, lane-order -/** -Copyright (c) 2025 Huawei Technologies Co., Ltd. -This program is free software, you can redistribute it and/or modify it under the terms and conditions of -CANN Open Software License Agreement Version 2.0 (the "License"). -Please refer to the License for details. You may not use this file except in compliance with the License. -THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -See LICENSE in the root of the software repository for the full text of the License. 
-*/ - -#include "test_common.h" -#include "acl/acl.h" -#include -#include -#include - -using namespace PtoTestCommon; - -#ifndef TMRGSORT_HPP -struct MrgSortExecutedNumList { - uint16_t mrgSortList0; - uint16_t mrgSortList1; - uint16_t mrgSortList2; - uint16_t mrgSortList3; -}; -#endif - -#define ACL_CHECK(expr) do { const aclError _ret = (expr); if (_ret != ACL_SUCCESS) { std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", #expr, (int)_ret, __FILE__, __LINE__); const char *_recent = aclGetRecentErrMsg(); if (_recent != nullptr && _recent[0] != '\0') { std::fprintf(stderr, "[ERROR] RecentErrMsg: %s\n", _recent); } rc = 1; goto cleanup; } } while (0) - -void LaunchPintlvB16(uint32_t *v1, void *stream); - -int main() { - size_t elemCount_v1 = 32; - size_t fileSize_v1 = elemCount_v1 * sizeof(uint32_t); - uint32_t *v1Host = nullptr; - uint32_t *v1Device = nullptr; - - int rc = 0; - bool aclInited = false; - bool deviceSet = false; - int deviceId = 0; - aclrtStream stream = nullptr; - - ACL_CHECK(aclInit(nullptr)); - aclInited = true; - if (const char *envDevice = std::getenv("ACL_DEVICE_ID")) { - deviceId = std::atoi(envDevice); - } - ACL_CHECK(aclrtSetDevice(deviceId)); - deviceSet = true; - ACL_CHECK(aclrtCreateStream(&stream)); - - ACL_CHECK(aclrtMallocHost((void **)(&v1Host), fileSize_v1)); - ACL_CHECK(aclrtMalloc((void **)&v1Device, fileSize_v1, ACL_MEM_MALLOC_HUGE_FIRST)); - - ReadFile("./v1.bin", fileSize_v1, v1Host, fileSize_v1); - ACL_CHECK(aclrtMemcpy(v1Device, fileSize_v1, v1Host, fileSize_v1, ACL_MEMCPY_HOST_TO_DEVICE)); - LaunchPintlvB16(v1Device, stream); - - ACL_CHECK(aclrtSynchronizeStream(stream)); - ACL_CHECK(aclrtMemcpy(v1Host, fileSize_v1, v1Device, fileSize_v1, ACL_MEMCPY_DEVICE_TO_HOST)); - WriteFile("./v1.bin", v1Host, fileSize_v1); - -cleanup: - aclrtFree(v1Device); - aclrtFreeHost(v1Host); - if (stream != nullptr) { - const aclError _ret = aclrtDestroyStream(stream); - if (_ret != ACL_SUCCESS) { - std::fprintf(stderr, "[ERROR] %s failed: %d 
(%s:%d)\n", - "aclrtDestroyStream(stream)", (int)_ret, __FILE__, __LINE__); - } - stream = nullptr; - } - if (deviceSet) { - const aclError _ret = aclrtResetDevice(deviceId); - if (_ret != ACL_SUCCESS) { - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", - "aclrtResetDevice(deviceId)", (int)_ret, __FILE__, __LINE__); - } - } - if (aclInited) { - const aclError _ret = aclFinalize(); - if (_ret != ACL_SUCCESS) { - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", - "aclFinalize()", (int)_ret, __FILE__, __LINE__); - } - } - - return rc; -} diff --git a/test/vpto/cases/micro-op/materialization-predicate/pintlv_b32-nontrivial/compare.py b/test/vpto/cases/micro-op/materialization-predicate/pintlv_b32-nontrivial/compare.py deleted file mode 100755 index daad8dae1..000000000 --- a/test/vpto/cases/micro-op/materialization-predicate/pintlv_b32-nontrivial/compare.py +++ /dev/null @@ -1,64 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) 2026 Huawei Technologies Co., Ltd. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. 
- -# case: micro-op/materialization-predicate/pintlv_b32-nontrivial -# family: materialization-predicate -# target_ops: pto.pintlv_b32 -# scenarios: predicate-transform, lane-order, representative-logical-elements -# coding=utf-8 - -import os -import sys - -import numpy as np - - -EXPECTED_WORDS = 32 -PREFIX_WORDS = 16 - - -def compare_words(golden_path, output_path): - if not os.path.exists(output_path) or not os.path.exists(golden_path): - print(f"[ERROR] Missing file: golden={golden_path} out={output_path}") - return False - golden = np.fromfile(golden_path, dtype=np.uint32) - output = np.fromfile(output_path, dtype=np.uint32) - if golden.size != EXPECTED_WORDS or output.size != EXPECTED_WORDS: - print( - f"[ERROR] Unexpected word count: golden={golden.size} " - f"out={output.size} expected={EXPECTED_WORDS}" - ) - return False - golden = golden[:PREFIX_WORDS] - output = output[:PREFIX_WORDS] - if not np.array_equal(golden, output): - diff = np.nonzero(golden != output)[0] - idx = int(diff[0]) if diff.size else 0 - print( - f"[ERROR] Mismatch (packed predicate words): idx={idx} " - f"golden={int(golden[idx])} out={int(output[idx])}" - ) - return False - return True - - -def main(): - strict = os.getenv("COMPARE_STRICT", "1") != "0" - ok = compare_words("golden_v1.bin", "v1.bin") - if not ok: - if strict: - print("[ERROR] compare failed") - sys.exit(2) - print("[WARN] compare failed (non-gating)") - return - print("[INFO] compare passed") - - -if __name__ == "__main__": - main() diff --git a/test/vpto/cases/micro-op/materialization-predicate/pintlv_b32-nontrivial/golden.py b/test/vpto/cases/micro-op/materialization-predicate/pintlv_b32-nontrivial/golden.py deleted file mode 100755 index c28bc3f71..000000000 --- a/test/vpto/cases/micro-op/materialization-predicate/pintlv_b32-nontrivial/golden.py +++ /dev/null @@ -1,49 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) 2026 Huawei Technologies Co., Ltd. 
-# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. - -# case: micro-op/materialization-predicate/pintlv_b32-nontrivial -# family: materialization-predicate -# target_ops: pto.pintlv_b32 -# scenarios: predicate-transform, lane-order, representative-logical-elements -# coding=utf-8 - -import argparse -from pathlib import Path - -import numpy as np - - -SEED = 19 -OUTPUT_WORDS = 32 -GOLDEN_PREFIX_WORDS = np.array([16843025, 16843025, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16], dtype=np.uint32) - - -def generate(output_dir: Path, seed: int) -> None: - del seed - output_init = np.zeros((OUTPUT_WORDS,), dtype=np.uint32) - golden = np.zeros((OUTPUT_WORDS,), dtype=np.uint32) - golden[: GOLDEN_PREFIX_WORDS.size] = GOLDEN_PREFIX_WORDS - - output_dir.mkdir(parents=True, exist_ok=True) - output_init.tofile(output_dir / "v1.bin") - golden.tofile(output_dir / "golden_v1.bin") - - -def main() -> None: - parser = argparse.ArgumentParser( - description="Generate packed predicate golden for VPTO micro-op validation." 
- ) - parser.add_argument("--output-dir", type=Path, default=Path(".")) - parser.add_argument("--seed", type=int, default=SEED) - args = parser.parse_args() - generate(args.output_dir, args.seed) - - -if __name__ == "__main__": - main() diff --git a/test/vpto/cases/micro-op/materialization-predicate/pintlv_b32-nontrivial/kernel.pto b/test/vpto/cases/micro-op/materialization-predicate/pintlv_b32-nontrivial/kernel.pto deleted file mode 100644 index 8540c0783..000000000 --- a/test/vpto/cases/micro-op/materialization-predicate/pintlv_b32-nontrivial/kernel.pto +++ /dev/null @@ -1,38 +0,0 @@ -// ----------------------------------------------------------------------------- -// case: micro-op/materialization-predicate/pintlv_b32-nontrivial -// family: materialization-predicate -// target_ops: pto.pintlv_b32 -// scenarios: predicate-transform, lane-order, nontrivial-pattern -// ----------------------------------------------------------------------------- -module attributes {pto.target_arch = "a5", pto.kernel_kind = #pto.kernel_kind} { - func.func @pintlv_b32_nontrivial_kernel_2d(%arg0: !pto.ptr) attributes {pto.kernel} { - %c0 = arith.constant 0 : index - %c1 = arith.constant 1 : index - %c32 = arith.constant 32 : index - %c16 = arith.constant 16 : index - %c0_i64 = arith.constant 0 : i64 - %c1_i64 = arith.constant 1 : i64 - %c64_i64 = arith.constant 64 : i64 - - %ub_out = pto.castptr %c0_i64 : i64 -> !pto.ptr - %gm_out = pto.castptr %arg0 : !pto.ptr -> !pto.ptr - - pto.vecscope { - scf.for %iter = %c0 to %c1 step %c1 { - %lhs = pto.pset_b32 "PAT_VL8" : !pto.mask - %rhs = pto.pset_b32 "PAT_M4" : !pto.mask - %low, %high = pto.pintlv_b32 %lhs, %rhs : !pto.mask, !pto.mask -> !pto.mask, !pto.mask - pto.psts %low, %ub_out[%c0], "NORM" : !pto.mask, !pto.ptr, index - pto.psts %high, %ub_out[%c32], "NORM" : !pto.mask, !pto.ptr, index - } - } - - pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.mte_ub_gm %ub_out, %gm_out, 
%c64_i64 - nburst(%c1_i64, %c64_i64, %c64_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64 - pto.barrier #pto.pipe - return - } -} diff --git a/test/vpto/cases/micro-op/materialization-predicate/pintlv_b32-nontrivial/launch.cpp b/test/vpto/cases/micro-op/materialization-predicate/pintlv_b32-nontrivial/launch.cpp deleted file mode 100644 index 06dcd4072..000000000 --- a/test/vpto/cases/micro-op/materialization-predicate/pintlv_b32-nontrivial/launch.cpp +++ /dev/null @@ -1,48 +0,0 @@ -// Copyright (c) 2026 Huawei Technologies Co., Ltd. -// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). -// Please refer to the License for details. You may not use this file except in compliance with the License. -// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -// See LICENSE in the root of the software repository for the full text of the License. 
- -// ----------------------------------------------------------------------------- -// case: micro-op/materialization-predicate/pintlv_b32-nontrivial -// family: materialization-predicate -// target_ops: pto.pintlv_b32 -// scenarios: predicate-transform, lane-order, nontrivial-pattern -// --------------------------------------------------------------------------- -// PTOAS compatibility layer -#ifndef __VEC_SCOPE__ -#define __VEC_SCOPE__ -#endif - -#if defined(__CCE_AICORE__) && defined(__NPU_ARCH__) && (__NPU_ARCH__ == 2201) -typedef struct { unsigned char v; } hifloat8_t; -typedef struct { unsigned char v; } float8_e4m3_t; -typedef struct { unsigned char v; } float8_e5m2_t; -typedef struct { unsigned char v; } float8_e8m0_t; -typedef struct { unsigned char v; } float4_e1m2x2_t; -typedef struct { unsigned char v; } float4_e2m1x2_t; -#endif -#include -#if defined(__CCE_AICORE__) && defined(PTOAS_ENABLE_CCE_PRINT) -#include -#endif -#if !defined(__CCE_AICORE__) && !defined(TMRGSORT_HPP) -struct MrgSortExecutedNumList { - uint16_t mrgSortList0; - uint16_t mrgSortList1; - uint16_t mrgSortList2; - uint16_t mrgSortList3; -}; -#endif -#ifndef __CPU_SIM -#include "acl/acl.h" -#endif - -extern "C" __global__ [aicore] void pintlv_b32_nontrivial_kernel_2d(__gm__ uint32_t *v1); - -void LaunchPintlvB32Nontrivial(uint32_t *v1, void *stream) { - pintlv_b32_nontrivial_kernel_2d<<<1, nullptr, stream>>>((__gm__ uint32_t *)v1); -} diff --git a/test/vpto/cases/micro-op/materialization-predicate/pintlv_b32-nontrivial/main.cpp b/test/vpto/cases/micro-op/materialization-predicate/pintlv_b32-nontrivial/main.cpp deleted file mode 100644 index befee95c9..000000000 --- a/test/vpto/cases/micro-op/materialization-predicate/pintlv_b32-nontrivial/main.cpp +++ /dev/null @@ -1,104 +0,0 @@ -// Copyright (c) 2026 Huawei Technologies Co., Ltd. 
-// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). -// Please refer to the License for details. You may not use this file except in compliance with the License. -// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -// See LICENSE in the root of the software repository for the full text of the License. - -// ----------------------------------------------------------------------------- -// case: micro-op/materialization-predicate/pintlv_b32-nontrivial -// family: materialization-predicate -// target_ops: pto.pintlv_b32 -// scenarios: predicate-transform, lane-order, nontrivial-pattern -/** -Copyright (c) 2025 Huawei Technologies Co., Ltd. -This program is free software, you can redistribute it and/or modify it under the terms and conditions of -CANN Open Software License Agreement Version 2.0 (the "License"). -Please refer to the License for details. You may not use this file except in compliance with the License. -THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -See LICENSE in the root of the software repository for the full text of the License. 
-*/ - -#include "test_common.h" -#include "acl/acl.h" -#include -#include -#include - -using namespace PtoTestCommon; - -#ifndef TMRGSORT_HPP -struct MrgSortExecutedNumList { - uint16_t mrgSortList0; - uint16_t mrgSortList1; - uint16_t mrgSortList2; - uint16_t mrgSortList3; -}; -#endif - -#define ACL_CHECK(expr) do { const aclError _ret = (expr); if (_ret != ACL_SUCCESS) { std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", #expr, (int)_ret, __FILE__, __LINE__); const char *_recent = aclGetRecentErrMsg(); if (_recent != nullptr && _recent[0] != '\0') { std::fprintf(stderr, "[ERROR] RecentErrMsg: %s\n", _recent); } rc = 1; goto cleanup; } } while (0) - -void LaunchPintlvB32Nontrivial(uint32_t *v1, void *stream); - -int main() { - size_t elemCount_v1 = 32; - size_t fileSize_v1 = elemCount_v1 * sizeof(uint32_t); - uint32_t *v1Host = nullptr; - uint32_t *v1Device = nullptr; - - int rc = 0; - bool aclInited = false; - bool deviceSet = false; - int deviceId = 0; - aclrtStream stream = nullptr; - - ACL_CHECK(aclInit(nullptr)); - aclInited = true; - if (const char *envDevice = std::getenv("ACL_DEVICE_ID")) { - deviceId = std::atoi(envDevice); - } - ACL_CHECK(aclrtSetDevice(deviceId)); - deviceSet = true; - ACL_CHECK(aclrtCreateStream(&stream)); - - ACL_CHECK(aclrtMallocHost((void **)(&v1Host), fileSize_v1)); - ACL_CHECK(aclrtMalloc((void **)&v1Device, fileSize_v1, ACL_MEM_MALLOC_HUGE_FIRST)); - - ReadFile("./v1.bin", fileSize_v1, v1Host, fileSize_v1); - ACL_CHECK(aclrtMemcpy(v1Device, fileSize_v1, v1Host, fileSize_v1, ACL_MEMCPY_HOST_TO_DEVICE)); - LaunchPintlvB32Nontrivial(v1Device, stream); - - ACL_CHECK(aclrtSynchronizeStream(stream)); - ACL_CHECK(aclrtMemcpy(v1Host, fileSize_v1, v1Device, fileSize_v1, ACL_MEMCPY_DEVICE_TO_HOST)); - WriteFile("./v1.bin", v1Host, fileSize_v1); - -cleanup: - aclrtFree(v1Device); - aclrtFreeHost(v1Host); - if (stream != nullptr) { - const aclError _ret = aclrtDestroyStream(stream); - if (_ret != ACL_SUCCESS) { - std::fprintf(stderr, 
"[ERROR] %s failed: %d (%s:%d)\n", - "aclrtDestroyStream(stream)", (int)_ret, __FILE__, __LINE__); - } - stream = nullptr; - } - if (deviceSet) { - const aclError _ret = aclrtResetDevice(deviceId); - if (_ret != ACL_SUCCESS) { - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", - "aclrtResetDevice(deviceId)", (int)_ret, __FILE__, __LINE__); - } - } - if (aclInited) { - const aclError _ret = aclFinalize(); - if (_ret != ACL_SUCCESS) { - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", - "aclFinalize()", (int)_ret, __FILE__, __LINE__); - } - } - - return rc; -} diff --git a/test/vpto/cases/micro-op/materialization-predicate/pintlv_b32/compare.py b/test/vpto/cases/micro-op/materialization-predicate/pintlv_b32/compare.py deleted file mode 100755 index b3050eb69..000000000 --- a/test/vpto/cases/micro-op/materialization-predicate/pintlv_b32/compare.py +++ /dev/null @@ -1,64 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) 2026 Huawei Technologies Co., Ltd. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. 
- -# case: micro-op/materialization-predicate/pintlv_b32 -# family: materialization-predicate -# target_ops: pto.pintlv_b32 -# scenarios: predicate-transform, lane-order -# coding=utf-8 - -import os -import sys - -import numpy as np - - -EXPECTED_WORDS = 32 -PREFIX_WORDS = 16 - - -def compare_words(golden_path, output_path): - if not os.path.exists(output_path) or not os.path.exists(golden_path): - print(f"[ERROR] Missing file: golden={golden_path} out={output_path}") - return False - golden = np.fromfile(golden_path, dtype=np.uint32) - output = np.fromfile(output_path, dtype=np.uint32) - if golden.size != EXPECTED_WORDS or output.size != EXPECTED_WORDS: - print( - f"[ERROR] Unexpected word count: golden={golden.size} " - f"out={output.size} expected={EXPECTED_WORDS}" - ) - return False - golden = golden[:PREFIX_WORDS] - output = output[:PREFIX_WORDS] - if not np.array_equal(golden, output): - diff = np.nonzero(golden != output)[0] - idx = int(diff[0]) if diff.size else 0 - print( - f"[ERROR] Mismatch (packed predicate words): idx={idx} " - f"golden={int(golden[idx])} out={int(output[idx])}" - ) - return False - return True - - -def main(): - strict = os.getenv("COMPARE_STRICT", "1") != "0" - ok = compare_words("golden_v1.bin", "v1.bin") - if not ok: - if strict: - print("[ERROR] compare failed") - sys.exit(2) - print("[WARN] compare failed (non-gating)") - return - print("[INFO] compare passed") - - -if __name__ == "__main__": - main() diff --git a/test/vpto/cases/micro-op/materialization-predicate/pintlv_b32/golden.py b/test/vpto/cases/micro-op/materialization-predicate/pintlv_b32/golden.py deleted file mode 100755 index 67cb39fc8..000000000 --- a/test/vpto/cases/micro-op/materialization-predicate/pintlv_b32/golden.py +++ /dev/null @@ -1,49 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) 2026 Huawei Technologies Co., Ltd. 
-# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. - -# case: micro-op/materialization-predicate/pintlv_b32 -# family: materialization-predicate -# target_ops: pto.pintlv_b32 -# scenarios: predicate-transform, lane-order -# coding=utf-8 - -import argparse -from pathlib import Path - -import numpy as np - - -SEED = 19 -OUTPUT_WORDS = 32 -GOLDEN_PREFIX_WORDS = np.array([16843009, 16843009, 16843009, 16843009, 16843009, 16843009, 16843009, 16843009, 16843009, 16843009, 16843009, 16843009, 16843009, 16843009, 16843009, 16843009], dtype=np.uint32) - - -def generate(output_dir: Path, seed: int) -> None: - del seed - output_init = np.zeros((OUTPUT_WORDS,), dtype=np.uint32) - golden = np.zeros((OUTPUT_WORDS,), dtype=np.uint32) - golden[: GOLDEN_PREFIX_WORDS.size] = GOLDEN_PREFIX_WORDS - - output_dir.mkdir(parents=True, exist_ok=True) - output_init.tofile(output_dir / "v1.bin") - golden.tofile(output_dir / "golden_v1.bin") - - -def main() -> None: - parser = argparse.ArgumentParser( - description="Generate packed predicate golden for VPTO micro-op validation." 
- ) - parser.add_argument("--output-dir", type=Path, default=Path(".")) - parser.add_argument("--seed", type=int, default=SEED) - args = parser.parse_args() - generate(args.output_dir, args.seed) - - -if __name__ == "__main__": - main() diff --git a/test/vpto/cases/micro-op/materialization-predicate/pintlv_b32/kernel.pto b/test/vpto/cases/micro-op/materialization-predicate/pintlv_b32/kernel.pto deleted file mode 100644 index d2c3d75ee..000000000 --- a/test/vpto/cases/micro-op/materialization-predicate/pintlv_b32/kernel.pto +++ /dev/null @@ -1,38 +0,0 @@ -// ----------------------------------------------------------------------------- -// case: micro-op/materialization-predicate/pintlv_b32 -// family: materialization-predicate -// target_ops: pto.pintlv_b32 -// scenarios: predicate-transform, lane-order -// ----------------------------------------------------------------------------- -module attributes {pto.target_arch = "a5", pto.kernel_kind = #pto.kernel_kind} { - func.func @pintlv_b32_kernel_2d(%arg0: !pto.ptr) attributes {pto.kernel} { - %c0 = arith.constant 0 : index - %c1 = arith.constant 1 : index - %c32 = arith.constant 32 : index - %c16 = arith.constant 16 : index - %c0_i64 = arith.constant 0 : i64 - %c1_i64 = arith.constant 1 : i64 - %c64_i64 = arith.constant 64 : i64 - - %ub_out = pto.castptr %c0_i64 : i64 -> !pto.ptr - %gm_out = pto.castptr %arg0 : !pto.ptr -> !pto.ptr - - pto.vecscope { - scf.for %iter = %c0 to %c1 step %c1 { - %lhs = pto.pset_b32 "PAT_ALL" : !pto.mask - %rhs = pto.pset_b32 "PAT_ALLF" : !pto.mask - %low, %high = pto.pintlv_b32 %lhs, %rhs : !pto.mask, !pto.mask -> !pto.mask, !pto.mask - pto.psts %low, %ub_out[%c0], "NORM" : !pto.mask, !pto.ptr, index - pto.psts %high, %ub_out[%c32], "NORM" : !pto.mask, !pto.ptr, index - } - } - - pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.mte_ub_gm %ub_out, %gm_out, %c64_i64 - nburst(%c1_i64, %c64_i64, %c64_i64) - : !pto.ptr, !pto.ptr, i64, 
i64, i64, i64 - pto.barrier #pto.pipe - return - } -} diff --git a/test/vpto/cases/micro-op/materialization-predicate/pintlv_b32/launch.cpp b/test/vpto/cases/micro-op/materialization-predicate/pintlv_b32/launch.cpp deleted file mode 100644 index bb990592f..000000000 --- a/test/vpto/cases/micro-op/materialization-predicate/pintlv_b32/launch.cpp +++ /dev/null @@ -1,48 +0,0 @@ -// Copyright (c) 2026 Huawei Technologies Co., Ltd. -// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). -// Please refer to the License for details. You may not use this file except in compliance with the License. -// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -// See LICENSE in the root of the software repository for the full text of the License. 
- -// ----------------------------------------------------------------------------- -// case: micro-op/materialization-predicate/pintlv_b32 -// family: materialization-predicate -// target_ops: pto.pintlv_b32 -// scenarios: predicate-transform, lane-order -// --------------------------------------------------------------------------- -// PTOAS compatibility layer -#ifndef __VEC_SCOPE__ -#define __VEC_SCOPE__ -#endif - -#if defined(__CCE_AICORE__) && defined(__NPU_ARCH__) && (__NPU_ARCH__ == 2201) -typedef struct { unsigned char v; } hifloat8_t; -typedef struct { unsigned char v; } float8_e4m3_t; -typedef struct { unsigned char v; } float8_e5m2_t; -typedef struct { unsigned char v; } float8_e8m0_t; -typedef struct { unsigned char v; } float4_e1m2x2_t; -typedef struct { unsigned char v; } float4_e2m1x2_t; -#endif -#include -#if defined(__CCE_AICORE__) && defined(PTOAS_ENABLE_CCE_PRINT) -#include -#endif -#if !defined(__CCE_AICORE__) && !defined(TMRGSORT_HPP) -struct MrgSortExecutedNumList { - uint16_t mrgSortList0; - uint16_t mrgSortList1; - uint16_t mrgSortList2; - uint16_t mrgSortList3; -}; -#endif -#ifndef __CPU_SIM -#include "acl/acl.h" -#endif - -extern "C" __global__ [aicore] void pintlv_b32_kernel_2d(__gm__ uint32_t *v1); - -void LaunchPintlvB32(uint32_t *v1, void *stream) { - pintlv_b32_kernel_2d<<<1, nullptr, stream>>>((__gm__ uint32_t *)v1); -} diff --git a/test/vpto/cases/micro-op/materialization-predicate/pintlv_b32/main.cpp b/test/vpto/cases/micro-op/materialization-predicate/pintlv_b32/main.cpp deleted file mode 100644 index d0ef0696d..000000000 --- a/test/vpto/cases/micro-op/materialization-predicate/pintlv_b32/main.cpp +++ /dev/null @@ -1,104 +0,0 @@ -// Copyright (c) 2026 Huawei Technologies Co., Ltd. -// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). -// Please refer to the License for details. 
You may not use this file except in compliance with the License. -// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -// See LICENSE in the root of the software repository for the full text of the License. - -// ----------------------------------------------------------------------------- -// case: micro-op/materialization-predicate/pintlv_b32 -// family: materialization-predicate -// target_ops: pto.pintlv_b32 -// scenarios: predicate-transform, lane-order -/** -Copyright (c) 2025 Huawei Technologies Co., Ltd. -This program is free software, you can redistribute it and/or modify it under the terms and conditions of -CANN Open Software License Agreement Version 2.0 (the "License"). -Please refer to the License for details. You may not use this file except in compliance with the License. -THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -See LICENSE in the root of the software repository for the full text of the License. 
-*/ - -#include "test_common.h" -#include "acl/acl.h" -#include -#include -#include - -using namespace PtoTestCommon; - -#ifndef TMRGSORT_HPP -struct MrgSortExecutedNumList { - uint16_t mrgSortList0; - uint16_t mrgSortList1; - uint16_t mrgSortList2; - uint16_t mrgSortList3; -}; -#endif - -#define ACL_CHECK(expr) do { const aclError _ret = (expr); if (_ret != ACL_SUCCESS) { std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", #expr, (int)_ret, __FILE__, __LINE__); const char *_recent = aclGetRecentErrMsg(); if (_recent != nullptr && _recent[0] != '\0') { std::fprintf(stderr, "[ERROR] RecentErrMsg: %s\n", _recent); } rc = 1; goto cleanup; } } while (0) - -void LaunchPintlvB32(uint32_t *v1, void *stream); - -int main() { - size_t elemCount_v1 = 32; - size_t fileSize_v1 = elemCount_v1 * sizeof(uint32_t); - uint32_t *v1Host = nullptr; - uint32_t *v1Device = nullptr; - - int rc = 0; - bool aclInited = false; - bool deviceSet = false; - int deviceId = 0; - aclrtStream stream = nullptr; - - ACL_CHECK(aclInit(nullptr)); - aclInited = true; - if (const char *envDevice = std::getenv("ACL_DEVICE_ID")) { - deviceId = std::atoi(envDevice); - } - ACL_CHECK(aclrtSetDevice(deviceId)); - deviceSet = true; - ACL_CHECK(aclrtCreateStream(&stream)); - - ACL_CHECK(aclrtMallocHost((void **)(&v1Host), fileSize_v1)); - ACL_CHECK(aclrtMalloc((void **)&v1Device, fileSize_v1, ACL_MEM_MALLOC_HUGE_FIRST)); - - ReadFile("./v1.bin", fileSize_v1, v1Host, fileSize_v1); - ACL_CHECK(aclrtMemcpy(v1Device, fileSize_v1, v1Host, fileSize_v1, ACL_MEMCPY_HOST_TO_DEVICE)); - LaunchPintlvB32(v1Device, stream); - - ACL_CHECK(aclrtSynchronizeStream(stream)); - ACL_CHECK(aclrtMemcpy(v1Host, fileSize_v1, v1Device, fileSize_v1, ACL_MEMCPY_DEVICE_TO_HOST)); - WriteFile("./v1.bin", v1Host, fileSize_v1); - -cleanup: - aclrtFree(v1Device); - aclrtFreeHost(v1Host); - if (stream != nullptr) { - const aclError _ret = aclrtDestroyStream(stream); - if (_ret != ACL_SUCCESS) { - std::fprintf(stderr, "[ERROR] %s failed: %d 
(%s:%d)\n", - "aclrtDestroyStream(stream)", (int)_ret, __FILE__, __LINE__); - } - stream = nullptr; - } - if (deviceSet) { - const aclError _ret = aclrtResetDevice(deviceId); - if (_ret != ACL_SUCCESS) { - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", - "aclrtResetDevice(deviceId)", (int)_ret, __FILE__, __LINE__); - } - } - if (aclInited) { - const aclError _ret = aclFinalize(); - if (_ret != ACL_SUCCESS) { - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", - "aclFinalize()", (int)_ret, __FILE__, __LINE__); - } - } - - return rc; -} diff --git a/test/vpto/cases/micro-op/materialization-predicate/pintlv_b8-nontrivial/compare.py b/test/vpto/cases/micro-op/materialization-predicate/pintlv_b8-nontrivial/compare.py deleted file mode 100755 index 16b0c224d..000000000 --- a/test/vpto/cases/micro-op/materialization-predicate/pintlv_b8-nontrivial/compare.py +++ /dev/null @@ -1,64 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) 2026 Huawei Technologies Co., Ltd. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. 
- -# case: micro-op/materialization-predicate/pintlv_b8-nontrivial -# family: materialization-predicate -# target_ops: pto.pintlv_b8 -# scenarios: predicate-transform, lane-order, representative-logical-elements -# coding=utf-8 - -import os -import sys - -import numpy as np - - -EXPECTED_WORDS = 32 -PREFIX_WORDS = 16 - - -def compare_words(golden_path, output_path): - if not os.path.exists(output_path) or not os.path.exists(golden_path): - print(f"[ERROR] Missing file: golden={golden_path} out={output_path}") - return False - golden = np.fromfile(golden_path, dtype=np.uint32) - output = np.fromfile(output_path, dtype=np.uint32) - if golden.size != EXPECTED_WORDS or output.size != EXPECTED_WORDS: - print( - f"[ERROR] Unexpected word count: golden={golden.size} " - f"out={output.size} expected={EXPECTED_WORDS}" - ) - return False - golden = golden[:PREFIX_WORDS] - output = output[:PREFIX_WORDS] - if not np.array_equal(golden, output): - diff = np.nonzero(golden != output)[0] - idx = int(diff[0]) if diff.size else 0 - print( - f"[ERROR] Mismatch (packed predicate words): idx={idx} " - f"golden={int(golden[idx])} out={int(output[idx])}" - ) - return False - return True - - -def main(): - strict = os.getenv("COMPARE_STRICT", "1") != "0" - ok = compare_words("golden_v1.bin", "v1.bin") - if not ok: - if strict: - print("[ERROR] compare failed") - sys.exit(2) - print("[WARN] compare failed (non-gating)") - return - print("[INFO] compare passed") - - -if __name__ == "__main__": - main() diff --git a/test/vpto/cases/micro-op/materialization-predicate/pintlv_b8-nontrivial/golden.py b/test/vpto/cases/micro-op/materialization-predicate/pintlv_b8-nontrivial/golden.py deleted file mode 100755 index de8ae6216..000000000 --- a/test/vpto/cases/micro-op/materialization-predicate/pintlv_b8-nontrivial/golden.py +++ /dev/null @@ -1,49 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) 2026 Huawei Technologies Co., Ltd. 
-# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. - -# case: micro-op/materialization-predicate/pintlv_b8-nontrivial -# family: materialization-predicate -# target_ops: pto.pintlv_b8 -# scenarios: predicate-transform, lane-order, representative-logical-elements -# coding=utf-8 - -import argparse -from pathlib import Path - -import numpy as np - - -SEED = 19 -OUTPUT_WORDS = 32 -GOLDEN_PREFIX_WORDS = np.array([33707863, 33686018, 33686018, 33686018, 33686018, 33686018, 33686018, 33686018, 33686018, 33686018, 33686018, 33686018, 33686018, 33686018, 33686018, 33686018], dtype=np.uint32) - - -def generate(output_dir: Path, seed: int) -> None: - del seed - output_init = np.zeros((OUTPUT_WORDS,), dtype=np.uint32) - golden = np.zeros((OUTPUT_WORDS,), dtype=np.uint32) - golden[: GOLDEN_PREFIX_WORDS.size] = GOLDEN_PREFIX_WORDS - - output_dir.mkdir(parents=True, exist_ok=True) - output_init.tofile(output_dir / "v1.bin") - golden.tofile(output_dir / "golden_v1.bin") - - -def main() -> None: - parser = argparse.ArgumentParser( - description="Generate packed predicate golden for VPTO micro-op validation." 
- ) - parser.add_argument("--output-dir", type=Path, default=Path(".")) - parser.add_argument("--seed", type=int, default=SEED) - args = parser.parse_args() - generate(args.output_dir, args.seed) - - -if __name__ == "__main__": - main() diff --git a/test/vpto/cases/micro-op/materialization-predicate/pintlv_b8-nontrivial/kernel.pto b/test/vpto/cases/micro-op/materialization-predicate/pintlv_b8-nontrivial/kernel.pto deleted file mode 100644 index 1786577ad..000000000 --- a/test/vpto/cases/micro-op/materialization-predicate/pintlv_b8-nontrivial/kernel.pto +++ /dev/null @@ -1,38 +0,0 @@ -// ----------------------------------------------------------------------------- -// case: micro-op/materialization-predicate/pintlv_b8-nontrivial -// family: materialization-predicate -// target_ops: pto.pintlv_b8 -// scenarios: predicate-transform, lane-order, nontrivial-pattern -// ----------------------------------------------------------------------------- -module attributes {pto.target_arch = "a5", pto.kernel_kind = #pto.kernel_kind} { - func.func @pintlv_b8_nontrivial_kernel_2d(%arg0: !pto.ptr) attributes {pto.kernel} { - %c0 = arith.constant 0 : index - %c1 = arith.constant 1 : index - %c32 = arith.constant 32 : index - %c16 = arith.constant 16 : index - %c0_i64 = arith.constant 0 : i64 - %c1_i64 = arith.constant 1 : i64 - %c64_i64 = arith.constant 64 : i64 - - %ub_out = pto.castptr %c0_i64 : i64 -> !pto.ptr - %gm_out = pto.castptr %arg0 : !pto.ptr -> !pto.ptr - - pto.vecscope { - scf.for %iter = %c0 to %c1 step %c1 { - %lhs = pto.pset_b8 "PAT_VL8" : !pto.mask - %rhs = pto.pset_b8 "PAT_M4" : !pto.mask - %low, %high = pto.pintlv_b8 %lhs, %rhs : !pto.mask, !pto.mask -> !pto.mask, !pto.mask - pto.psts %low, %ub_out[%c0], "NORM" : !pto.mask, !pto.ptr, index - pto.psts %high, %ub_out[%c32], "NORM" : !pto.mask, !pto.ptr, index - } - } - - pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.mte_ub_gm %ub_out, %gm_out, %c64_i64 - 
nburst(%c1_i64, %c64_i64, %c64_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64 - pto.barrier #pto.pipe - return - } -} diff --git a/test/vpto/cases/micro-op/materialization-predicate/pintlv_b8-nontrivial/launch.cpp b/test/vpto/cases/micro-op/materialization-predicate/pintlv_b8-nontrivial/launch.cpp deleted file mode 100644 index c466ef9c8..000000000 --- a/test/vpto/cases/micro-op/materialization-predicate/pintlv_b8-nontrivial/launch.cpp +++ /dev/null @@ -1,48 +0,0 @@ -// Copyright (c) 2026 Huawei Technologies Co., Ltd. -// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). -// Please refer to the License for details. You may not use this file except in compliance with the License. -// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -// See LICENSE in the root of the software repository for the full text of the License. 
- -// ----------------------------------------------------------------------------- -// case: micro-op/materialization-predicate/pintlv_b8-nontrivial -// family: materialization-predicate -// target_ops: pto.pintlv_b8 -// scenarios: predicate-transform, lane-order, nontrivial-pattern -// --------------------------------------------------------------------------- -// PTOAS compatibility layer -#ifndef __VEC_SCOPE__ -#define __VEC_SCOPE__ -#endif - -#if defined(__CCE_AICORE__) && defined(__NPU_ARCH__) && (__NPU_ARCH__ == 2201) -typedef struct { unsigned char v; } hifloat8_t; -typedef struct { unsigned char v; } float8_e4m3_t; -typedef struct { unsigned char v; } float8_e5m2_t; -typedef struct { unsigned char v; } float8_e8m0_t; -typedef struct { unsigned char v; } float4_e1m2x2_t; -typedef struct { unsigned char v; } float4_e2m1x2_t; -#endif -#include -#if defined(__CCE_AICORE__) && defined(PTOAS_ENABLE_CCE_PRINT) -#include -#endif -#if !defined(__CCE_AICORE__) && !defined(TMRGSORT_HPP) -struct MrgSortExecutedNumList { - uint16_t mrgSortList0; - uint16_t mrgSortList1; - uint16_t mrgSortList2; - uint16_t mrgSortList3; -}; -#endif -#ifndef __CPU_SIM -#include "acl/acl.h" -#endif - -extern "C" __global__ [aicore] void pintlv_b8_nontrivial_kernel_2d(__gm__ uint32_t *v1); - -void LaunchPintlvB8Nontrivial(uint32_t *v1, void *stream) { - pintlv_b8_nontrivial_kernel_2d<<<1, nullptr, stream>>>((__gm__ uint32_t *)v1); -} diff --git a/test/vpto/cases/micro-op/materialization-predicate/pintlv_b8-nontrivial/main.cpp b/test/vpto/cases/micro-op/materialization-predicate/pintlv_b8-nontrivial/main.cpp deleted file mode 100644 index d27a5f08c..000000000 --- a/test/vpto/cases/micro-op/materialization-predicate/pintlv_b8-nontrivial/main.cpp +++ /dev/null @@ -1,104 +0,0 @@ -// Copyright (c) 2026 Huawei Technologies Co., Ltd. 
-// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). -// Please refer to the License for details. You may not use this file except in compliance with the License. -// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -// See LICENSE in the root of the software repository for the full text of the License. - -// ----------------------------------------------------------------------------- -// case: micro-op/materialization-predicate/pintlv_b8-nontrivial -// family: materialization-predicate -// target_ops: pto.pintlv_b8 -// scenarios: predicate-transform, lane-order, nontrivial-pattern -/** -Copyright (c) 2025 Huawei Technologies Co., Ltd. -This program is free software, you can redistribute it and/or modify it under the terms and conditions of -CANN Open Software License Agreement Version 2.0 (the "License"). -Please refer to the License for details. You may not use this file except in compliance with the License. -THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -See LICENSE in the root of the software repository for the full text of the License. 
-*/ - -#include "test_common.h" -#include "acl/acl.h" -#include -#include -#include - -using namespace PtoTestCommon; - -#ifndef TMRGSORT_HPP -struct MrgSortExecutedNumList { - uint16_t mrgSortList0; - uint16_t mrgSortList1; - uint16_t mrgSortList2; - uint16_t mrgSortList3; -}; -#endif - -#define ACL_CHECK(expr) do { const aclError _ret = (expr); if (_ret != ACL_SUCCESS) { std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", #expr, (int)_ret, __FILE__, __LINE__); const char *_recent = aclGetRecentErrMsg(); if (_recent != nullptr && _recent[0] != '\0') { std::fprintf(stderr, "[ERROR] RecentErrMsg: %s\n", _recent); } rc = 1; goto cleanup; } } while (0) - -void LaunchPintlvB8Nontrivial(uint32_t *v1, void *stream); - -int main() { - size_t elemCount_v1 = 32; - size_t fileSize_v1 = elemCount_v1 * sizeof(uint32_t); - uint32_t *v1Host = nullptr; - uint32_t *v1Device = nullptr; - - int rc = 0; - bool aclInited = false; - bool deviceSet = false; - int deviceId = 0; - aclrtStream stream = nullptr; - - ACL_CHECK(aclInit(nullptr)); - aclInited = true; - if (const char *envDevice = std::getenv("ACL_DEVICE_ID")) { - deviceId = std::atoi(envDevice); - } - ACL_CHECK(aclrtSetDevice(deviceId)); - deviceSet = true; - ACL_CHECK(aclrtCreateStream(&stream)); - - ACL_CHECK(aclrtMallocHost((void **)(&v1Host), fileSize_v1)); - ACL_CHECK(aclrtMalloc((void **)&v1Device, fileSize_v1, ACL_MEM_MALLOC_HUGE_FIRST)); - - ReadFile("./v1.bin", fileSize_v1, v1Host, fileSize_v1); - ACL_CHECK(aclrtMemcpy(v1Device, fileSize_v1, v1Host, fileSize_v1, ACL_MEMCPY_HOST_TO_DEVICE)); - LaunchPintlvB8Nontrivial(v1Device, stream); - - ACL_CHECK(aclrtSynchronizeStream(stream)); - ACL_CHECK(aclrtMemcpy(v1Host, fileSize_v1, v1Device, fileSize_v1, ACL_MEMCPY_DEVICE_TO_HOST)); - WriteFile("./v1.bin", v1Host, fileSize_v1); - -cleanup: - aclrtFree(v1Device); - aclrtFreeHost(v1Host); - if (stream != nullptr) { - const aclError _ret = aclrtDestroyStream(stream); - if (_ret != ACL_SUCCESS) { - std::fprintf(stderr, 
"[ERROR] %s failed: %d (%s:%d)\n", - "aclrtDestroyStream(stream)", (int)_ret, __FILE__, __LINE__); - } - stream = nullptr; - } - if (deviceSet) { - const aclError _ret = aclrtResetDevice(deviceId); - if (_ret != ACL_SUCCESS) { - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", - "aclrtResetDevice(deviceId)", (int)_ret, __FILE__, __LINE__); - } - } - if (aclInited) { - const aclError _ret = aclFinalize(); - if (_ret != ACL_SUCCESS) { - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", - "aclFinalize()", (int)_ret, __FILE__, __LINE__); - } - } - - return rc; -} diff --git a/test/vpto/cases/micro-op/materialization-predicate/pintlv_b8/compare.py b/test/vpto/cases/micro-op/materialization-predicate/pintlv_b8/compare.py deleted file mode 100755 index d6cae1168..000000000 --- a/test/vpto/cases/micro-op/materialization-predicate/pintlv_b8/compare.py +++ /dev/null @@ -1,64 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) 2026 Huawei Technologies Co., Ltd. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. 
- -# case: micro-op/materialization-predicate/pintlv_b8 -# family: materialization-predicate -# target_ops: pto.pintlv_b8 -# scenarios: predicate-transform, lane-order -# coding=utf-8 - -import os -import sys - -import numpy as np - - -EXPECTED_WORDS = 32 -PREFIX_WORDS = 16 - - -def compare_words(golden_path, output_path): - if not os.path.exists(output_path) or not os.path.exists(golden_path): - print(f"[ERROR] Missing file: golden={golden_path} out={output_path}") - return False - golden = np.fromfile(golden_path, dtype=np.uint32) - output = np.fromfile(output_path, dtype=np.uint32) - if golden.size != EXPECTED_WORDS or output.size != EXPECTED_WORDS: - print( - f"[ERROR] Unexpected word count: golden={golden.size} " - f"out={output.size} expected={EXPECTED_WORDS}" - ) - return False - golden = golden[:PREFIX_WORDS] - output = output[:PREFIX_WORDS] - if not np.array_equal(golden, output): - diff = np.nonzero(golden != output)[0] - idx = int(diff[0]) if diff.size else 0 - print( - f"[ERROR] Mismatch (packed predicate words): idx={idx} " - f"golden={int(golden[idx])} out={int(output[idx])}" - ) - return False - return True - - -def main(): - strict = os.getenv("COMPARE_STRICT", "1") != "0" - ok = compare_words("golden_v1.bin", "v1.bin") - if not ok: - if strict: - print("[ERROR] compare failed") - sys.exit(2) - print("[WARN] compare failed (non-gating)") - return - print("[INFO] compare passed") - - -if __name__ == "__main__": - main() diff --git a/test/vpto/cases/micro-op/materialization-predicate/pintlv_b8/golden.py b/test/vpto/cases/micro-op/materialization-predicate/pintlv_b8/golden.py deleted file mode 100755 index bae1a196c..000000000 --- a/test/vpto/cases/micro-op/materialization-predicate/pintlv_b8/golden.py +++ /dev/null @@ -1,49 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) 2026 Huawei Technologies Co., Ltd. 
-# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. - -# case: micro-op/materialization-predicate/pintlv_b8 -# family: materialization-predicate -# target_ops: pto.pintlv_b8 -# scenarios: predicate-transform, lane-order -# coding=utf-8 - -import argparse -from pathlib import Path - -import numpy as np - - -SEED = 19 -OUTPUT_WORDS = 32 -GOLDEN_PREFIX_WORDS = np.array([1431655765, 1431655765, 1431655765, 1431655765, 1431655765, 1431655765, 1431655765, 1431655765, 1431655765, 1431655765, 1431655765, 1431655765, 1431655765, 1431655765, 1431655765, 1431655765], dtype=np.uint32) - - -def generate(output_dir: Path, seed: int) -> None: - del seed - output_init = np.zeros((OUTPUT_WORDS,), dtype=np.uint32) - golden = np.zeros((OUTPUT_WORDS,), dtype=np.uint32) - golden[: GOLDEN_PREFIX_WORDS.size] = GOLDEN_PREFIX_WORDS - - output_dir.mkdir(parents=True, exist_ok=True) - output_init.tofile(output_dir / "v1.bin") - golden.tofile(output_dir / "golden_v1.bin") - - -def main() -> None: - parser = argparse.ArgumentParser( - description="Generate packed predicate golden for VPTO micro-op validation." 
- ) - parser.add_argument("--output-dir", type=Path, default=Path(".")) - parser.add_argument("--seed", type=int, default=SEED) - args = parser.parse_args() - generate(args.output_dir, args.seed) - - -if __name__ == "__main__": - main() diff --git a/test/vpto/cases/micro-op/materialization-predicate/pintlv_b8/kernel.pto b/test/vpto/cases/micro-op/materialization-predicate/pintlv_b8/kernel.pto deleted file mode 100644 index 863cffee4..000000000 --- a/test/vpto/cases/micro-op/materialization-predicate/pintlv_b8/kernel.pto +++ /dev/null @@ -1,38 +0,0 @@ -// ----------------------------------------------------------------------------- -// case: micro-op/materialization-predicate/pintlv_b8 -// family: materialization-predicate -// target_ops: pto.pintlv_b8 -// scenarios: predicate-transform, lane-order -// ----------------------------------------------------------------------------- -module attributes {pto.target_arch = "a5", pto.kernel_kind = #pto.kernel_kind} { - func.func @pintlv_b8_kernel_2d(%arg0: !pto.ptr) attributes {pto.kernel} { - %c0 = arith.constant 0 : index - %c1 = arith.constant 1 : index - %c32 = arith.constant 32 : index - %c16 = arith.constant 16 : index - %c0_i64 = arith.constant 0 : i64 - %c1_i64 = arith.constant 1 : i64 - %c64_i64 = arith.constant 64 : i64 - - %ub_out = pto.castptr %c0_i64 : i64 -> !pto.ptr - %gm_out = pto.castptr %arg0 : !pto.ptr -> !pto.ptr - - pto.vecscope { - scf.for %iter = %c0 to %c1 step %c1 { - %lhs = pto.pset_b8 "PAT_ALL" : !pto.mask - %rhs = pto.pset_b8 "PAT_ALLF" : !pto.mask - %low, %high = pto.pintlv_b8 %lhs, %rhs : !pto.mask, !pto.mask -> !pto.mask, !pto.mask - pto.psts %low, %ub_out[%c0], "NORM" : !pto.mask, !pto.ptr, index - pto.psts %high, %ub_out[%c32], "NORM" : !pto.mask, !pto.ptr, index - } - } - - pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.mte_ub_gm %ub_out, %gm_out, %c64_i64 - nburst(%c1_i64, %c64_i64, %c64_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, 
i64 - pto.barrier #pto.pipe - return - } -} diff --git a/test/vpto/cases/micro-op/materialization-predicate/pintlv_b8/launch.cpp b/test/vpto/cases/micro-op/materialization-predicate/pintlv_b8/launch.cpp deleted file mode 100644 index d0299e575..000000000 --- a/test/vpto/cases/micro-op/materialization-predicate/pintlv_b8/launch.cpp +++ /dev/null @@ -1,48 +0,0 @@ -// Copyright (c) 2026 Huawei Technologies Co., Ltd. -// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). -// Please refer to the License for details. You may not use this file except in compliance with the License. -// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -// See LICENSE in the root of the software repository for the full text of the License. 
- -// ----------------------------------------------------------------------------- -// case: micro-op/materialization-predicate/pintlv_b8 -// family: materialization-predicate -// target_ops: pto.pintlv_b8 -// scenarios: predicate-transform, lane-order -// --------------------------------------------------------------------------- -// PTOAS compatibility layer -#ifndef __VEC_SCOPE__ -#define __VEC_SCOPE__ -#endif - -#if defined(__CCE_AICORE__) && defined(__NPU_ARCH__) && (__NPU_ARCH__ == 2201) -typedef struct { unsigned char v; } hifloat8_t; -typedef struct { unsigned char v; } float8_e4m3_t; -typedef struct { unsigned char v; } float8_e5m2_t; -typedef struct { unsigned char v; } float8_e8m0_t; -typedef struct { unsigned char v; } float4_e1m2x2_t; -typedef struct { unsigned char v; } float4_e2m1x2_t; -#endif -#include -#if defined(__CCE_AICORE__) && defined(PTOAS_ENABLE_CCE_PRINT) -#include -#endif -#if !defined(__CCE_AICORE__) && !defined(TMRGSORT_HPP) -struct MrgSortExecutedNumList { - uint16_t mrgSortList0; - uint16_t mrgSortList1; - uint16_t mrgSortList2; - uint16_t mrgSortList3; -}; -#endif -#ifndef __CPU_SIM -#include "acl/acl.h" -#endif - -extern "C" __global__ [aicore] void pintlv_b8_kernel_2d(__gm__ uint32_t *v1); - -void LaunchPintlvB8(uint32_t *v1, void *stream) { - pintlv_b8_kernel_2d<<<1, nullptr, stream>>>((__gm__ uint32_t *)v1); -} diff --git a/test/vpto/cases/micro-op/materialization-predicate/pintlv_b8/main.cpp b/test/vpto/cases/micro-op/materialization-predicate/pintlv_b8/main.cpp deleted file mode 100644 index b4b856773..000000000 --- a/test/vpto/cases/micro-op/materialization-predicate/pintlv_b8/main.cpp +++ /dev/null @@ -1,104 +0,0 @@ -// Copyright (c) 2026 Huawei Technologies Co., Ltd. -// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). -// Please refer to the License for details. 
You may not use this file except in compliance with the License. -// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -// See LICENSE in the root of the software repository for the full text of the License. - -// ----------------------------------------------------------------------------- -// case: micro-op/materialization-predicate/pintlv_b8 -// family: materialization-predicate -// target_ops: pto.pintlv_b8 -// scenarios: predicate-transform, lane-order -/** -Copyright (c) 2025 Huawei Technologies Co., Ltd. -This program is free software, you can redistribute it and/or modify it under the terms and conditions of -CANN Open Software License Agreement Version 2.0 (the "License"). -Please refer to the License for details. You may not use this file except in compliance with the License. -THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -See LICENSE in the root of the software repository for the full text of the License. 
-*/ - -#include "test_common.h" -#include "acl/acl.h" -#include -#include -#include - -using namespace PtoTestCommon; - -#ifndef TMRGSORT_HPP -struct MrgSortExecutedNumList { - uint16_t mrgSortList0; - uint16_t mrgSortList1; - uint16_t mrgSortList2; - uint16_t mrgSortList3; -}; -#endif - -#define ACL_CHECK(expr) do { const aclError _ret = (expr); if (_ret != ACL_SUCCESS) { std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", #expr, (int)_ret, __FILE__, __LINE__); const char *_recent = aclGetRecentErrMsg(); if (_recent != nullptr && _recent[0] != '\0') { std::fprintf(stderr, "[ERROR] RecentErrMsg: %s\n", _recent); } rc = 1; goto cleanup; } } while (0) - -void LaunchPintlvB8(uint32_t *v1, void *stream); - -int main() { - size_t elemCount_v1 = 32; - size_t fileSize_v1 = elemCount_v1 * sizeof(uint32_t); - uint32_t *v1Host = nullptr; - uint32_t *v1Device = nullptr; - - int rc = 0; - bool aclInited = false; - bool deviceSet = false; - int deviceId = 0; - aclrtStream stream = nullptr; - - ACL_CHECK(aclInit(nullptr)); - aclInited = true; - if (const char *envDevice = std::getenv("ACL_DEVICE_ID")) { - deviceId = std::atoi(envDevice); - } - ACL_CHECK(aclrtSetDevice(deviceId)); - deviceSet = true; - ACL_CHECK(aclrtCreateStream(&stream)); - - ACL_CHECK(aclrtMallocHost((void **)(&v1Host), fileSize_v1)); - ACL_CHECK(aclrtMalloc((void **)&v1Device, fileSize_v1, ACL_MEM_MALLOC_HUGE_FIRST)); - - ReadFile("./v1.bin", fileSize_v1, v1Host, fileSize_v1); - ACL_CHECK(aclrtMemcpy(v1Device, fileSize_v1, v1Host, fileSize_v1, ACL_MEMCPY_HOST_TO_DEVICE)); - LaunchPintlvB8(v1Device, stream); - - ACL_CHECK(aclrtSynchronizeStream(stream)); - ACL_CHECK(aclrtMemcpy(v1Host, fileSize_v1, v1Device, fileSize_v1, ACL_MEMCPY_DEVICE_TO_HOST)); - WriteFile("./v1.bin", v1Host, fileSize_v1); - -cleanup: - aclrtFree(v1Device); - aclrtFreeHost(v1Host); - if (stream != nullptr) { - const aclError _ret = aclrtDestroyStream(stream); - if (_ret != ACL_SUCCESS) { - std::fprintf(stderr, "[ERROR] %s failed: %d 
(%s:%d)\n", - "aclrtDestroyStream(stream)", (int)_ret, __FILE__, __LINE__); - } - stream = nullptr; - } - if (deviceSet) { - const aclError _ret = aclrtResetDevice(deviceId); - if (_ret != ACL_SUCCESS) { - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", - "aclrtResetDevice(deviceId)", (int)_ret, __FILE__, __LINE__); - } - } - if (aclInited) { - const aclError _ret = aclFinalize(); - if (_ret != ACL_SUCCESS) { - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", - "aclFinalize()", (int)_ret, __FILE__, __LINE__); - } - } - - return rc; -} diff --git a/test/vpto/cases/micro-op/materialization-predicate/plt-tail-mask-boundary/compare.py b/test/vpto/cases/micro-op/materialization-predicate/plt-tail-mask-boundary/compare.py deleted file mode 100755 index 8eac93173..000000000 --- a/test/vpto/cases/micro-op/materialization-predicate/plt-tail-mask-boundary/compare.py +++ /dev/null @@ -1,53 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) 2026 Huawei Technologies Co., Ltd. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. 
- -# case: micro-op/materialization-predicate/plt-tail-mask-boundary -# family: materialization-predicate -# target_ops: pto.plt_b16, pto.plt_b32, pto.plt_b8 -# scenarios: tail-mask, scalar-carry-out, boundary -# coding=utf-8 - -import os -import sys -import numpy as np - -EXPECTED_WORDS = 32 - - -def compare_words(golden_path, output_path): - if not os.path.exists(output_path) or not os.path.exists(golden_path): - print(f"[ERROR] Missing file: golden={golden_path} out={output_path}") - return False - golden = np.fromfile(golden_path, dtype=np.uint32) - output = np.fromfile(output_path, dtype=np.uint32) - if golden.size != EXPECTED_WORDS or output.size != EXPECTED_WORDS: - print(f"[ERROR] Unexpected word count: golden={golden.size} out={output.size}") - return False - if not np.array_equal(golden, output): - diff = np.nonzero(golden != output)[0] - idx = int(diff[0]) if diff.size else 0 - print(f"[ERROR] Mismatch (packed predicate words): idx={idx} golden={int(golden[idx])} out={int(output[idx])}") - return False - return True - - -def main(): - strict = os.getenv("COMPARE_STRICT", "1") != "0" - ok = compare_words("golden_v1.bin", "v1.bin") - if not ok: - if strict: - print("[ERROR] compare failed") - sys.exit(2) - print("[WARN] compare failed (non-gating)") - return - print("[INFO] compare passed") - - -if __name__ == "__main__": - main() diff --git a/test/vpto/cases/micro-op/materialization-predicate/plt-tail-mask-boundary/golden.py b/test/vpto/cases/micro-op/materialization-predicate/plt-tail-mask-boundary/golden.py deleted file mode 100755 index c812554d8..000000000 --- a/test/vpto/cases/micro-op/materialization-predicate/plt-tail-mask-boundary/golden.py +++ /dev/null @@ -1,55 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) 2026 Huawei Technologies Co., Ltd. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). 
-# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. - -# case: micro-op/materialization-predicate/plt-tail-mask-boundary -# family: materialization-predicate -# target_ops: pto.plt_b16, pto.plt_b32, pto.plt_b8 -# scenarios: tail-mask, scalar-carry-out, boundary -# coding=utf-8 - -import argparse -from pathlib import Path - -import numpy as np - - -SEED = 19 -OUTPUT_WORDS = 32 - - -def _pack_prefix(active_lanes: int, bit_stride: int, store_bytes: int) -> np.ndarray: - out = np.zeros((store_bytes,), dtype=np.uint8) - for lane in range(active_lanes): - bit_index = lane * bit_stride - out[bit_index // 8] |= np.uint8(1 << (bit_index % 8)) - return out - - -def generate(output_dir: Path, seed: int) -> None: - del seed - v1 = np.zeros((OUTPUT_WORDS,), dtype=np.uint32) - golden = np.zeros((OUTPUT_WORDS * 4,), dtype=np.uint8) - golden[0:32] = _pack_prefix(active_lanes=1, bit_stride=1, store_bytes=32) - golden[32:64] = _pack_prefix(active_lanes=1, bit_stride=2, store_bytes=32) - golden[64:96] = _pack_prefix(active_lanes=1, bit_stride=4, store_bytes=32) - output_dir.mkdir(parents=True, exist_ok=True) - v1.tofile(output_dir / "v1.bin") - golden.view(np.uint32).tofile(output_dir / "golden_v1.bin") - - -def main() -> None: - parser = argparse.ArgumentParser() - parser.add_argument("--output-dir", type=Path, default=Path(".")) - parser.add_argument("--seed", type=int, default=SEED) - args = parser.parse_args() - generate(args.output_dir, args.seed) - - -if __name__ == "__main__": - main() diff --git a/test/vpto/cases/micro-op/materialization-predicate/plt-tail-mask-boundary/kernel.pto 
b/test/vpto/cases/micro-op/materialization-predicate/plt-tail-mask-boundary/kernel.pto deleted file mode 100644 index 6aff2409b..000000000 --- a/test/vpto/cases/micro-op/materialization-predicate/plt-tail-mask-boundary/kernel.pto +++ /dev/null @@ -1,40 +0,0 @@ -// ----------------------------------------------------------------------------- -// case: micro-op/materialization-predicate/plt-tail-mask-boundary -// family: materialization-predicate -// target_ops: pto.plt_b16, pto.plt_b32, pto.plt_b8 -// scenarios: tail-mask, scalar-carry-out, boundary -// ----------------------------------------------------------------------------- -module attributes {pto.target_arch = "a5", pto.kernel_kind = #pto.kernel_kind} { - func.func @plt_tail_mask_boundary_kernel_2d(%arg0: !pto.ptr) attributes {pto.kernel} { - %c0 = arith.constant 0 : index - %c1 = arith.constant 1 : index - %c32 = arith.constant 32 : index - %c64 = arith.constant 64 : index - %c0_i64 = arith.constant 0 : i64 - %c1_i64 = arith.constant 1 : i64 - %c96_i64 = arith.constant 96 : i64 - %c1_i32 = arith.constant 1 : i32 - - %ub_out = pto.castptr %c0_i64 : i64 -> !pto.ptr - %gm_out = pto.castptr %arg0 : !pto.ptr -> !pto.ptr - - pto.vecscope { - scf.for %iter = %c0 to %c1 step %c1 { - %m0, %s0 = pto.plt_b8 %c1_i32 : i32 -> !pto.mask, i32 - %m1, %s1 = pto.plt_b16 %c1_i32 : i32 -> !pto.mask, i32 - %m2, %s2 = pto.plt_b32 %c1_i32 : i32 -> !pto.mask, i32 - pto.psts %m0, %ub_out[%c0], "NORM" : !pto.mask, !pto.ptr, index - pto.psts %m1, %ub_out[%c32], "NORM" : !pto.mask, !pto.ptr, index - pto.psts %m2, %ub_out[%c64], "NORM" : !pto.mask, !pto.ptr, index - } - } - - pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.mte_ub_gm %ub_out, %gm_out, %c96_i64 - nburst(%c1_i64, %c96_i64, %c96_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64 - pto.barrier #pto.pipe - return - } -} diff --git a/test/vpto/cases/micro-op/materialization-predicate/plt-tail-mask-boundary/launch.cpp 
b/test/vpto/cases/micro-op/materialization-predicate/plt-tail-mask-boundary/launch.cpp deleted file mode 100644 index fad09577d..000000000 --- a/test/vpto/cases/micro-op/materialization-predicate/plt-tail-mask-boundary/launch.cpp +++ /dev/null @@ -1,48 +0,0 @@ -// Copyright (c) 2026 Huawei Technologies Co., Ltd. -// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). -// Please refer to the License for details. You may not use this file except in compliance with the License. -// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -// See LICENSE in the root of the software repository for the full text of the License. - -// ----------------------------------------------------------------------------- -// case: micro-op/materialization-predicate/plt-tail-mask-boundary -// family: materialization-predicate -// target_ops: pto.plt_b16, pto.plt_b32, pto.plt_b8 -// scenarios: tail-mask, scalar-carry-out, boundary -// --------------------------------------------------------------------------- -// PTOAS compatibility layer -#ifndef __VEC_SCOPE__ -#define __VEC_SCOPE__ -#endif - -#if defined(__CCE_AICORE__) && defined(__NPU_ARCH__) && (__NPU_ARCH__ == 2201) -typedef struct { unsigned char v; } hifloat8_t; -typedef struct { unsigned char v; } float8_e4m3_t; -typedef struct { unsigned char v; } float8_e5m2_t; -typedef struct { unsigned char v; } float8_e8m0_t; -typedef struct { unsigned char v; } float4_e1m2x2_t; -typedef struct { unsigned char v; } float4_e2m1x2_t; -#endif -#include -#if defined(__CCE_AICORE__) && defined(PTOAS_ENABLE_CCE_PRINT) -#include -#endif -#if !defined(__CCE_AICORE__) && !defined(TMRGSORT_HPP) -struct MrgSortExecutedNumList { - uint16_t mrgSortList0; - uint16_t 
mrgSortList1; - uint16_t mrgSortList2; - uint16_t mrgSortList3; -}; -#endif -#ifndef __CPU_SIM -#include "acl/acl.h" -#endif - -extern "C" __global__ [aicore] void plt_tail_mask_boundary_kernel_2d(__gm__ uint32_t *v1); - -void LaunchPltTailMaskBoundary(uint32_t *v1, void *stream) { - plt_tail_mask_boundary_kernel_2d<<<1, nullptr, stream>>>((__gm__ uint32_t *)v1); -} diff --git a/test/vpto/cases/micro-op/materialization-predicate/plt-tail-mask-boundary/main.cpp b/test/vpto/cases/micro-op/materialization-predicate/plt-tail-mask-boundary/main.cpp deleted file mode 100644 index 0dfe9d502..000000000 --- a/test/vpto/cases/micro-op/materialization-predicate/plt-tail-mask-boundary/main.cpp +++ /dev/null @@ -1,104 +0,0 @@ -// Copyright (c) 2026 Huawei Technologies Co., Ltd. -// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). -// Please refer to the License for details. You may not use this file except in compliance with the License. -// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -// See LICENSE in the root of the software repository for the full text of the License. - -// ----------------------------------------------------------------------------- -// case: micro-op/materialization-predicate/plt-tail-mask-boundary -// family: materialization-predicate -// target_ops: pto.plt_b16, pto.plt_b32, pto.plt_b8 -// scenarios: tail-mask, scalar-carry-out, boundary -/** -Copyright (c) 2025 Huawei Technologies Co., Ltd. -This program is free software, you can redistribute it and/or modify it under the terms and conditions of -CANN Open Software License Agreement Version 2.0 (the "License"). -Please refer to the License for details. 
You may not use this file except in compliance with the License. -THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -See LICENSE in the root of the software repository for the full text of the License. -*/ - -#include "test_common.h" -#include "acl/acl.h" -#include -#include -#include - -using namespace PtoTestCommon; - -#ifndef TMRGSORT_HPP -struct MrgSortExecutedNumList { - uint16_t mrgSortList0; - uint16_t mrgSortList1; - uint16_t mrgSortList2; - uint16_t mrgSortList3; -}; -#endif - -#define ACL_CHECK(expr) do { const aclError _ret = (expr); if (_ret != ACL_SUCCESS) { std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", #expr, (int)_ret, __FILE__, __LINE__); const char *_recent = aclGetRecentErrMsg(); if (_recent != nullptr && _recent[0] != '\0') { std::fprintf(stderr, "[ERROR] RecentErrMsg: %s\n", _recent); } rc = 1; goto cleanup; } } while (0) - -void LaunchPltTailMaskBoundary(uint32_t *v1, void *stream); - -int main() { - size_t elemCount_v1 = 32; - size_t fileSize_v1 = elemCount_v1 * sizeof(uint32_t); - uint32_t *v1Host = nullptr; - uint32_t *v1Device = nullptr; - - int rc = 0; - bool aclInited = false; - bool deviceSet = false; - int deviceId = 0; - aclrtStream stream = nullptr; - - ACL_CHECK(aclInit(nullptr)); - aclInited = true; - if (const char *envDevice = std::getenv("ACL_DEVICE_ID")) { - deviceId = std::atoi(envDevice); - } - ACL_CHECK(aclrtSetDevice(deviceId)); - deviceSet = true; - ACL_CHECK(aclrtCreateStream(&stream)); - - ACL_CHECK(aclrtMallocHost((void **)(&v1Host), fileSize_v1)); - ACL_CHECK(aclrtMalloc((void **)&v1Device, fileSize_v1, ACL_MEM_MALLOC_HUGE_FIRST)); - - ReadFile("./v1.bin", fileSize_v1, v1Host, fileSize_v1); - ACL_CHECK(aclrtMemcpy(v1Device, fileSize_v1, v1Host, fileSize_v1, ACL_MEMCPY_HOST_TO_DEVICE)); - LaunchPltTailMaskBoundary(v1Device, stream); - - 
ACL_CHECK(aclrtSynchronizeStream(stream)); - ACL_CHECK(aclrtMemcpy(v1Host, fileSize_v1, v1Device, fileSize_v1, ACL_MEMCPY_DEVICE_TO_HOST)); - WriteFile("./v1.bin", v1Host, fileSize_v1); - -cleanup: - aclrtFree(v1Device); - aclrtFreeHost(v1Host); - if (stream != nullptr) { - const aclError _ret = aclrtDestroyStream(stream); - if (_ret != ACL_SUCCESS) { - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", - "aclrtDestroyStream(stream)", (int)_ret, __FILE__, __LINE__); - } - stream = nullptr; - } - if (deviceSet) { - const aclError _ret = aclrtResetDevice(deviceId); - if (_ret != ACL_SUCCESS) { - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", - "aclrtResetDevice(deviceId)", (int)_ret, __FILE__, __LINE__); - } - } - if (aclInited) { - const aclError _ret = aclFinalize(); - if (_ret != ACL_SUCCESS) { - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", - "aclFinalize()", (int)_ret, __FILE__, __LINE__); - } - } - - return rc; -} diff --git a/test/vpto/cases/micro-op/materialization-predicate/plt-tail-mask/compare.py b/test/vpto/cases/micro-op/materialization-predicate/plt-tail-mask/compare.py deleted file mode 100755 index b2466b6ab..000000000 --- a/test/vpto/cases/micro-op/materialization-predicate/plt-tail-mask/compare.py +++ /dev/null @@ -1,54 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) 2026 Huawei Technologies Co., Ltd. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. 
- -# case: micro-op/materialization-predicate/plt-tail-mask -# family: materialization-predicate -# target_ops: pto.plt_b16, pto.plt_b32, pto.plt_b8 -# scenarios: tail-mask, scalar-carry-out -# NOTE: bulk-generated coverage skeleton. -# coding=utf-8 - -import os -import sys -import numpy as np - -EXPECTED_WORDS = 32 - - -def compare_words(golden_path, output_path): - if not os.path.exists(output_path) or not os.path.exists(golden_path): - print(f"[ERROR] Missing file: golden={golden_path} out={output_path}") - return False - golden = np.fromfile(golden_path, dtype=np.uint32) - output = np.fromfile(output_path, dtype=np.uint32) - if golden.size != EXPECTED_WORDS or output.size != EXPECTED_WORDS: - print(f"[ERROR] Unexpected word count: golden={golden.size} out={output.size}") - return False - if not np.array_equal(golden, output): - diff = np.nonzero(golden != output)[0] - idx = int(diff[0]) if diff.size else 0 - print(f"[ERROR] Mismatch (packed predicate words): idx={idx} golden={int(golden[idx])} out={int(output[idx])}") - return False - return True - - -def main(): - strict = os.getenv("COMPARE_STRICT", "1") != "0" - ok = compare_words("golden_v1.bin", "v1.bin") - if not ok: - if strict: - print("[ERROR] compare failed") - sys.exit(2) - print("[WARN] compare failed (non-gating)") - return - print("[INFO] compare passed") - - -if __name__ == "__main__": - main() diff --git a/test/vpto/cases/micro-op/materialization-predicate/plt-tail-mask/golden.py b/test/vpto/cases/micro-op/materialization-predicate/plt-tail-mask/golden.py deleted file mode 100755 index 1ef2a1a79..000000000 --- a/test/vpto/cases/micro-op/materialization-predicate/plt-tail-mask/golden.py +++ /dev/null @@ -1,59 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) 2026 Huawei Technologies Co., Ltd. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). 
-# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. - -# case: micro-op/materialization-predicate/plt-tail-mask -# family: materialization-predicate -# target_ops: pto.plt_b16, pto.plt_b32, pto.plt_b8 -# scenarios: tail-mask, scalar-carry-out -# NOTE: bulk-generated coverage skeleton. -# coding=utf-8 - -import argparse -from pathlib import Path - -import numpy as np - - -SEED = 19 -OUTPUT_WORDS = 32 - - -def _pack_prefix(active_lanes: int, bit_stride: int, store_bytes: int) -> np.ndarray: - out = np.zeros((store_bytes,), dtype=np.uint8) - for lane in range(active_lanes): - bit_index = lane * bit_stride - out[bit_index // 8] |= np.uint8(1 << (bit_index % 8)) - return out - - -def generate(output_dir: Path, seed: int) -> None: - del seed - v1 = np.zeros((OUTPUT_WORDS,), dtype=np.uint32) - golden = np.zeros((OUTPUT_WORDS * 4,), dtype=np.uint8) - golden[0:32] = _pack_prefix(active_lanes=13, bit_stride=1, store_bytes=32) - golden[32:64] = _pack_prefix(active_lanes=7, bit_stride=2, store_bytes=32) - golden[64:96] = _pack_prefix(active_lanes=3, bit_stride=4, store_bytes=32) - - output_dir.mkdir(parents=True, exist_ok=True) - v1.tofile(output_dir / "v1.bin") - golden.view(np.uint32).tofile(output_dir / "golden_v1.bin") - - -def main() -> None: - parser = argparse.ArgumentParser( - description="Generate packed predicate golden for VPTO micro-op plt-tail-mask validation." 
- ) - parser.add_argument("--output-dir", type=Path, default=Path(".")) - parser.add_argument("--seed", type=int, default=SEED) - args = parser.parse_args() - generate(args.output_dir, args.seed) - - -if __name__ == "__main__": - main() diff --git a/test/vpto/cases/micro-op/materialization-predicate/plt-tail-mask/kernel.pto b/test/vpto/cases/micro-op/materialization-predicate/plt-tail-mask/kernel.pto deleted file mode 100644 index 869d06b17..000000000 --- a/test/vpto/cases/micro-op/materialization-predicate/plt-tail-mask/kernel.pto +++ /dev/null @@ -1,42 +0,0 @@ -// ----------------------------------------------------------------------------- -// case: micro-op/materialization-predicate/plt-tail-mask -// family: materialization-predicate -// target_ops: pto.plt_b16, pto.plt_b32, pto.plt_b8 -// scenarios: tail-mask, scalar-carry-out -// ----------------------------------------------------------------------------- -module attributes {pto.target_arch = "a5", pto.kernel_kind = #pto.kernel_kind} { - func.func @plt_tail_mask_kernel_2d(%arg0: !pto.ptr) attributes {pto.kernel} { - %c0 = arith.constant 0 : index - %c1 = arith.constant 1 : index - %c32 = arith.constant 32 : index - %c64 = arith.constant 64 : index - %c0_i64 = arith.constant 0 : i64 - %c1_i64 = arith.constant 1 : i64 - %c96_i64 = arith.constant 96 : i64 - %c13 = arith.constant 13 : i32 - %c7 = arith.constant 7 : i32 - %c3 = arith.constant 3 : i32 - - %ub_out = pto.castptr %c0_i64 : i64 -> !pto.ptr - %gm_out = pto.castptr %arg0 : !pto.ptr -> !pto.ptr - - pto.vecscope { - scf.for %iter = %c0 to %c1 step %c1 { - %m0, %s0 = pto.plt_b8 %c13 : i32 -> !pto.mask, i32 - %m1, %s1 = pto.plt_b16 %c7 : i32 -> !pto.mask, i32 - %m2, %s2 = pto.plt_b32 %c3 : i32 -> !pto.mask, i32 - pto.psts %m0, %ub_out[%c0], "NORM" : !pto.mask, !pto.ptr, index - pto.psts %m1, %ub_out[%c32], "NORM" : !pto.mask, !pto.ptr, index - pto.psts %m2, %ub_out[%c64], "NORM" : !pto.mask, !pto.ptr, index - } - } - - pto.set_flag["PIPE_V", "PIPE_MTE3", 
"EVENT_ID0"] - pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.mte_ub_gm %ub_out, %gm_out, %c96_i64 - nburst(%c1_i64, %c96_i64, %c96_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64 - pto.barrier #pto.pipe - return - } -} diff --git a/test/vpto/cases/micro-op/materialization-predicate/plt-tail-mask/launch.cpp b/test/vpto/cases/micro-op/materialization-predicate/plt-tail-mask/launch.cpp deleted file mode 100644 index 1c9b21d24..000000000 --- a/test/vpto/cases/micro-op/materialization-predicate/plt-tail-mask/launch.cpp +++ /dev/null @@ -1,48 +0,0 @@ -// Copyright (c) 2026 Huawei Technologies Co., Ltd. -// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). -// Please refer to the License for details. You may not use this file except in compliance with the License. -// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -// See LICENSE in the root of the software repository for the full text of the License. 
- -// ----------------------------------------------------------------------------- -// case: micro-op/materialization-predicate/plt-tail-mask -// family: materialization-predicate -// target_ops: pto.plt_b16, pto.plt_b32, pto.plt_b8 -// scenarios: tail-mask, scalar-carry-out -// --------------------------------------------------------------------------- -// PTOAS compatibility layer -#ifndef __VEC_SCOPE__ -#define __VEC_SCOPE__ -#endif - -#if defined(__CCE_AICORE__) && defined(__NPU_ARCH__) && (__NPU_ARCH__ == 2201) -typedef struct { unsigned char v; } hifloat8_t; -typedef struct { unsigned char v; } float8_e4m3_t; -typedef struct { unsigned char v; } float8_e5m2_t; -typedef struct { unsigned char v; } float8_e8m0_t; -typedef struct { unsigned char v; } float4_e1m2x2_t; -typedef struct { unsigned char v; } float4_e2m1x2_t; -#endif -#include -#if defined(__CCE_AICORE__) && defined(PTOAS_ENABLE_CCE_PRINT) -#include -#endif -#if !defined(__CCE_AICORE__) && !defined(TMRGSORT_HPP) -struct MrgSortExecutedNumList { - uint16_t mrgSortList0; - uint16_t mrgSortList1; - uint16_t mrgSortList2; - uint16_t mrgSortList3; -}; -#endif -#ifndef __CPU_SIM -#include "acl/acl.h" -#endif - -extern "C" __global__ [aicore] void plt_tail_mask_kernel_2d(__gm__ uint32_t *v1); - -void LaunchPltTailMask(uint32_t *v1, void *stream) { - plt_tail_mask_kernel_2d<<<1, nullptr, stream>>>((__gm__ uint32_t *)v1); -} diff --git a/test/vpto/cases/micro-op/materialization-predicate/plt-tail-mask/main.cpp b/test/vpto/cases/micro-op/materialization-predicate/plt-tail-mask/main.cpp deleted file mode 100644 index 7fdb5fe93..000000000 --- a/test/vpto/cases/micro-op/materialization-predicate/plt-tail-mask/main.cpp +++ /dev/null @@ -1,104 +0,0 @@ -// Copyright (c) 2026 Huawei Technologies Co., Ltd. -// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). 
-// Please refer to the License for details. You may not use this file except in compliance with the License. -// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -// See LICENSE in the root of the software repository for the full text of the License. - -// ----------------------------------------------------------------------------- -// case: micro-op/materialization-predicate/plt-tail-mask -// family: materialization-predicate -// target_ops: pto.plt_b16, pto.plt_b32, pto.plt_b8 -// scenarios: tail-mask, scalar-carry-out -/** -Copyright (c) 2025 Huawei Technologies Co., Ltd. -This program is free software, you can redistribute it and/or modify it under the terms and conditions of -CANN Open Software License Agreement Version 2.0 (the "License"). -Please refer to the License for details. You may not use this file except in compliance with the License. -THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -See LICENSE in the root of the software repository for the full text of the License. 
-*/ - -#include "test_common.h" -#include "acl/acl.h" -#include -#include -#include - -using namespace PtoTestCommon; - -#ifndef TMRGSORT_HPP -struct MrgSortExecutedNumList { - uint16_t mrgSortList0; - uint16_t mrgSortList1; - uint16_t mrgSortList2; - uint16_t mrgSortList3; -}; -#endif - -#define ACL_CHECK(expr) do { const aclError _ret = (expr); if (_ret != ACL_SUCCESS) { std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", #expr, (int)_ret, __FILE__, __LINE__); const char *_recent = aclGetRecentErrMsg(); if (_recent != nullptr && _recent[0] != '\0') { std::fprintf(stderr, "[ERROR] RecentErrMsg: %s\n", _recent); } rc = 1; goto cleanup; } } while (0) - -void LaunchPltTailMask(uint32_t *v1, void *stream); - -int main() { - size_t elemCount_v1 = 32; - size_t fileSize_v1 = elemCount_v1 * sizeof(uint32_t); - uint32_t *v1Host = nullptr; - uint32_t *v1Device = nullptr; - - int rc = 0; - bool aclInited = false; - bool deviceSet = false; - int deviceId = 0; - aclrtStream stream = nullptr; - - ACL_CHECK(aclInit(nullptr)); - aclInited = true; - if (const char *envDevice = std::getenv("ACL_DEVICE_ID")) { - deviceId = std::atoi(envDevice); - } - ACL_CHECK(aclrtSetDevice(deviceId)); - deviceSet = true; - ACL_CHECK(aclrtCreateStream(&stream)); - - ACL_CHECK(aclrtMallocHost((void **)(&v1Host), fileSize_v1)); - ACL_CHECK(aclrtMalloc((void **)&v1Device, fileSize_v1, ACL_MEM_MALLOC_HUGE_FIRST)); - - ReadFile("./v1.bin", fileSize_v1, v1Host, fileSize_v1); - ACL_CHECK(aclrtMemcpy(v1Device, fileSize_v1, v1Host, fileSize_v1, ACL_MEMCPY_HOST_TO_DEVICE)); - LaunchPltTailMask(v1Device, stream); - - ACL_CHECK(aclrtSynchronizeStream(stream)); - ACL_CHECK(aclrtMemcpy(v1Host, fileSize_v1, v1Device, fileSize_v1, ACL_MEMCPY_DEVICE_TO_HOST)); - WriteFile("./v1.bin", v1Host, fileSize_v1); - -cleanup: - aclrtFree(v1Device); - aclrtFreeHost(v1Host); - if (stream != nullptr) { - const aclError _ret = aclrtDestroyStream(stream); - if (_ret != ACL_SUCCESS) { - std::fprintf(stderr, "[ERROR] %s 
failed: %d (%s:%d)\n", - "aclrtDestroyStream(stream)", (int)_ret, __FILE__, __LINE__); - } - stream = nullptr; - } - if (deviceSet) { - const aclError _ret = aclrtResetDevice(deviceId); - if (_ret != ACL_SUCCESS) { - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", - "aclrtResetDevice(deviceId)", (int)_ret, __FILE__, __LINE__); - } - } - if (aclInited) { - const aclError _ret = aclFinalize(); - if (_ret != ACL_SUCCESS) { - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", - "aclFinalize()", (int)_ret, __FILE__, __LINE__); - } - } - - return rc; -} diff --git a/test/vpto/cases/micro-op/materialization-predicate/pnot/compare.py b/test/vpto/cases/micro-op/materialization-predicate/pnot/compare.py deleted file mode 100755 index a75c98c84..000000000 --- a/test/vpto/cases/micro-op/materialization-predicate/pnot/compare.py +++ /dev/null @@ -1,64 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) 2026 Huawei Technologies Co., Ltd. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. 
- -# case: micro-op/materialization-predicate/pnot -# family: materialization-predicate -# target_ops: pto.pnot -# scenarios: predicate-transform, logical-not -# coding=utf-8 - -import os -import sys - -import numpy as np - - -EXPECTED_WORDS = 32 -PREFIX_WORDS = 8 - - -def compare_words(golden_path, output_path): - if not os.path.exists(output_path) or not os.path.exists(golden_path): - print(f"[ERROR] Missing file: golden={golden_path} out={output_path}") - return False - golden = np.fromfile(golden_path, dtype=np.uint32) - output = np.fromfile(output_path, dtype=np.uint32) - if golden.size != EXPECTED_WORDS or output.size != EXPECTED_WORDS: - print( - f"[ERROR] Unexpected word count: golden={golden.size} " - f"out={output.size} expected={EXPECTED_WORDS}" - ) - return False - golden = golden[:PREFIX_WORDS] - output = output[:PREFIX_WORDS] - if not np.array_equal(golden, output): - diff = np.nonzero(golden != output)[0] - idx = int(diff[0]) if diff.size else 0 - print( - f"[ERROR] Mismatch (packed predicate words): idx={idx} " - f"golden={int(golden[idx])} out={int(output[idx])}" - ) - return False - return True - - -def main(): - strict = os.getenv("COMPARE_STRICT", "1") != "0" - ok = compare_words("golden_v1.bin", "v1.bin") - if not ok: - if strict: - print("[ERROR] compare failed") - sys.exit(2) - print("[WARN] compare failed (non-gating)") - return - print("[INFO] compare passed") - - -if __name__ == "__main__": - main() diff --git a/test/vpto/cases/micro-op/materialization-predicate/pnot/golden.py b/test/vpto/cases/micro-op/materialization-predicate/pnot/golden.py deleted file mode 100755 index cafe4d7d0..000000000 --- a/test/vpto/cases/micro-op/materialization-predicate/pnot/golden.py +++ /dev/null @@ -1,49 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) 2026 Huawei Technologies Co., Ltd. 
-# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. - -# case: micro-op/materialization-predicate/pnot -# family: materialization-predicate -# target_ops: pto.pnot -# scenarios: predicate-transform, logical-not -# coding=utf-8 - -import argparse -from pathlib import Path - -import numpy as np - - -SEED = 19 -OUTPUT_WORDS = 32 -GOLDEN_PREFIX_WORDS = np.array([0, 286261248, 286331153, 286331153, 286331153, 286331153, 286331153, 286331153], dtype=np.uint32) - - -def generate(output_dir: Path, seed: int) -> None: - del seed - output_init = np.zeros((OUTPUT_WORDS,), dtype=np.uint32) - golden = np.zeros((OUTPUT_WORDS,), dtype=np.uint32) - golden[: GOLDEN_PREFIX_WORDS.size] = GOLDEN_PREFIX_WORDS - - output_dir.mkdir(parents=True, exist_ok=True) - output_init.tofile(output_dir / "v1.bin") - golden.tofile(output_dir / "golden_v1.bin") - - -def main() -> None: - parser = argparse.ArgumentParser( - description="Generate packed predicate golden for VPTO micro-op validation." 
- ) - parser.add_argument("--output-dir", type=Path, default=Path(".")) - parser.add_argument("--seed", type=int, default=SEED) - args = parser.parse_args() - generate(args.output_dir, args.seed) - - -if __name__ == "__main__": - main() diff --git a/test/vpto/cases/micro-op/materialization-predicate/pnot/kernel.pto b/test/vpto/cases/micro-op/materialization-predicate/pnot/kernel.pto deleted file mode 100644 index 02e6b942e..000000000 --- a/test/vpto/cases/micro-op/materialization-predicate/pnot/kernel.pto +++ /dev/null @@ -1,37 +0,0 @@ -// ----------------------------------------------------------------------------- -// case: micro-op/materialization-predicate/pnot -// family: materialization-predicate -// target_ops: pto.pnot -// scenarios: predicate-transform -// ----------------------------------------------------------------------------- -module attributes {pto.target_arch = "a5", pto.kernel_kind = #pto.kernel_kind} { - func.func @pnot_kernel_2d(%arg0: !pto.ptr) attributes {pto.kernel} { - %c0 = arith.constant 0 : index - %c1 = arith.constant 1 : index - %c32 = arith.constant 32 : index - %c0_i64 = arith.constant 0 : i64 - %c1_i64 = arith.constant 1 : i64 - %c32_i64 = arith.constant 32 : i64 - %c13 = arith.constant 13 : i32 - - %ub_out = pto.castptr %c0_i64 : i64 -> !pto.ptr - %gm_out = pto.castptr %arg0 : !pto.ptr -> !pto.ptr - - pto.vecscope { - scf.for %iter = %c0 to %c1 step %c1 { - %all = pto.pset_b32 "PAT_ALL" : !pto.mask - %half, %next = pto.plt_b32 %c13 : i32 -> !pto.mask, i32 - %out = pto.pnot %half, %all : !pto.mask, !pto.mask -> !pto.mask - pto.psts %out, %ub_out[%c0], "NORM" : !pto.mask, !pto.ptr, index - } - } - - pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.mte_ub_gm %ub_out, %gm_out, %c32_i64 - nburst(%c1_i64, %c32_i64, %c32_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64 - pto.barrier #pto.pipe - return - } -} diff --git 
a/test/vpto/cases/micro-op/materialization-predicate/pnot/launch.cpp b/test/vpto/cases/micro-op/materialization-predicate/pnot/launch.cpp deleted file mode 100644 index 50cf29220..000000000 --- a/test/vpto/cases/micro-op/materialization-predicate/pnot/launch.cpp +++ /dev/null @@ -1,48 +0,0 @@ -// Copyright (c) 2026 Huawei Technologies Co., Ltd. -// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). -// Please refer to the License for details. You may not use this file except in compliance with the License. -// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -// See LICENSE in the root of the software repository for the full text of the License. - -// ----------------------------------------------------------------------------- -// case: micro-op/materialization-predicate/pnot -// family: materialization-predicate -// target_ops: pto.pnot -// scenarios: predicate-transform -// --------------------------------------------------------------------------- -// PTOAS compatibility layer -#ifndef __VEC_SCOPE__ -#define __VEC_SCOPE__ -#endif - -#if defined(__CCE_AICORE__) && defined(__NPU_ARCH__) && (__NPU_ARCH__ == 2201) -typedef struct { unsigned char v; } hifloat8_t; -typedef struct { unsigned char v; } float8_e4m3_t; -typedef struct { unsigned char v; } float8_e5m2_t; -typedef struct { unsigned char v; } float8_e8m0_t; -typedef struct { unsigned char v; } float4_e1m2x2_t; -typedef struct { unsigned char v; } float4_e2m1x2_t; -#endif -#include -#if defined(__CCE_AICORE__) && defined(PTOAS_ENABLE_CCE_PRINT) -#include -#endif -#if !defined(__CCE_AICORE__) && !defined(TMRGSORT_HPP) -struct MrgSortExecutedNumList { - uint16_t mrgSortList0; - uint16_t mrgSortList1; - uint16_t mrgSortList2; - 
uint16_t mrgSortList3; -}; -#endif -#ifndef __CPU_SIM -#include "acl/acl.h" -#endif - -extern "C" __global__ [aicore] void pnot_kernel_2d(__gm__ uint32_t *v1); - -void LaunchPnot(uint32_t *v1, void *stream) { - pnot_kernel_2d<<<1, nullptr, stream>>>((__gm__ uint32_t *)v1); -} diff --git a/test/vpto/cases/micro-op/materialization-predicate/pnot/main.cpp b/test/vpto/cases/micro-op/materialization-predicate/pnot/main.cpp deleted file mode 100644 index 64b153376..000000000 --- a/test/vpto/cases/micro-op/materialization-predicate/pnot/main.cpp +++ /dev/null @@ -1,104 +0,0 @@ -// Copyright (c) 2026 Huawei Technologies Co., Ltd. -// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). -// Please refer to the License for details. You may not use this file except in compliance with the License. -// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -// See LICENSE in the root of the software repository for the full text of the License. - -// ----------------------------------------------------------------------------- -// case: micro-op/materialization-predicate/pnot -// family: materialization-predicate -// target_ops: pto.pnot -// scenarios: predicate-transform -/** -Copyright (c) 2025 Huawei Technologies Co., Ltd. -This program is free software, you can redistribute it and/or modify it under the terms and conditions of -CANN Open Software License Agreement Version 2.0 (the "License"). -Please refer to the License for details. You may not use this file except in compliance with the License. 
-THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -See LICENSE in the root of the software repository for the full text of the License. -*/ - -#include "test_common.h" -#include "acl/acl.h" -#include -#include -#include - -using namespace PtoTestCommon; - -#ifndef TMRGSORT_HPP -struct MrgSortExecutedNumList { - uint16_t mrgSortList0; - uint16_t mrgSortList1; - uint16_t mrgSortList2; - uint16_t mrgSortList3; -}; -#endif - -#define ACL_CHECK(expr) do { const aclError _ret = (expr); if (_ret != ACL_SUCCESS) { std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", #expr, (int)_ret, __FILE__, __LINE__); const char *_recent = aclGetRecentErrMsg(); if (_recent != nullptr && _recent[0] != '\0') { std::fprintf(stderr, "[ERROR] RecentErrMsg: %s\n", _recent); } rc = 1; goto cleanup; } } while (0) - -void LaunchPnot(uint32_t *v1, void *stream); - -int main() { - size_t elemCount_v1 = 32; - size_t fileSize_v1 = elemCount_v1 * sizeof(uint32_t); - uint32_t *v1Host = nullptr; - uint32_t *v1Device = nullptr; - - int rc = 0; - bool aclInited = false; - bool deviceSet = false; - int deviceId = 0; - aclrtStream stream = nullptr; - - ACL_CHECK(aclInit(nullptr)); - aclInited = true; - if (const char *envDevice = std::getenv("ACL_DEVICE_ID")) { - deviceId = std::atoi(envDevice); - } - ACL_CHECK(aclrtSetDevice(deviceId)); - deviceSet = true; - ACL_CHECK(aclrtCreateStream(&stream)); - - ACL_CHECK(aclrtMallocHost((void **)(&v1Host), fileSize_v1)); - ACL_CHECK(aclrtMalloc((void **)&v1Device, fileSize_v1, ACL_MEM_MALLOC_HUGE_FIRST)); - - ReadFile("./v1.bin", fileSize_v1, v1Host, fileSize_v1); - ACL_CHECK(aclrtMemcpy(v1Device, fileSize_v1, v1Host, fileSize_v1, ACL_MEMCPY_HOST_TO_DEVICE)); - LaunchPnot(v1Device, stream); - - ACL_CHECK(aclrtSynchronizeStream(stream)); - ACL_CHECK(aclrtMemcpy(v1Host, fileSize_v1, v1Device, fileSize_v1, 
ACL_MEMCPY_DEVICE_TO_HOST)); - WriteFile("./v1.bin", v1Host, fileSize_v1); - -cleanup: - aclrtFree(v1Device); - aclrtFreeHost(v1Host); - if (stream != nullptr) { - const aclError _ret = aclrtDestroyStream(stream); - if (_ret != ACL_SUCCESS) { - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", - "aclrtDestroyStream(stream)", (int)_ret, __FILE__, __LINE__); - } - stream = nullptr; - } - if (deviceSet) { - const aclError _ret = aclrtResetDevice(deviceId); - if (_ret != ACL_SUCCESS) { - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", - "aclrtResetDevice(deviceId)", (int)_ret, __FILE__, __LINE__); - } - } - if (aclInited) { - const aclError _ret = aclFinalize(); - if (_ret != ACL_SUCCESS) { - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", - "aclFinalize()", (int)_ret, __FILE__, __LINE__); - } - } - - return rc; -} diff --git a/test/vpto/cases/micro-op/materialization-predicate/por/compare.py b/test/vpto/cases/micro-op/materialization-predicate/por/compare.py deleted file mode 100755 index 2d6c341a8..000000000 --- a/test/vpto/cases/micro-op/materialization-predicate/por/compare.py +++ /dev/null @@ -1,64 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) 2026 Huawei Technologies Co., Ltd. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. 
- -# case: micro-op/materialization-predicate/por -# family: materialization-predicate -# target_ops: pto.por -# scenarios: predicate-transform, logical-or -# coding=utf-8 - -import os -import sys - -import numpy as np - - -EXPECTED_WORDS = 32 -PREFIX_WORDS = 8 - - -def compare_words(golden_path, output_path): - if not os.path.exists(output_path) or not os.path.exists(golden_path): - print(f"[ERROR] Missing file: golden={golden_path} out={output_path}") - return False - golden = np.fromfile(golden_path, dtype=np.uint32) - output = np.fromfile(output_path, dtype=np.uint32) - if golden.size != EXPECTED_WORDS or output.size != EXPECTED_WORDS: - print( - f"[ERROR] Unexpected word count: golden={golden.size} " - f"out={output.size} expected={EXPECTED_WORDS}" - ) - return False - golden = golden[:PREFIX_WORDS] - output = output[:PREFIX_WORDS] - if not np.array_equal(golden, output): - diff = np.nonzero(golden != output)[0] - idx = int(diff[0]) if diff.size else 0 - print( - f"[ERROR] Mismatch (packed predicate words): idx={idx} " - f"golden={int(golden[idx])} out={int(output[idx])}" - ) - return False - return True - - -def main(): - strict = os.getenv("COMPARE_STRICT", "1") != "0" - ok = compare_words("golden_v1.bin", "v1.bin") - if not ok: - if strict: - print("[ERROR] compare failed") - sys.exit(2) - print("[WARN] compare failed (non-gating)") - return - print("[INFO] compare passed") - - -if __name__ == "__main__": - main() diff --git a/test/vpto/cases/micro-op/materialization-predicate/por/golden.py b/test/vpto/cases/micro-op/materialization-predicate/por/golden.py deleted file mode 100755 index c9c5dfe1e..000000000 --- a/test/vpto/cases/micro-op/materialization-predicate/por/golden.py +++ /dev/null @@ -1,62 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) 2026 Huawei Technologies Co., Ltd. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). 
-# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. - -# case: micro-op/materialization-predicate/por -# family: materialization-predicate -# target_ops: pto.por -# scenarios: predicate-transform, logical-or -# coding=utf-8 - -import argparse -from pathlib import Path - -import numpy as np - - -SEED = 19 -OUTPUT_WORDS = 32 -PREFIX_BITS = 13 -SUFFIX_BITS = 7 -PREDICATE_BITS = 256 -NIBBLE_COUNT = PREDICATE_BITS // 2 - - -def pack_nibbles(nibbles: np.ndarray) -> np.ndarray: - words = np.zeros((OUTPUT_WORDS,), dtype=np.uint32) - for idx, nibble in enumerate(nibbles): - words[idx // 8] |= np.uint32(int(nibble) & 0xF) << np.uint32((idx % 8) * 4) - return words - - -def generate(output_dir: Path, seed: int) -> None: - del seed - output_init = np.zeros((OUTPUT_WORDS,), dtype=np.uint32) - lhs = np.zeros((NIBBLE_COUNT,), dtype=np.uint8) - rhs = np.zeros((NIBBLE_COUNT,), dtype=np.uint8) - lhs[:PREFIX_BITS] = 1 - rhs[:SUFFIX_BITS] = 1 - golden = pack_nibbles(lhs | rhs) - - output_dir.mkdir(parents=True, exist_ok=True) - output_init.tofile(output_dir / "v1.bin") - golden.tofile(output_dir / "golden_v1.bin") - - -def main() -> None: - parser = argparse.ArgumentParser( - description="Generate packed predicate golden for VPTO micro-op validation." 
- ) - parser.add_argument("--output-dir", type=Path, default=Path(".")) - parser.add_argument("--seed", type=int, default=SEED) - args = parser.parse_args() - generate(args.output_dir, args.seed) - - -if __name__ == "__main__": - main() diff --git a/test/vpto/cases/micro-op/materialization-predicate/por/kernel.pto b/test/vpto/cases/micro-op/materialization-predicate/por/kernel.pto deleted file mode 100644 index 3eea2cb87..000000000 --- a/test/vpto/cases/micro-op/materialization-predicate/por/kernel.pto +++ /dev/null @@ -1,56 +0,0 @@ -// ----------------------------------------------------------------------------- -// case: micro-op/materialization-predicate/por -// family: materialization-predicate -// target_ops: pto.por -// scenarios: predicate-transform -// NOTE: bulk-generated coverage skeleton. Parser/verifier/lowering failure is -// still a valid test conclusion in the current coverage-first phase. -// ----------------------------------------------------------------------------- -// ============================================================================= -// abs_kernel_2d: Element-wise absolute value on a 32x32 f32 tile -// ============================================================================= -// This kernel computes abs(input) for a 32x32 float32 matrix (1024 elements). -// -// Memory Layout: -// - Input: arg0 -> GM (Global Memory) -// - Output: arg1 -> GM (Global Memory) -// - UB (Unified Buffer) at offset 0: input tile (4096 bytes = 32*32*4) -// - UB at offset 4096: output tile (4096 bytes = 32*32*4) -// -// Pipeline: -// 1. DMA: GM -> UB (MTE2 pipe) - copy input tile to UB -// 2. Sync: wait for MTE2 -> V pipe handoff -// 3. Compute: vabs on 64-element vectors (V pipe) - 16 iterations for 1024 elements -// 4. Sync: wait for V -> MTE3 pipe handoff -// 5. 
DMA: UB -> GM (MTE3 pipe) - copy result tile back to GM -// ============================================================================= - -module attributes {pto.target_arch = "a5", pto.kernel_kind = #pto.kernel_kind} { - func.func @por_kernel_2d(%arg0: !pto.ptr) attributes {pto.kernel} { - %c0 = arith.constant 0 : index - %c0_i64 = arith.constant 0 : i64 - %c1_i64 = arith.constant 1 : i64 - %c32_i64 = arith.constant 32 : i64 - %c7 = arith.constant 7 : i32 - %c13 = arith.constant 13 : i32 - - %ub_out = pto.castptr %c0_i64 : i64 -> !pto.ptr - %gm_out = pto.castptr %arg0 : !pto.ptr -> !pto.ptr - - pto.vecscope { - %all = pto.pset_b32 "PAT_ALL" : !pto.mask - %lhs, %lhs_next = pto.plt_b32 %c13 : i32 -> !pto.mask, i32 - %rhs, %rhs_next = pto.plt_b32 %c7 : i32 -> !pto.mask, i32 - %out = pto.por %lhs, %rhs, %all : !pto.mask, !pto.mask, !pto.mask -> !pto.mask - pto.psts %out, %ub_out[%c0], "NORM" : !pto.mask, !pto.ptr, index - } - - pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.mte_ub_gm %ub_out, %gm_out, %c32_i64 - nburst(%c1_i64, %c32_i64, %c32_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64 - pto.barrier #pto.pipe - return - } -} diff --git a/test/vpto/cases/micro-op/materialization-predicate/por/launch.cpp b/test/vpto/cases/micro-op/materialization-predicate/por/launch.cpp deleted file mode 100644 index caa8684a4..000000000 --- a/test/vpto/cases/micro-op/materialization-predicate/por/launch.cpp +++ /dev/null @@ -1,48 +0,0 @@ -// Copyright (c) 2026 Huawei Technologies Co., Ltd. -// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). -// Please refer to the License for details. You may not use this file except in compliance with the License. 
-// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -// See LICENSE in the root of the software repository for the full text of the License. - -// ----------------------------------------------------------------------------- -// case: micro-op/materialization-predicate/por -// family: materialization-predicate -// target_ops: pto.por -// scenarios: predicate-transform -// --------------------------------------------------------------------------- -// PTOAS compatibility layer -#ifndef __VEC_SCOPE__ -#define __VEC_SCOPE__ -#endif - -#if defined(__CCE_AICORE__) && defined(__NPU_ARCH__) && (__NPU_ARCH__ == 2201) -typedef struct { unsigned char v; } hifloat8_t; -typedef struct { unsigned char v; } float8_e4m3_t; -typedef struct { unsigned char v; } float8_e5m2_t; -typedef struct { unsigned char v; } float8_e8m0_t; -typedef struct { unsigned char v; } float4_e1m2x2_t; -typedef struct { unsigned char v; } float4_e2m1x2_t; -#endif -#include -#if defined(__CCE_AICORE__) && defined(PTOAS_ENABLE_CCE_PRINT) -#include -#endif -#if !defined(__CCE_AICORE__) && !defined(TMRGSORT_HPP) -struct MrgSortExecutedNumList { - uint16_t mrgSortList0; - uint16_t mrgSortList1; - uint16_t mrgSortList2; - uint16_t mrgSortList3; -}; -#endif -#ifndef __CPU_SIM -#include "acl/acl.h" -#endif - -extern "C" __global__ [aicore] void por_kernel_2d(__gm__ uint32_t *v1); - -void LaunchPor(uint32_t *v1, void *stream) { - por_kernel_2d<<<1, nullptr, stream>>>((__gm__ uint32_t *)v1); -} diff --git a/test/vpto/cases/micro-op/materialization-predicate/por/main.cpp b/test/vpto/cases/micro-op/materialization-predicate/por/main.cpp deleted file mode 100644 index 527116eff..000000000 --- a/test/vpto/cases/micro-op/materialization-predicate/por/main.cpp +++ /dev/null @@ -1,104 +0,0 @@ -// Copyright (c) 2026 Huawei Technologies Co., Ltd. 
-// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). -// Please refer to the License for details. You may not use this file except in compliance with the License. -// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -// See LICENSE in the root of the software repository for the full text of the License. - -// ----------------------------------------------------------------------------- -// case: micro-op/materialization-predicate/por -// family: materialization-predicate -// target_ops: pto.por -// scenarios: predicate-transform -/** -Copyright (c) 2025 Huawei Technologies Co., Ltd. -This program is free software, you can redistribute it and/or modify it under the terms and conditions of -CANN Open Software License Agreement Version 2.0 (the "License"). -Please refer to the License for details. You may not use this file except in compliance with the License. -THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -See LICENSE in the root of the software repository for the full text of the License. 
-*/ - -#include "test_common.h" -#include "acl/acl.h" -#include -#include -#include - -using namespace PtoTestCommon; - -#ifndef TMRGSORT_HPP -struct MrgSortExecutedNumList { - uint16_t mrgSortList0; - uint16_t mrgSortList1; - uint16_t mrgSortList2; - uint16_t mrgSortList3; -}; -#endif - -#define ACL_CHECK(expr) do { const aclError _ret = (expr); if (_ret != ACL_SUCCESS) { std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", #expr, (int)_ret, __FILE__, __LINE__); const char *_recent = aclGetRecentErrMsg(); if (_recent != nullptr && _recent[0] != '\0') { std::fprintf(stderr, "[ERROR] RecentErrMsg: %s\n", _recent); } rc = 1; goto cleanup; } } while (0) - -void LaunchPor(uint32_t *v1, void *stream); - -int main() { - size_t elemCount_v1 = 32; - size_t fileSize_v1 = elemCount_v1 * sizeof(uint32_t); - uint32_t *v1Host = nullptr; - uint32_t *v1Device = nullptr; - - int rc = 0; - bool aclInited = false; - bool deviceSet = false; - int deviceId = 0; - aclrtStream stream = nullptr; - - ACL_CHECK(aclInit(nullptr)); - aclInited = true; - if (const char *envDevice = std::getenv("ACL_DEVICE_ID")) { - deviceId = std::atoi(envDevice); - } - ACL_CHECK(aclrtSetDevice(deviceId)); - deviceSet = true; - ACL_CHECK(aclrtCreateStream(&stream)); - - ACL_CHECK(aclrtMallocHost((void **)(&v1Host), fileSize_v1)); - ACL_CHECK(aclrtMalloc((void **)&v1Device, fileSize_v1, ACL_MEM_MALLOC_HUGE_FIRST)); - - ReadFile("./v1.bin", fileSize_v1, v1Host, fileSize_v1); - ACL_CHECK(aclrtMemcpy(v1Device, fileSize_v1, v1Host, fileSize_v1, ACL_MEMCPY_HOST_TO_DEVICE)); - LaunchPor(v1Device, stream); - - ACL_CHECK(aclrtSynchronizeStream(stream)); - ACL_CHECK(aclrtMemcpy(v1Host, fileSize_v1, v1Device, fileSize_v1, ACL_MEMCPY_DEVICE_TO_HOST)); - WriteFile("./v1.bin", v1Host, fileSize_v1); - -cleanup: - aclrtFree(v1Device); - aclrtFreeHost(v1Host); - if (stream != nullptr) { - const aclError _ret = aclrtDestroyStream(stream); - if (_ret != ACL_SUCCESS) { - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", 
- "aclrtDestroyStream(stream)", (int)_ret, __FILE__, __LINE__); - } - stream = nullptr; - } - if (deviceSet) { - const aclError _ret = aclrtResetDevice(deviceId); - if (_ret != ACL_SUCCESS) { - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", - "aclrtResetDevice(deviceId)", (int)_ret, __FILE__, __LINE__); - } - } - if (aclInited) { - const aclError _ret = aclFinalize(); - if (_ret != ACL_SUCCESS) { - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", - "aclFinalize()", (int)_ret, __FILE__, __LINE__); - } - } - - return rc; -} diff --git a/test/vpto/cases/micro-op/materialization-predicate/ppack-punpack-nontrivial/compare.py b/test/vpto/cases/micro-op/materialization-predicate/ppack-punpack-nontrivial/compare.py deleted file mode 100755 index 2585ff1e4..000000000 --- a/test/vpto/cases/micro-op/materialization-predicate/ppack-punpack-nontrivial/compare.py +++ /dev/null @@ -1,64 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) 2026 Huawei Technologies Co., Ltd. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. 
- -# case: micro-op/materialization-predicate/ppack-punpack-nontrivial -# family: materialization-predicate -# target_ops: pto.ppack, pto.punpack -# scenarios: pack-unpack-roundtrip, representative-logical-elements -# coding=utf-8 - -import os -import sys - -import numpy as np - - -EXPECTED_WORDS = 32 -PREFIX_WORDS = 16 - - -def compare_words(golden_path, output_path): - if not os.path.exists(output_path) or not os.path.exists(golden_path): - print(f"[ERROR] Missing file: golden={golden_path} out={output_path}") - return False - golden = np.fromfile(golden_path, dtype=np.uint32) - output = np.fromfile(output_path, dtype=np.uint32) - if golden.size != EXPECTED_WORDS or output.size != EXPECTED_WORDS: - print( - f"[ERROR] Unexpected word count: golden={golden.size} " - f"out={output.size} expected={EXPECTED_WORDS}" - ) - return False - golden = golden[:PREFIX_WORDS] - output = output[:PREFIX_WORDS] - if not np.array_equal(golden, output): - diff = np.nonzero(golden != output)[0] - idx = int(diff[0]) if diff.size else 0 - print( - f"[ERROR] Mismatch (packed predicate words): idx={idx} " - f"golden={int(golden[idx])} out={int(output[idx])}" - ) - return False - return True - - -def main(): - strict = os.getenv("COMPARE_STRICT", "1") != "0" - ok = compare_words("golden_v1.bin", "v1.bin") - if not ok: - if strict: - print("[ERROR] compare failed") - sys.exit(2) - print("[WARN] compare failed (non-gating)") - return - print("[INFO] compare passed") - - -if __name__ == "__main__": - main() diff --git a/test/vpto/cases/micro-op/materialization-predicate/ppack-punpack-nontrivial/golden.py b/test/vpto/cases/micro-op/materialization-predicate/ppack-punpack-nontrivial/golden.py deleted file mode 100755 index 7923992f9..000000000 --- a/test/vpto/cases/micro-op/materialization-predicate/ppack-punpack-nontrivial/golden.py +++ /dev/null @@ -1,49 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) 2026 Huawei Technologies Co., Ltd. 
-# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. - -# case: micro-op/materialization-predicate/ppack-punpack-nontrivial -# family: materialization-predicate -# target_ops: pto.ppack, pto.punpack -# scenarios: pack-unpack-roundtrip, representative-logical-elements -# coding=utf-8 - -import argparse -from pathlib import Path - -import numpy as np - - -SEED = 19 -OUTPUT_WORDS = 32 -GOLDEN_PREFIX_WORDS = np.array([16843009, 16843009, 16843009, 16843009, 0, 0, 0, 0, 65537, 65537, 65537, 65537, 65537, 65537, 65537, 65537], dtype=np.uint32) - - -def generate(output_dir: Path, seed: int) -> None: - del seed - output_init = np.zeros((OUTPUT_WORDS,), dtype=np.uint32) - golden = np.zeros((OUTPUT_WORDS,), dtype=np.uint32) - golden[: GOLDEN_PREFIX_WORDS.size] = GOLDEN_PREFIX_WORDS - - output_dir.mkdir(parents=True, exist_ok=True) - output_init.tofile(output_dir / "v1.bin") - golden.tofile(output_dir / "golden_v1.bin") - - -def main() -> None: - parser = argparse.ArgumentParser( - description="Generate packed predicate golden for VPTO micro-op validation." 
- ) - parser.add_argument("--output-dir", type=Path, default=Path(".")) - parser.add_argument("--seed", type=int, default=SEED) - args = parser.parse_args() - generate(args.output_dir, args.seed) - - -if __name__ == "__main__": - main() diff --git a/test/vpto/cases/micro-op/materialization-predicate/ppack-punpack-nontrivial/kernel.pto b/test/vpto/cases/micro-op/materialization-predicate/ppack-punpack-nontrivial/kernel.pto deleted file mode 100644 index b387f0b78..000000000 --- a/test/vpto/cases/micro-op/materialization-predicate/ppack-punpack-nontrivial/kernel.pto +++ /dev/null @@ -1,37 +0,0 @@ -// ----------------------------------------------------------------------------- -// case: micro-op/materialization-predicate/ppack-punpack-nontrivial -// family: materialization-predicate -// target_ops: pto.ppack, pto.punpack -// scenarios: pack-unpack-roundtrip, nontrivial-pattern -// ----------------------------------------------------------------------------- -module attributes {pto.target_arch = "a5", pto.kernel_kind = #pto.kernel_kind} { - func.func @ppack_punpack_nontrivial_kernel_2d(%arg0: !pto.ptr) attributes {pto.kernel} { - %c0 = arith.constant 0 : index - %c1 = arith.constant 1 : index - %c32 = arith.constant 32 : index - %c0_i64 = arith.constant 0 : i64 - %c1_i64 = arith.constant 1 : i64 - %c64_i64 = arith.constant 64 : i64 - - %ub_out = pto.castptr %c0_i64 : i64 -> !pto.ptr - %gm_out = pto.castptr %arg0 : !pto.ptr -> !pto.ptr - - pto.vecscope { - scf.for %iter = %c0 to %c1 step %c1 { - %src = pto.pset_b32 "PAT_M4" : !pto.mask - %packed = pto.ppack %src, "LOWER" : !pto.mask -> !pto.mask - %roundtrip = pto.punpack %packed, "LOWER" : !pto.mask -> !pto.mask - pto.psts %packed, %ub_out[%c0], "NORM" : !pto.mask, !pto.ptr, index - pto.psts %roundtrip, %ub_out[%c32], "NORM" : !pto.mask, !pto.ptr, index - } - } - - pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.mte_ub_gm %ub_out, %gm_out, %c64_i64 - 
nburst(%c1_i64, %c64_i64, %c64_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64 - pto.barrier #pto.pipe - return - } -} diff --git a/test/vpto/cases/micro-op/materialization-predicate/ppack-punpack-nontrivial/launch.cpp b/test/vpto/cases/micro-op/materialization-predicate/ppack-punpack-nontrivial/launch.cpp deleted file mode 100644 index aac69efd1..000000000 --- a/test/vpto/cases/micro-op/materialization-predicate/ppack-punpack-nontrivial/launch.cpp +++ /dev/null @@ -1,48 +0,0 @@ -// Copyright (c) 2026 Huawei Technologies Co., Ltd. -// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). -// Please refer to the License for details. You may not use this file except in compliance with the License. -// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -// See LICENSE in the root of the software repository for the full text of the License. 
- -// ----------------------------------------------------------------------------- -// case: micro-op/materialization-predicate/ppack-punpack-nontrivial -// family: materialization-predicate -// target_ops: pto.ppack, pto.punpack -// scenarios: pack-unpack-roundtrip, nontrivial-pattern -// --------------------------------------------------------------------------- -// PTOAS compatibility layer -#ifndef __VEC_SCOPE__ -#define __VEC_SCOPE__ -#endif - -#if defined(__CCE_AICORE__) && defined(__NPU_ARCH__) && (__NPU_ARCH__ == 2201) -typedef struct { unsigned char v; } hifloat8_t; -typedef struct { unsigned char v; } float8_e4m3_t; -typedef struct { unsigned char v; } float8_e5m2_t; -typedef struct { unsigned char v; } float8_e8m0_t; -typedef struct { unsigned char v; } float4_e1m2x2_t; -typedef struct { unsigned char v; } float4_e2m1x2_t; -#endif -#include -#if defined(__CCE_AICORE__) && defined(PTOAS_ENABLE_CCE_PRINT) -#include -#endif -#if !defined(__CCE_AICORE__) && !defined(TMRGSORT_HPP) -struct MrgSortExecutedNumList { - uint16_t mrgSortList0; - uint16_t mrgSortList1; - uint16_t mrgSortList2; - uint16_t mrgSortList3; -}; -#endif -#ifndef __CPU_SIM -#include "acl/acl.h" -#endif - -extern "C" __global__ [aicore] void ppack_punpack_nontrivial_kernel_2d(__gm__ uint32_t *v1); - -void LaunchPpackPunpackNontrivial(uint32_t *v1, void *stream) { - ppack_punpack_nontrivial_kernel_2d<<<1, nullptr, stream>>>((__gm__ uint32_t *)v1); -} diff --git a/test/vpto/cases/micro-op/materialization-predicate/ppack-punpack-nontrivial/main.cpp b/test/vpto/cases/micro-op/materialization-predicate/ppack-punpack-nontrivial/main.cpp deleted file mode 100644 index 30feafd01..000000000 --- a/test/vpto/cases/micro-op/materialization-predicate/ppack-punpack-nontrivial/main.cpp +++ /dev/null @@ -1,104 +0,0 @@ -// Copyright (c) 2026 Huawei Technologies Co., Ltd. 
-// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). -// Please refer to the License for details. You may not use this file except in compliance with the License. -// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -// See LICENSE in the root of the software repository for the full text of the License. - -// ----------------------------------------------------------------------------- -// case: micro-op/materialization-predicate/ppack-punpack-nontrivial -// family: materialization-predicate -// target_ops: pto.ppack, pto.punpack -// scenarios: pack-unpack-roundtrip, nontrivial-pattern -/** -Copyright (c) 2025 Huawei Technologies Co., Ltd. -This program is free software, you can redistribute it and/or modify it under the terms and conditions of -CANN Open Software License Agreement Version 2.0 (the "License"). -Please refer to the License for details. You may not use this file except in compliance with the License. -THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -See LICENSE in the root of the software repository for the full text of the License. 
-*/ - -#include "test_common.h" -#include "acl/acl.h" -#include -#include -#include - -using namespace PtoTestCommon; - -#ifndef TMRGSORT_HPP -struct MrgSortExecutedNumList { - uint16_t mrgSortList0; - uint16_t mrgSortList1; - uint16_t mrgSortList2; - uint16_t mrgSortList3; -}; -#endif - -#define ACL_CHECK(expr) do { const aclError _ret = (expr); if (_ret != ACL_SUCCESS) { std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", #expr, (int)_ret, __FILE__, __LINE__); const char *_recent = aclGetRecentErrMsg(); if (_recent != nullptr && _recent[0] != '\0') { std::fprintf(stderr, "[ERROR] RecentErrMsg: %s\n", _recent); } rc = 1; goto cleanup; } } while (0) - -void LaunchPpackPunpackNontrivial(uint32_t *v1, void *stream); - -int main() { - size_t elemCount_v1 = 32; - size_t fileSize_v1 = elemCount_v1 * sizeof(uint32_t); - uint32_t *v1Host = nullptr; - uint32_t *v1Device = nullptr; - - int rc = 0; - bool aclInited = false; - bool deviceSet = false; - int deviceId = 0; - aclrtStream stream = nullptr; - - ACL_CHECK(aclInit(nullptr)); - aclInited = true; - if (const char *envDevice = std::getenv("ACL_DEVICE_ID")) { - deviceId = std::atoi(envDevice); - } - ACL_CHECK(aclrtSetDevice(deviceId)); - deviceSet = true; - ACL_CHECK(aclrtCreateStream(&stream)); - - ACL_CHECK(aclrtMallocHost((void **)(&v1Host), fileSize_v1)); - ACL_CHECK(aclrtMalloc((void **)&v1Device, fileSize_v1, ACL_MEM_MALLOC_HUGE_FIRST)); - - ReadFile("./v1.bin", fileSize_v1, v1Host, fileSize_v1); - ACL_CHECK(aclrtMemcpy(v1Device, fileSize_v1, v1Host, fileSize_v1, ACL_MEMCPY_HOST_TO_DEVICE)); - LaunchPpackPunpackNontrivial(v1Device, stream); - - ACL_CHECK(aclrtSynchronizeStream(stream)); - ACL_CHECK(aclrtMemcpy(v1Host, fileSize_v1, v1Device, fileSize_v1, ACL_MEMCPY_DEVICE_TO_HOST)); - WriteFile("./v1.bin", v1Host, fileSize_v1); - -cleanup: - aclrtFree(v1Device); - aclrtFreeHost(v1Host); - if (stream != nullptr) { - const aclError _ret = aclrtDestroyStream(stream); - if (_ret != ACL_SUCCESS) { - 
std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", - "aclrtDestroyStream(stream)", (int)_ret, __FILE__, __LINE__); - } - stream = nullptr; - } - if (deviceSet) { - const aclError _ret = aclrtResetDevice(deviceId); - if (_ret != ACL_SUCCESS) { - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", - "aclrtResetDevice(deviceId)", (int)_ret, __FILE__, __LINE__); - } - } - if (aclInited) { - const aclError _ret = aclFinalize(); - if (_ret != ACL_SUCCESS) { - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", - "aclFinalize()", (int)_ret, __FILE__, __LINE__); - } - } - - return rc; -} diff --git a/test/vpto/cases/micro-op/materialization-predicate/ppack-punpack/compare.py b/test/vpto/cases/micro-op/materialization-predicate/ppack-punpack/compare.py deleted file mode 100755 index 08e54575d..000000000 --- a/test/vpto/cases/micro-op/materialization-predicate/ppack-punpack/compare.py +++ /dev/null @@ -1,64 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) 2026 Huawei Technologies Co., Ltd. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. 
- -# case: micro-op/materialization-predicate/ppack-punpack -# family: materialization-predicate -# target_ops: pto.ppack, pto.punpack -# scenarios: pack-unpack-roundtrip -# coding=utf-8 - -import os -import sys - -import numpy as np - - -EXPECTED_WORDS = 32 -PREFIX_WORDS = 16 - - -def compare_words(golden_path, output_path): - if not os.path.exists(output_path) or not os.path.exists(golden_path): - print(f"[ERROR] Missing file: golden={golden_path} out={output_path}") - return False - golden = np.fromfile(golden_path, dtype=np.uint32) - output = np.fromfile(output_path, dtype=np.uint32) - if golden.size != EXPECTED_WORDS or output.size != EXPECTED_WORDS: - print( - f"[ERROR] Unexpected word count: golden={golden.size} " - f"out={output.size} expected={EXPECTED_WORDS}" - ) - return False - golden = golden[:PREFIX_WORDS] - output = output[:PREFIX_WORDS] - if not np.array_equal(golden, output): - diff = np.nonzero(golden != output)[0] - idx = int(diff[0]) if diff.size else 0 - print( - f"[ERROR] Mismatch (packed predicate words): idx={idx} " - f"golden={int(golden[idx])} out={int(output[idx])}" - ) - return False - return True - - -def main(): - strict = os.getenv("COMPARE_STRICT", "1") != "0" - ok = compare_words("golden_v1.bin", "v1.bin") - if not ok: - if strict: - print("[ERROR] compare failed") - sys.exit(2) - print("[WARN] compare failed (non-gating)") - return - print("[INFO] compare passed") - - -if __name__ == "__main__": - main() diff --git a/test/vpto/cases/micro-op/materialization-predicate/ppack-punpack/golden.py b/test/vpto/cases/micro-op/materialization-predicate/ppack-punpack/golden.py deleted file mode 100755 index 05d8b350b..000000000 --- a/test/vpto/cases/micro-op/materialization-predicate/ppack-punpack/golden.py +++ /dev/null @@ -1,49 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) 2026 Huawei Technologies Co., Ltd. 
-# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. - -# case: micro-op/materialization-predicate/ppack-punpack -# family: materialization-predicate -# target_ops: pto.ppack, pto.punpack -# scenarios: pack-unpack-roundtrip -# coding=utf-8 - -import argparse -from pathlib import Path - -import numpy as np - - -SEED = 19 -OUTPUT_WORDS = 32 -GOLDEN_PREFIX_WORDS = np.array([1431655765, 1431655765, 1431655765, 1431655765, 0, 0, 0, 0, 286331153, 286331153, 286331153, 286331153, 286331153, 286331153, 286331153, 286331153], dtype=np.uint32) - - -def generate(output_dir: Path, seed: int) -> None: - del seed - output_init = np.zeros((OUTPUT_WORDS,), dtype=np.uint32) - golden = np.zeros((OUTPUT_WORDS,), dtype=np.uint32) - golden[: GOLDEN_PREFIX_WORDS.size] = GOLDEN_PREFIX_WORDS - - output_dir.mkdir(parents=True, exist_ok=True) - output_init.tofile(output_dir / "v1.bin") - golden.tofile(output_dir / "golden_v1.bin") - - -def main() -> None: - parser = argparse.ArgumentParser( - description="Generate packed predicate golden for VPTO micro-op validation." 
- ) - parser.add_argument("--output-dir", type=Path, default=Path(".")) - parser.add_argument("--seed", type=int, default=SEED) - args = parser.parse_args() - generate(args.output_dir, args.seed) - - -if __name__ == "__main__": - main() diff --git a/test/vpto/cases/micro-op/materialization-predicate/ppack-punpack/kernel.pto b/test/vpto/cases/micro-op/materialization-predicate/ppack-punpack/kernel.pto deleted file mode 100644 index 3e701025c..000000000 --- a/test/vpto/cases/micro-op/materialization-predicate/ppack-punpack/kernel.pto +++ /dev/null @@ -1,37 +0,0 @@ -// ----------------------------------------------------------------------------- -// case: micro-op/materialization-predicate/ppack-punpack -// family: materialization-predicate -// target_ops: pto.ppack, pto.punpack -// scenarios: pack-unpack-roundtrip -// ----------------------------------------------------------------------------- -module attributes {pto.target_arch = "a5", pto.kernel_kind = #pto.kernel_kind} { - func.func @ppack_punpack_kernel_2d(%arg0: !pto.ptr) attributes {pto.kernel} { - %c0 = arith.constant 0 : index - %c1 = arith.constant 1 : index - %c32 = arith.constant 32 : index - %c0_i64 = arith.constant 0 : i64 - %c1_i64 = arith.constant 1 : i64 - %c64_i64 = arith.constant 64 : i64 - - %ub_out = pto.castptr %c0_i64 : i64 -> !pto.ptr - %gm_out = pto.castptr %arg0 : !pto.ptr -> !pto.ptr - - pto.vecscope { - scf.for %iter = %c0 to %c1 step %c1 { - %src = pto.pset_b32 "PAT_ALL" : !pto.mask - %packed = pto.ppack %src, "LOWER" : !pto.mask -> !pto.mask - %roundtrip = pto.punpack %packed, "LOWER" : !pto.mask -> !pto.mask - pto.psts %packed, %ub_out[%c0], "NORM" : !pto.mask, !pto.ptr, index - pto.psts %roundtrip, %ub_out[%c32], "NORM" : !pto.mask, !pto.ptr, index - } - } - - pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.mte_ub_gm %ub_out, %gm_out, %c64_i64 - nburst(%c1_i64, %c64_i64, %c64_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64 - 
pto.barrier #pto.pipe - return - } -} diff --git a/test/vpto/cases/micro-op/materialization-predicate/ppack-punpack/launch.cpp b/test/vpto/cases/micro-op/materialization-predicate/ppack-punpack/launch.cpp deleted file mode 100644 index 2dc4b848d..000000000 --- a/test/vpto/cases/micro-op/materialization-predicate/ppack-punpack/launch.cpp +++ /dev/null @@ -1,48 +0,0 @@ -// Copyright (c) 2026 Huawei Technologies Co., Ltd. -// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). -// Please refer to the License for details. You may not use this file except in compliance with the License. -// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -// See LICENSE in the root of the software repository for the full text of the License. 
- -// ----------------------------------------------------------------------------- -// case: micro-op/materialization-predicate/ppack-punpack -// family: materialization-predicate -// target_ops: pto.ppack, pto.punpack -// scenarios: pack-unpack-roundtrip -// --------------------------------------------------------------------------- -// PTOAS compatibility layer -#ifndef __VEC_SCOPE__ -#define __VEC_SCOPE__ -#endif - -#if defined(__CCE_AICORE__) && defined(__NPU_ARCH__) && (__NPU_ARCH__ == 2201) -typedef struct { unsigned char v; } hifloat8_t; -typedef struct { unsigned char v; } float8_e4m3_t; -typedef struct { unsigned char v; } float8_e5m2_t; -typedef struct { unsigned char v; } float8_e8m0_t; -typedef struct { unsigned char v; } float4_e1m2x2_t; -typedef struct { unsigned char v; } float4_e2m1x2_t; -#endif -#include -#if defined(__CCE_AICORE__) && defined(PTOAS_ENABLE_CCE_PRINT) -#include -#endif -#if !defined(__CCE_AICORE__) && !defined(TMRGSORT_HPP) -struct MrgSortExecutedNumList { - uint16_t mrgSortList0; - uint16_t mrgSortList1; - uint16_t mrgSortList2; - uint16_t mrgSortList3; -}; -#endif -#ifndef __CPU_SIM -#include "acl/acl.h" -#endif - -extern "C" __global__ [aicore] void ppack_punpack_kernel_2d(__gm__ uint32_t *v1); - -void LaunchPpackPunpack(uint32_t *v1, void *stream) { - ppack_punpack_kernel_2d<<<1, nullptr, stream>>>((__gm__ uint32_t *)v1); -} diff --git a/test/vpto/cases/micro-op/materialization-predicate/ppack-punpack/main.cpp b/test/vpto/cases/micro-op/materialization-predicate/ppack-punpack/main.cpp deleted file mode 100644 index 0ad47bb5d..000000000 --- a/test/vpto/cases/micro-op/materialization-predicate/ppack-punpack/main.cpp +++ /dev/null @@ -1,104 +0,0 @@ -// Copyright (c) 2026 Huawei Technologies Co., Ltd. -// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). -// Please refer to the License for details. 
You may not use this file except in compliance with the License. -// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -// See LICENSE in the root of the software repository for the full text of the License. - -// ----------------------------------------------------------------------------- -// case: micro-op/materialization-predicate/ppack-punpack -// family: materialization-predicate -// target_ops: pto.ppack, pto.punpack -// scenarios: pack-unpack-roundtrip -/** -Copyright (c) 2025 Huawei Technologies Co., Ltd. -This program is free software, you can redistribute it and/or modify it under the terms and conditions of -CANN Open Software License Agreement Version 2.0 (the "License"). -Please refer to the License for details. You may not use this file except in compliance with the License. -THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -See LICENSE in the root of the software repository for the full text of the License. 
-*/ - -#include "test_common.h" -#include "acl/acl.h" -#include -#include -#include - -using namespace PtoTestCommon; - -#ifndef TMRGSORT_HPP -struct MrgSortExecutedNumList { - uint16_t mrgSortList0; - uint16_t mrgSortList1; - uint16_t mrgSortList2; - uint16_t mrgSortList3; -}; -#endif - -#define ACL_CHECK(expr) do { const aclError _ret = (expr); if (_ret != ACL_SUCCESS) { std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", #expr, (int)_ret, __FILE__, __LINE__); const char *_recent = aclGetRecentErrMsg(); if (_recent != nullptr && _recent[0] != '\0') { std::fprintf(stderr, "[ERROR] RecentErrMsg: %s\n", _recent); } rc = 1; goto cleanup; } } while (0) - -void LaunchPpackPunpack(uint32_t *v1, void *stream); - -int main() { - size_t elemCount_v1 = 32; - size_t fileSize_v1 = elemCount_v1 * sizeof(uint32_t); - uint32_t *v1Host = nullptr; - uint32_t *v1Device = nullptr; - - int rc = 0; - bool aclInited = false; - bool deviceSet = false; - int deviceId = 0; - aclrtStream stream = nullptr; - - ACL_CHECK(aclInit(nullptr)); - aclInited = true; - if (const char *envDevice = std::getenv("ACL_DEVICE_ID")) { - deviceId = std::atoi(envDevice); - } - ACL_CHECK(aclrtSetDevice(deviceId)); - deviceSet = true; - ACL_CHECK(aclrtCreateStream(&stream)); - - ACL_CHECK(aclrtMallocHost((void **)(&v1Host), fileSize_v1)); - ACL_CHECK(aclrtMalloc((void **)&v1Device, fileSize_v1, ACL_MEM_MALLOC_HUGE_FIRST)); - - ReadFile("./v1.bin", fileSize_v1, v1Host, fileSize_v1); - ACL_CHECK(aclrtMemcpy(v1Device, fileSize_v1, v1Host, fileSize_v1, ACL_MEMCPY_HOST_TO_DEVICE)); - LaunchPpackPunpack(v1Device, stream); - - ACL_CHECK(aclrtSynchronizeStream(stream)); - ACL_CHECK(aclrtMemcpy(v1Host, fileSize_v1, v1Device, fileSize_v1, ACL_MEMCPY_DEVICE_TO_HOST)); - WriteFile("./v1.bin", v1Host, fileSize_v1); - -cleanup: - aclrtFree(v1Device); - aclrtFreeHost(v1Host); - if (stream != nullptr) { - const aclError _ret = aclrtDestroyStream(stream); - if (_ret != ACL_SUCCESS) { - std::fprintf(stderr, "[ERROR] %s 
failed: %d (%s:%d)\n", - "aclrtDestroyStream(stream)", (int)_ret, __FILE__, __LINE__); - } - stream = nullptr; - } - if (deviceSet) { - const aclError _ret = aclrtResetDevice(deviceId); - if (_ret != ACL_SUCCESS) { - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", - "aclrtResetDevice(deviceId)", (int)_ret, __FILE__, __LINE__); - } - } - if (aclInited) { - const aclError _ret = aclFinalize(); - if (_ret != ACL_SUCCESS) { - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", - "aclFinalize()", (int)_ret, __FILE__, __LINE__); - } - } - - return rc; -} diff --git a/test/vpto/cases/micro-op/materialization-predicate/psel-tail-predicate/compare.py b/test/vpto/cases/micro-op/materialization-predicate/psel-tail-predicate/compare.py deleted file mode 100755 index d334ff512..000000000 --- a/test/vpto/cases/micro-op/materialization-predicate/psel-tail-predicate/compare.py +++ /dev/null @@ -1,64 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) 2026 Huawei Technologies Co., Ltd. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. 
- -# case: micro-op/materialization-predicate/psel-tail-predicate -# family: materialization-predicate -# target_ops: pto.psel -# scenarios: predicate-transform, predicate-select, tail-mask -# coding=utf-8 - -import os -import sys - -import numpy as np - - -EXPECTED_WORDS = 32 -PREFIX_WORDS = 16 - - -def compare_words(golden_path, output_path): - if not os.path.exists(output_path) or not os.path.exists(golden_path): - print(f"[ERROR] Missing file: golden={golden_path} out={output_path}") - return False - golden = np.fromfile(golden_path, dtype=np.uint32) - output = np.fromfile(output_path, dtype=np.uint32) - if golden.size != EXPECTED_WORDS or output.size != EXPECTED_WORDS: - print( - f"[ERROR] Unexpected word count: golden={golden.size} " - f"out={output.size} expected={EXPECTED_WORDS}" - ) - return False - golden = golden[:PREFIX_WORDS] - output = output[:PREFIX_WORDS] - if not np.array_equal(golden, output): - diff = np.nonzero(golden != output)[0] - idx = int(diff[0]) if diff.size else 0 - print( - f"[ERROR] Mismatch (packed predicate words): idx={idx} " - f"golden={int(golden[idx])} out={int(output[idx])}" - ) - return False - return True - - -def main(): - strict = os.getenv("COMPARE_STRICT", "1") != "0" - ok = compare_words("golden_v1.bin", "v1.bin") - if not ok: - if strict: - print("[ERROR] compare failed") - sys.exit(2) - print("[WARN] compare failed (non-gating)") - return - print("[INFO] compare passed") - - -if __name__ == "__main__": - main() diff --git a/test/vpto/cases/micro-op/materialization-predicate/psel-tail-predicate/golden.py b/test/vpto/cases/micro-op/materialization-predicate/psel-tail-predicate/golden.py deleted file mode 100755 index 0144a0f36..000000000 --- a/test/vpto/cases/micro-op/materialization-predicate/psel-tail-predicate/golden.py +++ /dev/null @@ -1,49 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) 2026 Huawei Technologies Co., Ltd. 
-# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. - -# case: micro-op/materialization-predicate/psel-tail-predicate -# family: materialization-predicate -# target_ops: pto.psel -# scenarios: predicate-transform, predicate-select, tail-mask -# coding=utf-8 - -import argparse -from pathlib import Path - -import numpy as np - - -SEED = 19 -OUTPUT_WORDS = 32 -GOLDEN_PREFIX_WORDS = np.array([286331153, 69905, 0, 0, 0, 0, 0, 0, 286331153, 286331153, 286331153, 286331153, 286331153, 286331153, 286331153, 286331153], dtype=np.uint32) - - -def generate(output_dir: Path, seed: int) -> None: - del seed - output_init = np.zeros((OUTPUT_WORDS,), dtype=np.uint32) - golden = np.zeros((OUTPUT_WORDS,), dtype=np.uint32) - golden[: GOLDEN_PREFIX_WORDS.size] = GOLDEN_PREFIX_WORDS - - output_dir.mkdir(parents=True, exist_ok=True) - output_init.tofile(output_dir / "v1.bin") - golden.tofile(output_dir / "golden_v1.bin") - - -def main() -> None: - parser = argparse.ArgumentParser( - description="Generate packed predicate golden for VPTO micro-op validation." 
- ) - parser.add_argument("--output-dir", type=Path, default=Path(".")) - parser.add_argument("--seed", type=int, default=SEED) - args = parser.parse_args() - generate(args.output_dir, args.seed) - - -if __name__ == "__main__": - main() diff --git a/test/vpto/cases/micro-op/materialization-predicate/psel-tail-predicate/kernel.pto b/test/vpto/cases/micro-op/materialization-predicate/psel-tail-predicate/kernel.pto deleted file mode 100644 index 4dab858c2..000000000 --- a/test/vpto/cases/micro-op/materialization-predicate/psel-tail-predicate/kernel.pto +++ /dev/null @@ -1,41 +0,0 @@ -// ----------------------------------------------------------------------------- -// case: micro-op/materialization-predicate/psel-tail-predicate -// family: materialization-predicate -// target_ops: pto.psel -// scenarios: predicate-transform, predicate-select, tail-mask -// ----------------------------------------------------------------------------- -module attributes {pto.target_arch = "a5", pto.kernel_kind = #pto.kernel_kind} { - func.func @psel_tail_kernel_2d(%arg0: !pto.ptr) attributes {pto.kernel} { - %c0 = arith.constant 0 : index - %c1 = arith.constant 1 : index - %c32 = arith.constant 32 : index - %c64 = arith.constant 64 : index - %c0_i64 = arith.constant 0 : i64 - %c1_i64 = arith.constant 1 : i64 - %c32_i64 = arith.constant 32 : i64 - %c64_i64 = arith.constant 64 : i64 - %c13 = arith.constant 13 : i32 - - %ub_out = pto.castptr %c0_i64 : i64 -> !pto.ptr - %gm_out = pto.castptr %arg0 : !pto.ptr -> !pto.ptr - - pto.vecscope { - scf.for %iter = %c0 to %c1 step %c1 { - %src0 = pto.pset_b32 "PAT_ALL" : !pto.mask - %sel, %next = pto.plt_b32 %c13 : i32 -> !pto.mask, i32 - %out = pto.psel %src0, %sel, %sel : !pto.mask, !pto.mask, !pto.mask -> !pto.mask - pto.psts %out, %ub_out[%c0], "NORM" : !pto.mask, !pto.ptr, index - %out_next = pto.psel %sel, %src0, %sel : !pto.mask, !pto.mask, !pto.mask -> !pto.mask - pto.psts %out_next, %ub_out[%c32], "NORM" : !pto.mask, !pto.ptr, index - } - } 
- - pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.mte_ub_gm %ub_out, %gm_out, %c64_i64 - nburst(%c1_i64, %c64_i64, %c64_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64 - pto.barrier #pto.pipe - return - } -} diff --git a/test/vpto/cases/micro-op/materialization-predicate/psel-tail-predicate/launch.cpp b/test/vpto/cases/micro-op/materialization-predicate/psel-tail-predicate/launch.cpp deleted file mode 100644 index e4c8692cf..000000000 --- a/test/vpto/cases/micro-op/materialization-predicate/psel-tail-predicate/launch.cpp +++ /dev/null @@ -1,48 +0,0 @@ -// Copyright (c) 2026 Huawei Technologies Co., Ltd. -// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). -// Please refer to the License for details. You may not use this file except in compliance with the License. -// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -// See LICENSE in the root of the software repository for the full text of the License. 
- -// ----------------------------------------------------------------------------- -// case: micro-op/materialization-predicate/psel-tail-predicate -// family: materialization-predicate -// target_ops: pto.psel -// scenarios: predicate-transform, predicate-select, tail-mask -// --------------------------------------------------------------------------- -// PTOAS compatibility layer -#ifndef __VEC_SCOPE__ -#define __VEC_SCOPE__ -#endif - -#if defined(__CCE_AICORE__) && defined(__NPU_ARCH__) && (__NPU_ARCH__ == 2201) -typedef struct { unsigned char v; } hifloat8_t; -typedef struct { unsigned char v; } float8_e4m3_t; -typedef struct { unsigned char v; } float8_e5m2_t; -typedef struct { unsigned char v; } float8_e8m0_t; -typedef struct { unsigned char v; } float4_e1m2x2_t; -typedef struct { unsigned char v; } float4_e2m1x2_t; -#endif -#include -#if defined(__CCE_AICORE__) && defined(PTOAS_ENABLE_CCE_PRINT) -#include -#endif -#if !defined(__CCE_AICORE__) && !defined(TMRGSORT_HPP) -struct MrgSortExecutedNumList { - uint16_t mrgSortList0; - uint16_t mrgSortList1; - uint16_t mrgSortList2; - uint16_t mrgSortList3; -}; -#endif -#ifndef __CPU_SIM -#include "acl/acl.h" -#endif - -extern "C" __global__ [aicore] void psel_tail_kernel_2d(__gm__ uint32_t *v1); - -void LaunchPselTailPredicate(uint32_t *v1, void *stream) { - psel_tail_kernel_2d<<<1, nullptr, stream>>>((__gm__ uint32_t *)v1); -} diff --git a/test/vpto/cases/micro-op/materialization-predicate/psel-tail-predicate/main.cpp b/test/vpto/cases/micro-op/materialization-predicate/psel-tail-predicate/main.cpp deleted file mode 100644 index 5a5996be6..000000000 --- a/test/vpto/cases/micro-op/materialization-predicate/psel-tail-predicate/main.cpp +++ /dev/null @@ -1,104 +0,0 @@ -// Copyright (c) 2026 Huawei Technologies Co., Ltd. -// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). 
-// Please refer to the License for details. You may not use this file except in compliance with the License. -// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -// See LICENSE in the root of the software repository for the full text of the License. - -// ----------------------------------------------------------------------------- -// case: micro-op/materialization-predicate/psel-tail-predicate -// family: materialization-predicate -// target_ops: pto.psel -// scenarios: predicate-transform, predicate-select, tail-mask -/** -Copyright (c) 2025 Huawei Technologies Co., Ltd. -This program is free software, you can redistribute it and/or modify it under the terms and conditions of -CANN Open Software License Agreement Version 2.0 (the "License"). -Please refer to the License for details. You may not use this file except in compliance with the License. -THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -See LICENSE in the root of the software repository for the full text of the License. 
-*/ - -#include "test_common.h" -#include "acl/acl.h" -#include -#include -#include - -using namespace PtoTestCommon; - -#ifndef TMRGSORT_HPP -struct MrgSortExecutedNumList { - uint16_t mrgSortList0; - uint16_t mrgSortList1; - uint16_t mrgSortList2; - uint16_t mrgSortList3; -}; -#endif - -#define ACL_CHECK(expr) do { const aclError _ret = (expr); if (_ret != ACL_SUCCESS) { std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", #expr, (int)_ret, __FILE__, __LINE__); const char *_recent = aclGetRecentErrMsg(); if (_recent != nullptr && _recent[0] != '\0') { std::fprintf(stderr, "[ERROR] RecentErrMsg: %s\n", _recent); } rc = 1; goto cleanup; } } while (0) - -void LaunchPselTailPredicate(uint32_t *v1, void *stream); - -int main() { - size_t elemCount_v1 = 32; - size_t fileSize_v1 = elemCount_v1 * sizeof(uint32_t); - uint32_t *v1Host = nullptr; - uint32_t *v1Device = nullptr; - - int rc = 0; - bool aclInited = false; - bool deviceSet = false; - int deviceId = 0; - aclrtStream stream = nullptr; - - ACL_CHECK(aclInit(nullptr)); - aclInited = true; - if (const char *envDevice = std::getenv("ACL_DEVICE_ID")) { - deviceId = std::atoi(envDevice); - } - ACL_CHECK(aclrtSetDevice(deviceId)); - deviceSet = true; - ACL_CHECK(aclrtCreateStream(&stream)); - - ACL_CHECK(aclrtMallocHost((void **)(&v1Host), fileSize_v1)); - ACL_CHECK(aclrtMalloc((void **)&v1Device, fileSize_v1, ACL_MEM_MALLOC_HUGE_FIRST)); - - ReadFile("./v1.bin", fileSize_v1, v1Host, fileSize_v1); - ACL_CHECK(aclrtMemcpy(v1Device, fileSize_v1, v1Host, fileSize_v1, ACL_MEMCPY_HOST_TO_DEVICE)); - LaunchPselTailPredicate(v1Device, stream); - - ACL_CHECK(aclrtSynchronizeStream(stream)); - ACL_CHECK(aclrtMemcpy(v1Host, fileSize_v1, v1Device, fileSize_v1, ACL_MEMCPY_DEVICE_TO_HOST)); - WriteFile("./v1.bin", v1Host, fileSize_v1); - -cleanup: - aclrtFree(v1Device); - aclrtFreeHost(v1Host); - if (stream != nullptr) { - const aclError _ret = aclrtDestroyStream(stream); - if (_ret != ACL_SUCCESS) { - std::fprintf(stderr, 
"[ERROR] %s failed: %d (%s:%d)\n", - "aclrtDestroyStream(stream)", (int)_ret, __FILE__, __LINE__); - } - stream = nullptr; - } - if (deviceSet) { - const aclError _ret = aclrtResetDevice(deviceId); - if (_ret != ACL_SUCCESS) { - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", - "aclrtResetDevice(deviceId)", (int)_ret, __FILE__, __LINE__); - } - } - if (aclInited) { - const aclError _ret = aclFinalize(); - if (_ret != ACL_SUCCESS) { - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", - "aclFinalize()", (int)_ret, __FILE__, __LINE__); - } - } - - return rc; -} diff --git a/test/vpto/cases/micro-op/materialization-predicate/psel/compare.py b/test/vpto/cases/micro-op/materialization-predicate/psel/compare.py deleted file mode 100755 index fb258a13e..000000000 --- a/test/vpto/cases/micro-op/materialization-predicate/psel/compare.py +++ /dev/null @@ -1,64 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) 2026 Huawei Technologies Co., Ltd. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. 
- -# case: micro-op/materialization-predicate/psel -# family: materialization-predicate -# target_ops: pto.psel -# scenarios: predicate-transform, predicate-select -# coding=utf-8 - -import os -import sys - -import numpy as np - - -EXPECTED_WORDS = 32 -PREFIX_WORDS = 8 - - -def compare_words(golden_path, output_path): - if not os.path.exists(output_path) or not os.path.exists(golden_path): - print(f"[ERROR] Missing file: golden={golden_path} out={output_path}") - return False - golden = np.fromfile(golden_path, dtype=np.uint32) - output = np.fromfile(output_path, dtype=np.uint32) - if golden.size != EXPECTED_WORDS or output.size != EXPECTED_WORDS: - print( - f"[ERROR] Unexpected word count: golden={golden.size} " - f"out={output.size} expected={EXPECTED_WORDS}" - ) - return False - golden = golden[:PREFIX_WORDS] - output = output[:PREFIX_WORDS] - if not np.array_equal(golden, output): - diff = np.nonzero(golden != output)[0] - idx = int(diff[0]) if diff.size else 0 - print( - f"[ERROR] Mismatch (packed predicate words): idx={idx} " - f"golden={int(golden[idx])} out={int(output[idx])}" - ) - return False - return True - - -def main(): - strict = os.getenv("COMPARE_STRICT", "1") != "0" - ok = compare_words("golden_v1.bin", "v1.bin") - if not ok: - if strict: - print("[ERROR] compare failed") - sys.exit(2) - print("[WARN] compare failed (non-gating)") - return - print("[INFO] compare passed") - - -if __name__ == "__main__": - main() diff --git a/test/vpto/cases/micro-op/materialization-predicate/psel/golden.py b/test/vpto/cases/micro-op/materialization-predicate/psel/golden.py deleted file mode 100755 index 101269c58..000000000 --- a/test/vpto/cases/micro-op/materialization-predicate/psel/golden.py +++ /dev/null @@ -1,49 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) 2026 Huawei Technologies Co., Ltd. 
-# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. - -# case: micro-op/materialization-predicate/psel -# family: materialization-predicate -# target_ops: pto.psel -# scenarios: predicate-transform, predicate-select -# coding=utf-8 - -import argparse -from pathlib import Path - -import numpy as np - - -SEED = 19 -OUTPUT_WORDS = 32 -GOLDEN_PREFIX_WORDS = np.array([286331153, 69905, 0, 0, 0, 0, 0, 0], dtype=np.uint32) - - -def generate(output_dir: Path, seed: int) -> None: - del seed - output_init = np.zeros((OUTPUT_WORDS,), dtype=np.uint32) - golden = np.zeros((OUTPUT_WORDS,), dtype=np.uint32) - golden[: GOLDEN_PREFIX_WORDS.size] = GOLDEN_PREFIX_WORDS - - output_dir.mkdir(parents=True, exist_ok=True) - output_init.tofile(output_dir / "v1.bin") - golden.tofile(output_dir / "golden_v1.bin") - - -def main() -> None: - parser = argparse.ArgumentParser( - description="Generate packed predicate golden for VPTO micro-op validation." 
- ) - parser.add_argument("--output-dir", type=Path, default=Path(".")) - parser.add_argument("--seed", type=int, default=SEED) - args = parser.parse_args() - generate(args.output_dir, args.seed) - - -if __name__ == "__main__": - main() diff --git a/test/vpto/cases/micro-op/materialization-predicate/psel/kernel.pto b/test/vpto/cases/micro-op/materialization-predicate/psel/kernel.pto deleted file mode 100644 index ebe2e5782..000000000 --- a/test/vpto/cases/micro-op/materialization-predicate/psel/kernel.pto +++ /dev/null @@ -1,37 +0,0 @@ -// ----------------------------------------------------------------------------- -// case: micro-op/materialization-predicate/psel -// family: materialization-predicate -// target_ops: pto.psel -// scenarios: predicate-transform, predicate-select -// ----------------------------------------------------------------------------- -module attributes {pto.target_arch = "a5", pto.kernel_kind = #pto.kernel_kind} { - func.func @psel_kernel_2d(%arg0: !pto.ptr) attributes {pto.kernel} { - %c0 = arith.constant 0 : index - %c1 = arith.constant 1 : index - %c32 = arith.constant 32 : index - %c0_i64 = arith.constant 0 : i64 - %c1_i64 = arith.constant 1 : i64 - %c32_i64 = arith.constant 32 : i64 - %c13 = arith.constant 13 : i32 - - %ub_out = pto.castptr %c0_i64 : i64 -> !pto.ptr - %gm_out = pto.castptr %arg0 : !pto.ptr -> !pto.ptr - - pto.vecscope { - scf.for %iter = %c0 to %c1 step %c1 { - %src0 = pto.pset_b32 "PAT_ALL" : !pto.mask - %sel, %next = pto.plt_b32 %c13 : i32 -> !pto.mask, i32 - %out = pto.psel %src0, %sel, %sel : !pto.mask, !pto.mask, !pto.mask -> !pto.mask - pto.psts %out, %ub_out[%c0], "NORM" : !pto.mask, !pto.ptr, index - } - } - - pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.mte_ub_gm %ub_out, %gm_out, %c32_i64 - nburst(%c1_i64, %c32_i64, %c32_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64 - pto.barrier #pto.pipe - return - } -} diff --git 
a/test/vpto/cases/micro-op/materialization-predicate/psel/launch.cpp b/test/vpto/cases/micro-op/materialization-predicate/psel/launch.cpp deleted file mode 100644 index 34e1641cb..000000000 --- a/test/vpto/cases/micro-op/materialization-predicate/psel/launch.cpp +++ /dev/null @@ -1,48 +0,0 @@ -// Copyright (c) 2026 Huawei Technologies Co., Ltd. -// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). -// Please refer to the License for details. You may not use this file except in compliance with the License. -// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -// See LICENSE in the root of the software repository for the full text of the License. - -// ----------------------------------------------------------------------------- -// case: micro-op/materialization-predicate/psel -// family: materialization-predicate -// target_ops: pto.psel -// scenarios: predicate-transform, predicate-select -// --------------------------------------------------------------------------- -// PTOAS compatibility layer -#ifndef __VEC_SCOPE__ -#define __VEC_SCOPE__ -#endif - -#if defined(__CCE_AICORE__) && defined(__NPU_ARCH__) && (__NPU_ARCH__ == 2201) -typedef struct { unsigned char v; } hifloat8_t; -typedef struct { unsigned char v; } float8_e4m3_t; -typedef struct { unsigned char v; } float8_e5m2_t; -typedef struct { unsigned char v; } float8_e8m0_t; -typedef struct { unsigned char v; } float4_e1m2x2_t; -typedef struct { unsigned char v; } float4_e2m1x2_t; -#endif -#include -#if defined(__CCE_AICORE__) && defined(PTOAS_ENABLE_CCE_PRINT) -#include -#endif -#if !defined(__CCE_AICORE__) && !defined(TMRGSORT_HPP) -struct MrgSortExecutedNumList { - uint16_t mrgSortList0; - uint16_t mrgSortList1; - uint16_t 
mrgSortList2; - uint16_t mrgSortList3; -}; -#endif -#ifndef __CPU_SIM -#include "acl/acl.h" -#endif - -extern "C" __global__ [aicore] void psel_kernel_2d(__gm__ uint32_t *v1); - -void LaunchPsel(uint32_t *v1, void *stream) { - psel_kernel_2d<<<1, nullptr, stream>>>((__gm__ uint32_t *)v1); -} diff --git a/test/vpto/cases/micro-op/materialization-predicate/psel/main.cpp b/test/vpto/cases/micro-op/materialization-predicate/psel/main.cpp deleted file mode 100644 index bfb6d3558..000000000 --- a/test/vpto/cases/micro-op/materialization-predicate/psel/main.cpp +++ /dev/null @@ -1,104 +0,0 @@ -// Copyright (c) 2026 Huawei Technologies Co., Ltd. -// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). -// Please refer to the License for details. You may not use this file except in compliance with the License. -// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -// See LICENSE in the root of the software repository for the full text of the License. - -// ----------------------------------------------------------------------------- -// case: micro-op/materialization-predicate/psel -// family: materialization-predicate -// target_ops: pto.psel -// scenarios: predicate-transform, predicate-select -/** -Copyright (c) 2025 Huawei Technologies Co., Ltd. -This program is free software, you can redistribute it and/or modify it under the terms and conditions of -CANN Open Software License Agreement Version 2.0 (the "License"). -Please refer to the License for details. You may not use this file except in compliance with the License. 
-THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -See LICENSE in the root of the software repository for the full text of the License. -*/ - -#include "test_common.h" -#include "acl/acl.h" -#include -#include -#include - -using namespace PtoTestCommon; - -#ifndef TMRGSORT_HPP -struct MrgSortExecutedNumList { - uint16_t mrgSortList0; - uint16_t mrgSortList1; - uint16_t mrgSortList2; - uint16_t mrgSortList3; -}; -#endif - -#define ACL_CHECK(expr) do { const aclError _ret = (expr); if (_ret != ACL_SUCCESS) { std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", #expr, (int)_ret, __FILE__, __LINE__); const char *_recent = aclGetRecentErrMsg(); if (_recent != nullptr && _recent[0] != '\0') { std::fprintf(stderr, "[ERROR] RecentErrMsg: %s\n", _recent); } rc = 1; goto cleanup; } } while (0) - -void LaunchPsel(uint32_t *v1, void *stream); - -int main() { - size_t elemCount_v1 = 32; - size_t fileSize_v1 = elemCount_v1 * sizeof(uint32_t); - uint32_t *v1Host = nullptr; - uint32_t *v1Device = nullptr; - - int rc = 0; - bool aclInited = false; - bool deviceSet = false; - int deviceId = 0; - aclrtStream stream = nullptr; - - ACL_CHECK(aclInit(nullptr)); - aclInited = true; - if (const char *envDevice = std::getenv("ACL_DEVICE_ID")) { - deviceId = std::atoi(envDevice); - } - ACL_CHECK(aclrtSetDevice(deviceId)); - deviceSet = true; - ACL_CHECK(aclrtCreateStream(&stream)); - - ACL_CHECK(aclrtMallocHost((void **)(&v1Host), fileSize_v1)); - ACL_CHECK(aclrtMalloc((void **)&v1Device, fileSize_v1, ACL_MEM_MALLOC_HUGE_FIRST)); - - ReadFile("./v1.bin", fileSize_v1, v1Host, fileSize_v1); - ACL_CHECK(aclrtMemcpy(v1Device, fileSize_v1, v1Host, fileSize_v1, ACL_MEMCPY_HOST_TO_DEVICE)); - LaunchPsel(v1Device, stream); - - ACL_CHECK(aclrtSynchronizeStream(stream)); - ACL_CHECK(aclrtMemcpy(v1Host, fileSize_v1, v1Device, fileSize_v1, 
ACL_MEMCPY_DEVICE_TO_HOST)); - WriteFile("./v1.bin", v1Host, fileSize_v1); - -cleanup: - aclrtFree(v1Device); - aclrtFreeHost(v1Host); - if (stream != nullptr) { - const aclError _ret = aclrtDestroyStream(stream); - if (_ret != ACL_SUCCESS) { - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", - "aclrtDestroyStream(stream)", (int)_ret, __FILE__, __LINE__); - } - stream = nullptr; - } - if (deviceSet) { - const aclError _ret = aclrtResetDevice(deviceId); - if (_ret != ACL_SUCCESS) { - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", - "aclrtResetDevice(deviceId)", (int)_ret, __FILE__, __LINE__); - } - } - if (aclInited) { - const aclError _ret = aclFinalize(); - if (_ret != ACL_SUCCESS) { - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", - "aclFinalize()", (int)_ret, __FILE__, __LINE__); - } - } - - return rc; -} diff --git a/test/vpto/cases/micro-op/materialization-predicate/pset-pattern-fragment/compare.py b/test/vpto/cases/micro-op/materialization-predicate/pset-pattern-fragment/compare.py deleted file mode 100755 index 2de1b2000..000000000 --- a/test/vpto/cases/micro-op/materialization-predicate/pset-pattern-fragment/compare.py +++ /dev/null @@ -1,64 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) 2026 Huawei Technologies Co., Ltd. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. 
- -# case: micro-op/materialization-predicate/pset-pattern-fragment -# family: materialization-predicate -# target_ops: pto.pset_b16, pto.pset_b32, pto.pset_b8 -# scenarios: pattern-mask, pat-vl, representative-logical-elements -# coding=utf-8 - -import os -import sys - -import numpy as np - - -EXPECTED_WORDS = 32 -PREFIX_WORDS = 24 - - -def compare_words(golden_path, output_path): - if not os.path.exists(output_path) or not os.path.exists(golden_path): - print(f"[ERROR] Missing file: golden={golden_path} out={output_path}") - return False - golden = np.fromfile(golden_path, dtype=np.uint32) - output = np.fromfile(output_path, dtype=np.uint32) - if golden.size != EXPECTED_WORDS or output.size != EXPECTED_WORDS: - print( - f"[ERROR] Unexpected word count: golden={golden.size} " - f"out={output.size} expected={EXPECTED_WORDS}" - ) - return False - golden = golden[:PREFIX_WORDS] - output = output[:PREFIX_WORDS] - if not np.array_equal(golden, output): - diff = np.nonzero(golden != output)[0] - idx = int(diff[0]) if diff.size else 0 - print( - f"[ERROR] Mismatch (packed predicate words): idx={idx} " - f"golden={int(golden[idx])} out={int(output[idx])}" - ) - return False - return True - - -def main(): - strict = os.getenv("COMPARE_STRICT", "1") != "0" - ok = compare_words("golden_v1.bin", "v1.bin") - if not ok: - if strict: - print("[ERROR] compare failed") - sys.exit(2) - print("[WARN] compare failed (non-gating)") - return - print("[INFO] compare passed") - - -if __name__ == "__main__": - main() diff --git a/test/vpto/cases/micro-op/materialization-predicate/pset-pattern-fragment/golden.py b/test/vpto/cases/micro-op/materialization-predicate/pset-pattern-fragment/golden.py deleted file mode 100755 index fcf402e5a..000000000 --- a/test/vpto/cases/micro-op/materialization-predicate/pset-pattern-fragment/golden.py +++ /dev/null @@ -1,49 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) 2026 Huawei Technologies Co., Ltd. 
-# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. - -# case: micro-op/materialization-predicate/pset-pattern-fragment -# family: materialization-predicate -# target_ops: pto.pset_b16, pto.pset_b32, pto.pset_b8 -# scenarios: pattern-mask, pat-vl, representative-logical-elements -# coding=utf-8 - -import argparse -from pathlib import Path - -import numpy as np - - -SEED = 19 -OUTPUT_WORDS = 32 -GOLDEN_PREFIX_WORDS = np.array([1227133513, 2454267026, 613566756, 1227133513, 2454267026, 613566756, 1227133513, 2454267026, 1431655765, 1431655765, 1431655765, 1431655765, 0, 0, 0, 0, 286331153, 286331153, 0, 0, 0, 0, 0, 0], dtype=np.uint32) - - -def generate(output_dir: Path, seed: int) -> None: - del seed - output_init = np.zeros((OUTPUT_WORDS,), dtype=np.uint32) - golden = np.zeros((OUTPUT_WORDS,), dtype=np.uint32) - golden[: GOLDEN_PREFIX_WORDS.size] = GOLDEN_PREFIX_WORDS - - output_dir.mkdir(parents=True, exist_ok=True) - output_init.tofile(output_dir / "v1.bin") - golden.tofile(output_dir / "golden_v1.bin") - - -def main() -> None: - parser = argparse.ArgumentParser( - description="Generate packed predicate golden for VPTO micro-op validation." 
- ) - parser.add_argument("--output-dir", type=Path, default=Path(".")) - parser.add_argument("--seed", type=int, default=SEED) - args = parser.parse_args() - generate(args.output_dir, args.seed) - - -if __name__ == "__main__": - main() diff --git a/test/vpto/cases/micro-op/materialization-predicate/pset-pattern-fragment/kernel.pto b/test/vpto/cases/micro-op/materialization-predicate/pset-pattern-fragment/kernel.pto deleted file mode 100644 index e98a0b62c..000000000 --- a/test/vpto/cases/micro-op/materialization-predicate/pset-pattern-fragment/kernel.pto +++ /dev/null @@ -1,39 +0,0 @@ -// ----------------------------------------------------------------------------- -// case: micro-op/materialization-predicate/pset-pattern-fragment -// family: materialization-predicate -// target_ops: pto.pset_b16, pto.pset_b32, pto.pset_b8 -// scenarios: pattern-mask, fragment-pattern -// ----------------------------------------------------------------------------- -module attributes {pto.target_arch = "a5", pto.kernel_kind = #pto.kernel_kind} { - func.func @pset_pattern_fragment_kernel_2d(%arg0: !pto.ptr) attributes {pto.kernel} { - %c0 = arith.constant 0 : index - %c1 = arith.constant 1 : index - %c32 = arith.constant 32 : index - %c64 = arith.constant 64 : index - %c0_i64 = arith.constant 0 : i64 - %c1_i64 = arith.constant 1 : i64 - %c96_i64 = arith.constant 96 : i64 - - %ub_out = pto.castptr %c0_i64 : i64 -> !pto.ptr - %gm_out = pto.castptr %arg0 : !pto.ptr -> !pto.ptr - - pto.vecscope { - scf.for %iter = %c0 to %c1 step %c1 { - %m0 = pto.pset_b8 "PAT_M3" : !pto.mask - %m1 = pto.pset_b16 "PAT_H" : !pto.mask - %m2 = pto.pset_b32 "PAT_Q" : !pto.mask - pto.psts %m0, %ub_out[%c0], "NORM" : !pto.mask, !pto.ptr, index - pto.psts %m1, %ub_out[%c32], "NORM" : !pto.mask, !pto.ptr, index - pto.psts %m2, %ub_out[%c64], "NORM" : !pto.mask, !pto.ptr, index - } - } - - pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.mte_ub_gm 
%ub_out, %gm_out, %c96_i64 - nburst(%c1_i64, %c96_i64, %c96_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64 - pto.barrier #pto.pipe - return - } -} diff --git a/test/vpto/cases/micro-op/materialization-predicate/pset-pattern-fragment/launch.cpp b/test/vpto/cases/micro-op/materialization-predicate/pset-pattern-fragment/launch.cpp deleted file mode 100644 index c74390013..000000000 --- a/test/vpto/cases/micro-op/materialization-predicate/pset-pattern-fragment/launch.cpp +++ /dev/null @@ -1,48 +0,0 @@ -// Copyright (c) 2026 Huawei Technologies Co., Ltd. -// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). -// Please refer to the License for details. You may not use this file except in compliance with the License. -// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -// See LICENSE in the root of the software repository for the full text of the License. 
- -// ----------------------------------------------------------------------------- -// case: micro-op/materialization-predicate/pset-pattern-fragment -// family: materialization-predicate -// target_ops: pto.pset_b16, pto.pset_b32, pto.pset_b8 -// scenarios: pattern-mask, fragment-pattern -// --------------------------------------------------------------------------- -// PTOAS compatibility layer -#ifndef __VEC_SCOPE__ -#define __VEC_SCOPE__ -#endif - -#if defined(__CCE_AICORE__) && defined(__NPU_ARCH__) && (__NPU_ARCH__ == 2201) -typedef struct { unsigned char v; } hifloat8_t; -typedef struct { unsigned char v; } float8_e4m3_t; -typedef struct { unsigned char v; } float8_e5m2_t; -typedef struct { unsigned char v; } float8_e8m0_t; -typedef struct { unsigned char v; } float4_e1m2x2_t; -typedef struct { unsigned char v; } float4_e2m1x2_t; -#endif -#include -#if defined(__CCE_AICORE__) && defined(PTOAS_ENABLE_CCE_PRINT) -#include -#endif -#if !defined(__CCE_AICORE__) && !defined(TMRGSORT_HPP) -struct MrgSortExecutedNumList { - uint16_t mrgSortList0; - uint16_t mrgSortList1; - uint16_t mrgSortList2; - uint16_t mrgSortList3; -}; -#endif -#ifndef __CPU_SIM -#include "acl/acl.h" -#endif - -extern "C" __global__ [aicore] void pset_pattern_fragment_kernel_2d(__gm__ uint32_t *v1); - -void LaunchPsetPatternFragment(uint32_t *v1, void *stream) { - pset_pattern_fragment_kernel_2d<<<1, nullptr, stream>>>((__gm__ uint32_t *)v1); -} diff --git a/test/vpto/cases/micro-op/materialization-predicate/pset-pattern-fragment/main.cpp b/test/vpto/cases/micro-op/materialization-predicate/pset-pattern-fragment/main.cpp deleted file mode 100644 index 116e3e7ae..000000000 --- a/test/vpto/cases/micro-op/materialization-predicate/pset-pattern-fragment/main.cpp +++ /dev/null @@ -1,104 +0,0 @@ -// Copyright (c) 2026 Huawei Technologies Co., Ltd. 
-// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). -// Please refer to the License for details. You may not use this file except in compliance with the License. -// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -// See LICENSE in the root of the software repository for the full text of the License. - -// ----------------------------------------------------------------------------- -// case: micro-op/materialization-predicate/pset-pattern-fragment -// family: materialization-predicate -// target_ops: pto.pset_b16, pto.pset_b32, pto.pset_b8 -// scenarios: pattern-mask, fragment-pattern -/** -Copyright (c) 2025 Huawei Technologies Co., Ltd. -This program is free software, you can redistribute it and/or modify it under the terms and conditions of -CANN Open Software License Agreement Version 2.0 (the "License"). -Please refer to the License for details. You may not use this file except in compliance with the License. -THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -See LICENSE in the root of the software repository for the full text of the License. 
-*/ - -#include "test_common.h" -#include "acl/acl.h" -#include -#include -#include - -using namespace PtoTestCommon; - -#ifndef TMRGSORT_HPP -struct MrgSortExecutedNumList { - uint16_t mrgSortList0; - uint16_t mrgSortList1; - uint16_t mrgSortList2; - uint16_t mrgSortList3; -}; -#endif - -#define ACL_CHECK(expr) do { const aclError _ret = (expr); if (_ret != ACL_SUCCESS) { std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", #expr, (int)_ret, __FILE__, __LINE__); const char *_recent = aclGetRecentErrMsg(); if (_recent != nullptr && _recent[0] != '\0') { std::fprintf(stderr, "[ERROR] RecentErrMsg: %s\n", _recent); } rc = 1; goto cleanup; } } while (0) - -void LaunchPsetPatternFragment(uint32_t *v1, void *stream); - -int main() { - size_t elemCount_v1 = 32; - size_t fileSize_v1 = elemCount_v1 * sizeof(uint32_t); - uint32_t *v1Host = nullptr; - uint32_t *v1Device = nullptr; - - int rc = 0; - bool aclInited = false; - bool deviceSet = false; - int deviceId = 0; - aclrtStream stream = nullptr; - - ACL_CHECK(aclInit(nullptr)); - aclInited = true; - if (const char *envDevice = std::getenv("ACL_DEVICE_ID")) { - deviceId = std::atoi(envDevice); - } - ACL_CHECK(aclrtSetDevice(deviceId)); - deviceSet = true; - ACL_CHECK(aclrtCreateStream(&stream)); - - ACL_CHECK(aclrtMallocHost((void **)(&v1Host), fileSize_v1)); - ACL_CHECK(aclrtMalloc((void **)&v1Device, fileSize_v1, ACL_MEM_MALLOC_HUGE_FIRST)); - - ReadFile("./v1.bin", fileSize_v1, v1Host, fileSize_v1); - ACL_CHECK(aclrtMemcpy(v1Device, fileSize_v1, v1Host, fileSize_v1, ACL_MEMCPY_HOST_TO_DEVICE)); - LaunchPsetPatternFragment(v1Device, stream); - - ACL_CHECK(aclrtSynchronizeStream(stream)); - ACL_CHECK(aclrtMemcpy(v1Host, fileSize_v1, v1Device, fileSize_v1, ACL_MEMCPY_DEVICE_TO_HOST)); - WriteFile("./v1.bin", v1Host, fileSize_v1); - -cleanup: - aclrtFree(v1Device); - aclrtFreeHost(v1Host); - if (stream != nullptr) { - const aclError _ret = aclrtDestroyStream(stream); - if (_ret != ACL_SUCCESS) { - std::fprintf(stderr, 
"[ERROR] %s failed: %d (%s:%d)\n", - "aclrtDestroyStream(stream)", (int)_ret, __FILE__, __LINE__); - } - stream = nullptr; - } - if (deviceSet) { - const aclError _ret = aclrtResetDevice(deviceId); - if (_ret != ACL_SUCCESS) { - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", - "aclrtResetDevice(deviceId)", (int)_ret, __FILE__, __LINE__); - } - } - if (aclInited) { - const aclError _ret = aclFinalize(); - if (_ret != ACL_SUCCESS) { - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", - "aclFinalize()", (int)_ret, __FILE__, __LINE__); - } - } - - return rc; -} diff --git a/test/vpto/cases/micro-op/materialization-predicate/pset-pattern/compare.py b/test/vpto/cases/micro-op/materialization-predicate/pset-pattern/compare.py deleted file mode 100755 index 10290abc6..000000000 --- a/test/vpto/cases/micro-op/materialization-predicate/pset-pattern/compare.py +++ /dev/null @@ -1,60 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) 2026 Huawei Technologies Co., Ltd. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. - -# case: micro-op/materialization-predicate/pset-pattern -# family: materialization-predicate -# target_ops: pto.pset_b16, pto.pset_b32, pto.pset_b8 -# scenarios: pattern-mask, pat-all, pat-vl -# NOTE: bulk-generated coverage skeleton. 
-# coding=utf-8 - -import os -import sys -import numpy as np - -EXPECTED_WORDS = 24 - - -def compare_packed_words(golden_path, output_path): - if not os.path.exists(golden_path) or not os.path.exists(output_path): - print(f"[ERROR] Missing file: golden={golden_path} out={output_path}") - return False - golden = np.fromfile(golden_path, dtype=np.uint32) - output = np.fromfile(output_path, dtype=np.uint32) - if golden.size != EXPECTED_WORDS or output.size != EXPECTED_WORDS: - print( - f"[ERROR] Unexpected word count: golden={golden.size} " - f"out={output.size} expected={EXPECTED_WORDS}" - ) - return False - if not np.array_equal(golden, output): - diff = np.nonzero(golden != output)[0] - idx = int(diff[0]) if diff.size else 0 - print( - f"[ERROR] Mismatch (packed predicate words): idx={idx} " - f"golden={int(golden[idx])} out={int(output[idx])}" - ) - return False - return True - - -def main(): - strict = os.getenv("COMPARE_STRICT", "1") != "0" - ok = compare_packed_words("golden_v1.bin", "v1.bin") - if not ok: - if strict: - print("[ERROR] compare failed") - sys.exit(2) - print("[WARN] compare failed (non-gating)") - return - print("[INFO] compare passed") - - -if __name__ == "__main__": - main() diff --git a/test/vpto/cases/micro-op/materialization-predicate/pset-pattern/golden.py b/test/vpto/cases/micro-op/materialization-predicate/pset-pattern/golden.py deleted file mode 100755 index dcc083810..000000000 --- a/test/vpto/cases/micro-op/materialization-predicate/pset-pattern/golden.py +++ /dev/null @@ -1,63 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) 2026 Huawei Technologies Co., Ltd. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. 
-# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. - -# case: micro-op/materialization-predicate/pset-pattern -# family: materialization-predicate -# target_ops: pto.pset_b16, pto.pset_b32, pto.pset_b8 -# scenarios: pattern-mask, pat-all, pat-vl -# NOTE: bulk-generated coverage skeleton. -# coding=utf-8 - -import argparse -from pathlib import Path - -import numpy as np - - -SEED = 19 -OUTPUT_WORDS = 24 - - -def _pack_pset_prefix(active_lanes: int, bit_stride: int, store_bytes: int) -> np.ndarray: - out = np.zeros((store_bytes,), dtype=np.uint8) - for lane in range(active_lanes): - bit_index = lane * bit_stride - byte_index = bit_index // 8 - bit_in_byte = bit_index % 8 - out[byte_index] |= np.uint8(1 << bit_in_byte) - return out - - -def generate(output_dir: Path, seed: int) -> None: - del seed - output_init = np.zeros((OUTPUT_WORDS,), dtype=np.uint32) - - out = np.zeros((OUTPUT_WORDS * 4,), dtype=np.uint8) - out[0:32] = _pack_pset_prefix(active_lanes=256, bit_stride=1, store_bytes=32) - out[32:48] = _pack_pset_prefix(active_lanes=8, bit_stride=2, store_bytes=16) - out[64:80] = _pack_pset_prefix(active_lanes=16, bit_stride=4, store_bytes=16) - golden = out.view(np.uint32) - - output_dir.mkdir(parents=True, exist_ok=True) - output_init.tofile(output_dir / "v1.bin") - golden.tofile(output_dir / "golden_v1.bin") - - -def main() -> None: - parser = argparse.ArgumentParser( - description="Generate packed predicate golden for VPTO micro-op pset-pattern validation." 
- ) - parser.add_argument("--output-dir", type=Path, default=Path(".")) - parser.add_argument("--seed", type=int, default=SEED) - args = parser.parse_args() - generate(args.output_dir, args.seed) - - -if __name__ == "__main__": - main() diff --git a/test/vpto/cases/micro-op/materialization-predicate/pset-pattern/kernel.pto b/test/vpto/cases/micro-op/materialization-predicate/pset-pattern/kernel.pto deleted file mode 100644 index 42c94f782..000000000 --- a/test/vpto/cases/micro-op/materialization-predicate/pset-pattern/kernel.pto +++ /dev/null @@ -1,39 +0,0 @@ -// ----------------------------------------------------------------------------- -// case: micro-op/materialization-predicate/pset-pattern -// family: materialization-predicate -// target_ops: pto.pset_b16, pto.pset_b32, pto.pset_b8 -// scenarios: pattern-mask, pat-all, pat-vl -// ----------------------------------------------------------------------------- -module attributes {pto.target_arch = "a5", pto.kernel_kind = #pto.kernel_kind} { - func.func @pset_pattern_kernel_2d(%arg0: !pto.ptr) attributes {pto.kernel} { - %c0 = arith.constant 0 : index - %c1 = arith.constant 1 : index - %c32 = arith.constant 32 : index - %c64 = arith.constant 64 : index - %c0_i64 = arith.constant 0 : i64 - %c1_i64 = arith.constant 1 : i64 - %c96_i64 = arith.constant 96 : i64 - - %ub_out = pto.castptr %c0_i64 : i64 -> !pto.ptr - %gm_out = pto.castptr %arg0 : !pto.ptr -> !pto.ptr - - pto.vecscope { - scf.for %iter = %c0 to %c1 step %c1 { - %m0 = pto.pset_b8 "PAT_ALL" : !pto.mask - %m1 = pto.pset_b16 "PAT_VL8" : !pto.mask - %m2 = pto.pset_b32 "PAT_VL16" : !pto.mask - pto.psts %m0, %ub_out[%c0], "NORM" : !pto.mask, !pto.ptr, index - pto.psts %m1, %ub_out[%c32], "NORM" : !pto.mask, !pto.ptr, index - pto.psts %m2, %ub_out[%c64], "NORM" : !pto.mask, !pto.ptr, index - } - } - - pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.mte_ub_gm %ub_out, %gm_out, %c96_i64 - nburst(%c1_i64, 
%c96_i64, %c96_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64 - pto.barrier #pto.pipe - return - } -} diff --git a/test/vpto/cases/micro-op/materialization-predicate/pset-pattern/launch.cpp b/test/vpto/cases/micro-op/materialization-predicate/pset-pattern/launch.cpp deleted file mode 100644 index 01ec8b624..000000000 --- a/test/vpto/cases/micro-op/materialization-predicate/pset-pattern/launch.cpp +++ /dev/null @@ -1,69 +0,0 @@ -// Copyright (c) 2026 Huawei Technologies Co., Ltd. -// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). -// Please refer to the License for details. You may not use this file except in compliance with the License. -// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -// See LICENSE in the root of the software repository for the full text of the License. - -// ----------------------------------------------------------------------------- -// case: micro-op/materialization-predicate/pset-pattern -// family: materialization-predicate -// target_ops: pto.pset_b16, pto.pset_b32, pto.pset_b8 -// scenarios: pattern-mask, pat-all, pat-vl -// NOTE: bulk-generated coverage skeleton. Parser/verifier/lowering failure is -// still a valid test conclusion in the current coverage-first phase. -// ----------------------------------------------------------------------------- -// --------------------------------------------------------------------------- -// PTOAS compatibility layer -// -// The upstream pto-isa headers reference some FP8/FP4 types and the -// __VEC_SCOPE__ marker that are not available on every AICore arch/toolchain -// combination (e.g. __NPU_ARCH__==2201). 
-// -// For our PTOAS-generated kernels we don't rely on these types today, but the -// headers still mention them in templates/static_asserts. Provide minimal -// fallbacks to keep compilation working on dav-c220. -// --------------------------------------------------------------------------- -#ifndef __VEC_SCOPE__ -#define __VEC_SCOPE__ -#endif - -#if defined(__CCE_AICORE__) && defined(__NPU_ARCH__) && (__NPU_ARCH__ == 2201) -typedef struct { unsigned char v; } hifloat8_t; -typedef struct { unsigned char v; } float8_e4m3_t; -typedef struct { unsigned char v; } float8_e5m2_t; -typedef struct { unsigned char v; } float8_e8m0_t; -typedef struct { unsigned char v; } float4_e1m2x2_t; -typedef struct { unsigned char v; } float4_e2m1x2_t; -#endif -#include - -// AICore printf support is gated behind `--cce-enable-print` on some -// toolchains. When enabled, include the CCE print header so `cce::printf` -// resolves in device compilation. -#if defined(__CCE_AICORE__) && defined(PTOAS_ENABLE_CCE_PRINT) -#include -#endif - -// Some PTO-ISA types are only available in the __CCE_AICORE__ compilation -// path, but `bisheng -xcce` still performs a host-side parse pass. -// Provide minimal fallbacks only when the corresponding header wasn't -// pulled in by the selected arch implementation. 
-#if !defined(__CCE_AICORE__) && !defined(TMRGSORT_HPP) -struct MrgSortExecutedNumList { - uint16_t mrgSortList0; - uint16_t mrgSortList1; - uint16_t mrgSortList2; - uint16_t mrgSortList3; -}; -#endif -#ifndef __CPU_SIM -#include "acl/acl.h" -#endif - -extern "C" __global__ [aicore] void pset_pattern_kernel_2d(__gm__ uint32_t *v1); - -void LaunchPset_pattern_kernel_2d(uint32_t *v1, void *stream) { - pset_pattern_kernel_2d<<<1, nullptr, stream>>>((__gm__ uint32_t *)v1); -} diff --git a/test/vpto/cases/micro-op/materialization-predicate/pset-pattern/main.cpp b/test/vpto/cases/micro-op/materialization-predicate/pset-pattern/main.cpp deleted file mode 100644 index 15a7b4181..000000000 --- a/test/vpto/cases/micro-op/materialization-predicate/pset-pattern/main.cpp +++ /dev/null @@ -1,119 +0,0 @@ -// Copyright (c) 2026 Huawei Technologies Co., Ltd. -// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). -// Please refer to the License for details. You may not use this file except in compliance with the License. -// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -// See LICENSE in the root of the software repository for the full text of the License. - -// ----------------------------------------------------------------------------- -// case: micro-op/materialization-predicate/pset-pattern -// family: materialization-predicate -// target_ops: pto.pset_b16, pto.pset_b32, pto.pset_b8 -// scenarios: pattern-mask, pat-all, pat-vl -// NOTE: bulk-generated coverage skeleton. Parser/verifier/lowering failure is -// still a valid test conclusion in the current coverage-first phase. 
-// ----------------------------------------------------------------------------- -/** -Copyright (c) 2025 Huawei Technologies Co., Ltd. -This program is free software, you can redistribute it and/or modify it under the terms and conditions of -CANN Open Software License Agreement Version 2.0 (the "License"). -Please refer to the License for details. You may not use this file except in compliance with the License. -THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -See LICENSE in the root of the software repository for the full text of the License. -*/ - -#include "test_common.h" -#include "acl/acl.h" -#include -#include -#include - -using namespace PtoTestCommon; - -#ifndef TMRGSORT_HPP -struct MrgSortExecutedNumList { - uint16_t mrgSortList0; - uint16_t mrgSortList1; - uint16_t mrgSortList2; - uint16_t mrgSortList3; -}; -#endif - -#define ACL_CHECK(expr) \ - do { \ - const aclError _ret = (expr); \ - if (_ret != ACL_SUCCESS) { \ - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", #expr, (int)_ret, __FILE__, __LINE__); \ - const char *_recent = aclGetRecentErrMsg(); \ - if (_recent != nullptr && _recent[0] != '\0') { \ - std::fprintf(stderr, "[ERROR] RecentErrMsg: %s\n", _recent); \ - } \ - rc = 1; \ - goto cleanup; \ - } \ - } while (0) - -void LaunchPset_pattern_kernel_2d(uint32_t *v1, void *stream); - -int main() { - size_t elemCount_v1 = 24; - size_t fileSize_v1 = elemCount_v1 * sizeof(uint32_t); - uint32_t *v1Host = nullptr; - uint32_t *v1Device = nullptr; - - int rc = 0; - bool aclInited = false; - bool deviceSet = false; - int deviceId = 0; - aclrtStream stream = nullptr; - - ACL_CHECK(aclInit(nullptr)); - aclInited = true; - if (const char *envDevice = std::getenv("ACL_DEVICE_ID")) { - deviceId = std::atoi(envDevice); - } - ACL_CHECK(aclrtSetDevice(deviceId)); - deviceSet = true; - 
ACL_CHECK(aclrtCreateStream(&stream)); - - ACL_CHECK(aclrtMallocHost((void **)(&v1Host), fileSize_v1)); - ACL_CHECK(aclrtMalloc((void **)&v1Device, fileSize_v1, ACL_MEM_MALLOC_HUGE_FIRST)); - - ReadFile("./v1.bin", fileSize_v1, v1Host, fileSize_v1); - ACL_CHECK(aclrtMemcpy(v1Device, fileSize_v1, v1Host, fileSize_v1, ACL_MEMCPY_HOST_TO_DEVICE)); - LaunchPset_pattern_kernel_2d(v1Device, stream); - - ACL_CHECK(aclrtSynchronizeStream(stream)); - ACL_CHECK(aclrtMemcpy(v1Host, fileSize_v1, v1Device, fileSize_v1, ACL_MEMCPY_DEVICE_TO_HOST)); - WriteFile("./v1.bin", v1Host, fileSize_v1); - -cleanup: - aclrtFree(v1Device); - aclrtFreeHost(v1Host); - if (stream != nullptr) { - const aclError _ret = aclrtDestroyStream(stream); - if (_ret != ACL_SUCCESS) { - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", - "aclrtDestroyStream(stream)", (int)_ret, __FILE__, __LINE__); - } - stream = nullptr; - } - if (deviceSet) { - const aclError _ret = aclrtResetDevice(deviceId); - if (_ret != ACL_SUCCESS) { - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", - "aclrtResetDevice(deviceId)", (int)_ret, __FILE__, __LINE__); - } - } - if (aclInited) { - const aclError _ret = aclFinalize(); - if (_ret != ACL_SUCCESS) { - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", - "aclFinalize()", (int)_ret, __FILE__, __LINE__); - } - } - - return rc; -} diff --git a/test/vpto/cases/micro-op/materialization-predicate/pxor/compare.py b/test/vpto/cases/micro-op/materialization-predicate/pxor/compare.py deleted file mode 100755 index 0652ae4a5..000000000 --- a/test/vpto/cases/micro-op/materialization-predicate/pxor/compare.py +++ /dev/null @@ -1,64 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) 2026 Huawei Technologies Co., Ltd. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. 
You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. - -# case: micro-op/materialization-predicate/pxor -# family: materialization-predicate -# target_ops: pto.pxor -# scenarios: predicate-transform, logical-xor -# coding=utf-8 - -import os -import sys - -import numpy as np - - -EXPECTED_WORDS = 32 -PREFIX_WORDS = 8 - - -def compare_words(golden_path, output_path): - if not os.path.exists(output_path) or not os.path.exists(golden_path): - print(f"[ERROR] Missing file: golden={golden_path} out={output_path}") - return False - golden = np.fromfile(golden_path, dtype=np.uint32) - output = np.fromfile(output_path, dtype=np.uint32) - if golden.size != EXPECTED_WORDS or output.size != EXPECTED_WORDS: - print( - f"[ERROR] Unexpected word count: golden={golden.size} " - f"out={output.size} expected={EXPECTED_WORDS}" - ) - return False - golden = golden[:PREFIX_WORDS] - output = output[:PREFIX_WORDS] - if not np.array_equal(golden, output): - diff = np.nonzero(golden != output)[0] - idx = int(diff[0]) if diff.size else 0 - print( - f"[ERROR] Mismatch (packed predicate words): idx={idx} " - f"golden={int(golden[idx])} out={int(output[idx])}" - ) - return False - return True - - -def main(): - strict = os.getenv("COMPARE_STRICT", "1") != "0" - ok = compare_words("golden_v1.bin", "v1.bin") - if not ok: - if strict: - print("[ERROR] compare failed") - sys.exit(2) - print("[WARN] compare failed (non-gating)") - return - print("[INFO] compare passed") - - -if __name__ == "__main__": - main() diff --git a/test/vpto/cases/micro-op/materialization-predicate/pxor/golden.py b/test/vpto/cases/micro-op/materialization-predicate/pxor/golden.py deleted file mode 100755 index 
16f212335..000000000 --- a/test/vpto/cases/micro-op/materialization-predicate/pxor/golden.py +++ /dev/null @@ -1,62 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) 2026 Huawei Technologies Co., Ltd. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. - -# case: micro-op/materialization-predicate/pxor -# family: materialization-predicate -# target_ops: pto.pxor -# scenarios: predicate-transform, logical-xor -# coding=utf-8 - -import argparse -from pathlib import Path - -import numpy as np - - -SEED = 19 -OUTPUT_WORDS = 32 -PREFIX_BITS = 13 -SUFFIX_BITS = 7 -PREDICATE_BITS = 256 -NIBBLE_COUNT = PREDICATE_BITS // 2 - - -def pack_nibbles(nibbles: np.ndarray) -> np.ndarray: - words = np.zeros((OUTPUT_WORDS,), dtype=np.uint32) - for idx, nibble in enumerate(nibbles): - words[idx // 8] |= np.uint32(int(nibble) & 0xF) << np.uint32((idx % 8) * 4) - return words - - -def generate(output_dir: Path, seed: int) -> None: - del seed - output_init = np.zeros((OUTPUT_WORDS,), dtype=np.uint32) - lhs = np.zeros((NIBBLE_COUNT,), dtype=np.uint8) - rhs = np.zeros((NIBBLE_COUNT,), dtype=np.uint8) - lhs[:PREFIX_BITS] = 1 - rhs[:SUFFIX_BITS] = 1 - golden = pack_nibbles(np.bitwise_xor(lhs, rhs)) - - output_dir.mkdir(parents=True, exist_ok=True) - output_init.tofile(output_dir / "v1.bin") - golden.tofile(output_dir / "golden_v1.bin") - - -def main() -> None: - parser = argparse.ArgumentParser( - description="Generate packed predicate golden for VPTO micro-op 
validation." - ) - parser.add_argument("--output-dir", type=Path, default=Path(".")) - parser.add_argument("--seed", type=int, default=SEED) - args = parser.parse_args() - generate(args.output_dir, args.seed) - - -if __name__ == "__main__": - main() diff --git a/test/vpto/cases/micro-op/materialization-predicate/pxor/kernel.pto b/test/vpto/cases/micro-op/materialization-predicate/pxor/kernel.pto deleted file mode 100644 index e986c3e74..000000000 --- a/test/vpto/cases/micro-op/materialization-predicate/pxor/kernel.pto +++ /dev/null @@ -1,56 +0,0 @@ -// ----------------------------------------------------------------------------- -// case: micro-op/materialization-predicate/pxor -// family: materialization-predicate -// target_ops: pto.pxor -// scenarios: predicate-transform -// NOTE: bulk-generated coverage skeleton. Parser/verifier/lowering failure is -// still a valid test conclusion in the current coverage-first phase. -// ----------------------------------------------------------------------------- -// ============================================================================= -// abs_kernel_2d: Element-wise absolute value on a 32x32 f32 tile -// ============================================================================= -// This kernel computes abs(input) for a 32x32 float32 matrix (1024 elements). -// -// Memory Layout: -// - Input: arg0 -> GM (Global Memory) -// - Output: arg1 -> GM (Global Memory) -// - UB (Unified Buffer) at offset 0: input tile (4096 bytes = 32*32*4) -// - UB at offset 4096: output tile (4096 bytes = 32*32*4) -// -// Pipeline: -// 1. DMA: GM -> UB (MTE2 pipe) - copy input tile to UB -// 2. Sync: wait for MTE2 -> V pipe handoff -// 3. Compute: vabs on 64-element vectors (V pipe) - 16 iterations for 1024 elements -// 4. Sync: wait for V -> MTE3 pipe handoff -// 5. 
DMA: UB -> GM (MTE3 pipe) - copy result tile back to GM -// ============================================================================= - -module attributes {pto.target_arch = "a5", pto.kernel_kind = #pto.kernel_kind} { - func.func @pxor_kernel_2d(%arg0: !pto.ptr) attributes {pto.kernel} { - %c0 = arith.constant 0 : index - %c0_i64 = arith.constant 0 : i64 - %c1_i64 = arith.constant 1 : i64 - %c32_i64 = arith.constant 32 : i64 - %c7 = arith.constant 7 : i32 - %c13 = arith.constant 13 : i32 - - %ub_out = pto.castptr %c0_i64 : i64 -> !pto.ptr - %gm_out = pto.castptr %arg0 : !pto.ptr -> !pto.ptr - - pto.vecscope { - %all = pto.pset_b32 "PAT_ALL" : !pto.mask - %lhs, %lhs_next = pto.plt_b32 %c13 : i32 -> !pto.mask, i32 - %rhs, %rhs_next = pto.plt_b32 %c7 : i32 -> !pto.mask, i32 - %out = pto.pxor %lhs, %rhs, %all : !pto.mask, !pto.mask, !pto.mask -> !pto.mask - pto.psts %out, %ub_out[%c0], "NORM" : !pto.mask, !pto.ptr, index - } - - pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.mte_ub_gm %ub_out, %gm_out, %c32_i64 - nburst(%c1_i64, %c32_i64, %c32_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64 - pto.barrier #pto.pipe - return - } -} diff --git a/test/vpto/cases/micro-op/materialization-predicate/pxor/launch.cpp b/test/vpto/cases/micro-op/materialization-predicate/pxor/launch.cpp deleted file mode 100644 index 55f3770dc..000000000 --- a/test/vpto/cases/micro-op/materialization-predicate/pxor/launch.cpp +++ /dev/null @@ -1,48 +0,0 @@ -// Copyright (c) 2026 Huawei Technologies Co., Ltd. -// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). -// Please refer to the License for details. You may not use this file except in compliance with the License. 
-// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -// See LICENSE in the root of the software repository for the full text of the License. - -// ----------------------------------------------------------------------------- -// case: micro-op/materialization-predicate/pxor -// family: materialization-predicate -// target_ops: pto.pxor -// scenarios: predicate-transform -// --------------------------------------------------------------------------- -// PTOAS compatibility layer -#ifndef __VEC_SCOPE__ -#define __VEC_SCOPE__ -#endif - -#if defined(__CCE_AICORE__) && defined(__NPU_ARCH__) && (__NPU_ARCH__ == 2201) -typedef struct { unsigned char v; } hifloat8_t; -typedef struct { unsigned char v; } float8_e4m3_t; -typedef struct { unsigned char v; } float8_e5m2_t; -typedef struct { unsigned char v; } float8_e8m0_t; -typedef struct { unsigned char v; } float4_e1m2x2_t; -typedef struct { unsigned char v; } float4_e2m1x2_t; -#endif -#include -#if defined(__CCE_AICORE__) && defined(PTOAS_ENABLE_CCE_PRINT) -#include -#endif -#if !defined(__CCE_AICORE__) && !defined(TMRGSORT_HPP) -struct MrgSortExecutedNumList { - uint16_t mrgSortList0; - uint16_t mrgSortList1; - uint16_t mrgSortList2; - uint16_t mrgSortList3; -}; -#endif -#ifndef __CPU_SIM -#include "acl/acl.h" -#endif - -extern "C" __global__ [aicore] void pxor_kernel_2d(__gm__ uint32_t *v1); - -void LaunchPxor(uint32_t *v1, void *stream) { - pxor_kernel_2d<<<1, nullptr, stream>>>((__gm__ uint32_t *)v1); -} diff --git a/test/vpto/cases/micro-op/materialization-predicate/pxor/main.cpp b/test/vpto/cases/micro-op/materialization-predicate/pxor/main.cpp deleted file mode 100644 index 6bf82fb80..000000000 --- a/test/vpto/cases/micro-op/materialization-predicate/pxor/main.cpp +++ /dev/null @@ -1,104 +0,0 @@ -// Copyright (c) 2026 Huawei Technologies Co., Ltd. 
-// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). -// Please refer to the License for details. You may not use this file except in compliance with the License. -// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -// See LICENSE in the root of the software repository for the full text of the License. - -// ----------------------------------------------------------------------------- -// case: micro-op/materialization-predicate/pxor -// family: materialization-predicate -// target_ops: pto.pxor -// scenarios: predicate-transform -/** -Copyright (c) 2025 Huawei Technologies Co., Ltd. -This program is free software, you can redistribute it and/or modify it under the terms and conditions of -CANN Open Software License Agreement Version 2.0 (the "License"). -Please refer to the License for details. You may not use this file except in compliance with the License. -THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -See LICENSE in the root of the software repository for the full text of the License. 
-*/ - -#include "test_common.h" -#include "acl/acl.h" -#include -#include -#include - -using namespace PtoTestCommon; - -#ifndef TMRGSORT_HPP -struct MrgSortExecutedNumList { - uint16_t mrgSortList0; - uint16_t mrgSortList1; - uint16_t mrgSortList2; - uint16_t mrgSortList3; -}; -#endif - -#define ACL_CHECK(expr) do { const aclError _ret = (expr); if (_ret != ACL_SUCCESS) { std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", #expr, (int)_ret, __FILE__, __LINE__); const char *_recent = aclGetRecentErrMsg(); if (_recent != nullptr && _recent[0] != '\0') { std::fprintf(stderr, "[ERROR] RecentErrMsg: %s\n", _recent); } rc = 1; goto cleanup; } } while (0) - -void LaunchPxor(uint32_t *v1, void *stream); - -int main() { - size_t elemCount_v1 = 32; - size_t fileSize_v1 = elemCount_v1 * sizeof(uint32_t); - uint32_t *v1Host = nullptr; - uint32_t *v1Device = nullptr; - - int rc = 0; - bool aclInited = false; - bool deviceSet = false; - int deviceId = 0; - aclrtStream stream = nullptr; - - ACL_CHECK(aclInit(nullptr)); - aclInited = true; - if (const char *envDevice = std::getenv("ACL_DEVICE_ID")) { - deviceId = std::atoi(envDevice); - } - ACL_CHECK(aclrtSetDevice(deviceId)); - deviceSet = true; - ACL_CHECK(aclrtCreateStream(&stream)); - - ACL_CHECK(aclrtMallocHost((void **)(&v1Host), fileSize_v1)); - ACL_CHECK(aclrtMalloc((void **)&v1Device, fileSize_v1, ACL_MEM_MALLOC_HUGE_FIRST)); - - ReadFile("./v1.bin", fileSize_v1, v1Host, fileSize_v1); - ACL_CHECK(aclrtMemcpy(v1Device, fileSize_v1, v1Host, fileSize_v1, ACL_MEMCPY_HOST_TO_DEVICE)); - LaunchPxor(v1Device, stream); - - ACL_CHECK(aclrtSynchronizeStream(stream)); - ACL_CHECK(aclrtMemcpy(v1Host, fileSize_v1, v1Device, fileSize_v1, ACL_MEMCPY_DEVICE_TO_HOST)); - WriteFile("./v1.bin", v1Host, fileSize_v1); - -cleanup: - aclrtFree(v1Device); - aclrtFreeHost(v1Host); - if (stream != nullptr) { - const aclError _ret = aclrtDestroyStream(stream); - if (_ret != ACL_SUCCESS) { - std::fprintf(stderr, "[ERROR] %s failed: %d 
(%s:%d)\n", - "aclrtDestroyStream(stream)", (int)_ret, __FILE__, __LINE__); - } - stream = nullptr; - } - if (deviceSet) { - const aclError _ret = aclrtResetDevice(deviceId); - if (_ret != ACL_SUCCESS) { - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", - "aclrtResetDevice(deviceId)", (int)_ret, __FILE__, __LINE__); - } - } - if (aclInited) { - const aclError _ret = aclFinalize(); - if (_ret != ACL_SUCCESS) { - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", - "aclFinalize()", (int)_ret, __FILE__, __LINE__); - } - } - - return rc; -} diff --git a/test/vpto/cases/micro-op/materialization-predicate/vdup-scalar-f16/compare.py b/test/vpto/cases/micro-op/materialization-predicate/vdup-scalar-f16/compare.py deleted file mode 100644 index e423b7707..000000000 --- a/test/vpto/cases/micro-op/materialization-predicate/vdup-scalar-f16/compare.py +++ /dev/null @@ -1,60 +0,0 @@ -#!/usr/bin/python3 -# Copyright (c) 2026 Huawei Technologies Co., Ltd. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. 
- -# coding=utf-8 - -import os -import sys -import numpy as np - - -def compare_bin(golden_path, output_path, dtype, eps): - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - dtype_np = np.dtype(dtype) - golden = np.fromfile(golden_path, dtype=dtype_np) - output = np.fromfile(output_path, dtype=dtype_np) - if golden.shape != output.shape: - print(f"[ERROR] Shape mismatch: {golden_path} {golden.shape} vs {output_path} {output.shape}") - return False - if not np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True): - if golden.size: - g = golden.astype(np.float64, copy=False) - o = output.astype(np.float64, copy=False) - abs_diff = np.abs(g - o) - idx = int(np.argmax(abs_diff)) - diff = float(abs_diff[idx]) - print( - f"[ERROR] Mismatch: {golden_path} vs {output_path}, max diff={diff} at idx={idx} " - f"(golden={g[idx]}, out={o[idx]}, dtype={dtype_np})" - ) - else: - print(f"[ERROR] Mismatch: {golden_path} vs {output_path}, empty buffers, dtype={dtype_np}") - return False - return True - - -def main(): - strict = os.getenv("COMPARE_STRICT", "1") != "0" - ok = compare_bin("golden_v1.bin", "v1.bin", np.float16, 0.001) - if not ok: - if strict: - print("[ERROR] compare failed") - sys.exit(2) - print("[WARN] compare failed (non-gating)") - return - print("[INFO] compare passed") - - -if __name__ == "__main__": - main() diff --git a/test/vpto/cases/micro-op/materialization-predicate/vdup-scalar-f16/golden.py b/test/vpto/cases/micro-op/materialization-predicate/vdup-scalar-f16/golden.py deleted file mode 100644 index 3f3ad08ba..000000000 --- a/test/vpto/cases/micro-op/materialization-predicate/vdup-scalar-f16/golden.py +++ /dev/null @@ -1,45 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) 2026 Huawei Technologies Co., Ltd. 
-# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. - -# coding=utf-8 - -import argparse -from pathlib import Path - -import numpy as np - - -ROWS = 32 -COLS = 32 -SEED = 19 -VALUE = np.float16(1.25) - - -def generate(output_dir: Path, seed: int) -> None: - del seed - v1 = np.zeros((ROWS, COLS), dtype=np.float16) - golden_v1 = np.full((ROWS, COLS), VALUE, dtype=np.float16) - - output_dir.mkdir(parents=True, exist_ok=True) - v1.reshape(-1).tofile(output_dir / "v1.bin") - golden_v1.reshape(-1).tofile(output_dir / "golden_v1.bin") - - -def main() -> None: - parser = argparse.ArgumentParser( - description="Generate numpy-based inputs/golden for VPTO micro-op vdup-scalar-f16 validation." 
- ) - parser.add_argument("--output-dir", type=Path, default=Path(".")) - parser.add_argument("--seed", type=int, default=SEED) - args = parser.parse_args() - generate(args.output_dir, args.seed) - - -if __name__ == "__main__": - main() diff --git a/test/vpto/cases/micro-op/materialization-predicate/vdup-scalar-f16/kernel.pto b/test/vpto/cases/micro-op/materialization-predicate/vdup-scalar-f16/kernel.pto deleted file mode 100644 index 4ca7c6cd6..000000000 --- a/test/vpto/cases/micro-op/materialization-predicate/vdup-scalar-f16/kernel.pto +++ /dev/null @@ -1,36 +0,0 @@ -// ----------------------------------------------------------------------------- -// case: micro-op/materialization-predicate/vdup-scalar-f16 -// family: materialization-predicate -// target_ops: pto.vdup -// scenarios: core-f16, scalar-broadcast -// ----------------------------------------------------------------------------- -module attributes {pto.target_arch = "a5", pto.kernel_kind = #pto.kernel_kind} { - func.func @vdup_scalar_f16_kernel_2d(%arg0: !pto.ptr) attributes {pto.kernel} { - %c0 = arith.constant 0 : index - %c128 = arith.constant 128 : index - %c1024 = arith.constant 1024 : index - %c0_i64 = arith.constant 0 : i64 - %c1_i64 = arith.constant 1 : i64 - %c32_i64 = arith.constant 32 : i64 - %c128_i64 = arith.constant 128 : i64 - %cst = arith.constant 1.250000e+00 : f16 - - %ub_out = pto.castptr %c0_i64 : i64 -> !pto.ptr - - pto.vecscope { - %active = pto.pset_b16 "PAT_ALL" : !pto.mask - scf.for %offset = %c0 to %c1024 step %c128 { - %vec = pto.vdup %cst, %active : f16, !pto.mask -> !pto.vreg<128xf16> - pto.vsts %vec, %ub_out[%offset], %active : !pto.vreg<128xf16>, !pto.ptr, !pto.mask - } - } - - pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.mte_ub_gm %ub_out, %arg0, %c128_i64 - nburst(%c32_i64, %c128_i64, %c128_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64 - pto.barrier #pto.pipe - return - } -} diff --git 
a/test/vpto/cases/micro-op/materialization-predicate/vdup-scalar-f16/launch.cpp b/test/vpto/cases/micro-op/materialization-predicate/vdup-scalar-f16/launch.cpp deleted file mode 100644 index 664c961dd..000000000 --- a/test/vpto/cases/micro-op/materialization-predicate/vdup-scalar-f16/launch.cpp +++ /dev/null @@ -1,43 +0,0 @@ -// Copyright (c) 2026 Huawei Technologies Co., Ltd. -// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). -// Please refer to the License for details. You may not use this file except in compliance with the License. -// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -// See LICENSE in the root of the software repository for the full text of the License. - -#ifndef __VEC_SCOPE__ -#define __VEC_SCOPE__ -#endif - -#if defined(__CCE_AICORE__) && defined(__NPU_ARCH__) && (__NPU_ARCH__ == 2201) -typedef struct { unsigned char v; } hifloat8_t; -typedef struct { unsigned char v; } float8_e4m3_t; -typedef struct { unsigned char v; } float8_e5m2_t; -typedef struct { unsigned char v; } float8_e8m0_t; -typedef struct { unsigned char v; } float4_e1m2x2_t; -typedef struct { unsigned char v; } float4_e2m1x2_t; -#endif -#include - -#if defined(__CCE_AICORE__) && defined(PTOAS_ENABLE_CCE_PRINT) -#include -#endif - -#if !defined(__CCE_AICORE__) && !defined(TMRGSORT_HPP) -struct MrgSortExecutedNumList { - uint16_t mrgSortList0; - uint16_t mrgSortList1; - uint16_t mrgSortList2; - uint16_t mrgSortList3; -}; -#endif -#ifndef __CPU_SIM -#include "acl/acl.h" -#endif - -extern "C" __global__ [aicore] void vdup_scalar_f16_kernel_2d(__gm__ half *v1); - -void LaunchVdup_scalar_f16_kernel_2d(aclFloat16 *v1, void *stream) { - vdup_scalar_f16_kernel_2d<<<1, nullptr, stream>>>((__gm__ half 
*)v1); -} diff --git a/test/vpto/cases/micro-op/materialization-predicate/vdup-scalar-f16/main.cpp b/test/vpto/cases/micro-op/materialization-predicate/vdup-scalar-f16/main.cpp deleted file mode 100644 index b8f469441..000000000 --- a/test/vpto/cases/micro-op/materialization-predicate/vdup-scalar-f16/main.cpp +++ /dev/null @@ -1,111 +0,0 @@ -// Copyright (c) 2026 Huawei Technologies Co., Ltd. -// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). -// Please refer to the License for details. You may not use this file except in compliance with the License. -// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -// See LICENSE in the root of the software repository for the full text of the License. - -/** -Copyright (c) 2025 Huawei Technologies Co., Ltd. -This program is free software, you can redistribute it and/or modify it under the terms and conditions of -CANN Open Software License Agreement Version 2.0 (the "License"). -Please refer to the License for details. You may not use this file except in compliance with the License. -THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -See LICENSE in the root of the software repository for the full text of the License. 
-*/ - -#include "test_common.h" -#include "acl/acl.h" -#include -#include -#include - -using namespace PtoTestCommon; - -#ifndef TMRGSORT_HPP -struct MrgSortExecutedNumList { - uint16_t mrgSortList0; - uint16_t mrgSortList1; - uint16_t mrgSortList2; - uint16_t mrgSortList3; -}; -#endif - -#define ACL_CHECK(expr) \ - do { \ - const aclError _ret = (expr); \ - if (_ret != ACL_SUCCESS) { \ - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", #expr, (int)_ret, __FILE__, __LINE__); \ - const char *_recent = aclGetRecentErrMsg(); \ - if (_recent != nullptr && _recent[0] != '\0') { \ - std::fprintf(stderr, "[ERROR] RecentErrMsg: %s\n", _recent); \ - } \ - rc = 1; \ - goto cleanup; \ - } \ - } while (0) - -void LaunchVdup_scalar_f16_kernel_2d(aclFloat16 *v1, void *stream); - -int main() { - size_t elemCount_v1 = 1024; - size_t fileSize_v1 = elemCount_v1 * sizeof(aclFloat16); - aclFloat16 *v1Host = nullptr; - aclFloat16 *v1Device = nullptr; - - int rc = 0; - bool aclInited = false; - bool deviceSet = false; - int deviceId = 0; - aclrtStream stream = nullptr; - - ACL_CHECK(aclInit(nullptr)); - aclInited = true; - if (const char *envDevice = std::getenv("ACL_DEVICE_ID")) { - deviceId = std::atoi(envDevice); - } - ACL_CHECK(aclrtSetDevice(deviceId)); - deviceSet = true; - ACL_CHECK(aclrtCreateStream(&stream)); - - ACL_CHECK(aclrtMallocHost((void **)(&v1Host), fileSize_v1)); - ACL_CHECK(aclrtMalloc((void **)&v1Device, fileSize_v1, ACL_MEM_MALLOC_HUGE_FIRST)); - - ReadFile("./v1.bin", fileSize_v1, v1Host, fileSize_v1); - ACL_CHECK(aclrtMemcpy(v1Device, fileSize_v1, v1Host, fileSize_v1, ACL_MEMCPY_HOST_TO_DEVICE)); - LaunchVdup_scalar_f16_kernel_2d(v1Device, stream); - - ACL_CHECK(aclrtSynchronizeStream(stream)); - ACL_CHECK(aclrtMemcpy(v1Host, fileSize_v1, v1Device, fileSize_v1, ACL_MEMCPY_DEVICE_TO_HOST)); - WriteFile("./v1.bin", v1Host, fileSize_v1); - -cleanup: - aclrtFree(v1Device); - aclrtFreeHost(v1Host); - if (stream != nullptr) { - const aclError _ret = 
aclrtDestroyStream(stream); - if (_ret != ACL_SUCCESS) { - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", - "aclrtDestroyStream(stream)", (int)_ret, __FILE__, __LINE__); - } - stream = nullptr; - } - if (deviceSet) { - const aclError _ret = aclrtResetDevice(deviceId); - if (_ret != ACL_SUCCESS) { - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", - "aclrtResetDevice(deviceId)", (int)_ret, __FILE__, __LINE__); - } - } - if (aclInited) { - const aclError _ret = aclFinalize(); - if (_ret != ACL_SUCCESS) { - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", - "aclFinalize()", (int)_ret, __FILE__, __LINE__); - } - } - - return rc; -} diff --git a/test/vpto/cases/micro-op/materialization-predicate/vdup-scalar-i8/compare.py b/test/vpto/cases/micro-op/materialization-predicate/vdup-scalar-i8/compare.py deleted file mode 100644 index bc7b42e1a..000000000 --- a/test/vpto/cases/micro-op/materialization-predicate/vdup-scalar-i8/compare.py +++ /dev/null @@ -1,54 +0,0 @@ -#!/usr/bin/python3 -# Copyright (c) 2026 Huawei Technologies Co., Ltd. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. 
- -# coding=utf-8 - -import os -import sys -import numpy as np - - -def compare_bin(golden_path, output_path, dtype): - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - dtype_np = np.dtype(dtype) - golden = np.fromfile(golden_path, dtype=dtype_np) - output = np.fromfile(output_path, dtype=dtype_np) - if golden.shape != output.shape: - print(f"[ERROR] Shape mismatch: {golden_path} {golden.shape} vs {output_path} {output.shape}") - return False - if not np.array_equal(golden, output): - diff = np.nonzero(golden != output)[0] - idx = int(diff[0]) if diff.size else 0 - print( - f"[ERROR] Mismatch: {golden_path} vs {output_path}, first diff at idx={idx} " - f"(golden={int(golden[idx])}, out={int(output[idx])}, dtype={dtype_np})" - ) - return False - return True - - -def main(): - strict = os.getenv("COMPARE_STRICT", "1") != "0" - ok = compare_bin("golden_v1.bin", "v1.bin", np.int8) - if not ok: - if strict: - print("[ERROR] compare failed") - sys.exit(2) - print("[WARN] compare failed (non-gating)") - return - print("[INFO] compare passed") - - -if __name__ == "__main__": - main() diff --git a/test/vpto/cases/micro-op/materialization-predicate/vdup-scalar-i8/golden.py b/test/vpto/cases/micro-op/materialization-predicate/vdup-scalar-i8/golden.py deleted file mode 100644 index 90e2a44bb..000000000 --- a/test/vpto/cases/micro-op/materialization-predicate/vdup-scalar-i8/golden.py +++ /dev/null @@ -1,45 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) 2026 Huawei Technologies Co., Ltd. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. 
-# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. - -# coding=utf-8 - -import argparse -from pathlib import Path - -import numpy as np - - -ROWS = 32 -COLS = 32 -SEED = 19 -VALUE = np.int8(-83) - - -def generate(output_dir: Path, seed: int) -> None: - del seed - v1 = np.zeros((ROWS, COLS), dtype=np.int8) - golden_v1 = np.full((ROWS, COLS), VALUE, dtype=np.int8) - - output_dir.mkdir(parents=True, exist_ok=True) - v1.reshape(-1).tofile(output_dir / "v1.bin") - golden_v1.reshape(-1).tofile(output_dir / "golden_v1.bin") - - -def main() -> None: - parser = argparse.ArgumentParser( - description="Generate numpy-based inputs/golden for VPTO micro-op vdup-scalar-i8 validation." - ) - parser.add_argument("--output-dir", type=Path, default=Path(".")) - parser.add_argument("--seed", type=int, default=SEED) - args = parser.parse_args() - generate(args.output_dir, args.seed) - - -if __name__ == "__main__": - main() diff --git a/test/vpto/cases/micro-op/materialization-predicate/vdup-scalar-i8/kernel.pto b/test/vpto/cases/micro-op/materialization-predicate/vdup-scalar-i8/kernel.pto deleted file mode 100644 index f6cc925e4..000000000 --- a/test/vpto/cases/micro-op/materialization-predicate/vdup-scalar-i8/kernel.pto +++ /dev/null @@ -1,36 +0,0 @@ -// ----------------------------------------------------------------------------- -// case: micro-op/materialization-predicate/vdup-scalar-i8 -// family: materialization-predicate -// target_ops: pto.vdup -// scenarios: core-i8, scalar-broadcast -// ----------------------------------------------------------------------------- -module attributes {pto.target_arch = "a5", pto.kernel_kind = #pto.kernel_kind} { - func.func @vdup_scalar_i8_kernel_2d(%arg0: !pto.ptr) attributes {pto.kernel} { - %c0 
= arith.constant 0 : index - %c128 = arith.constant 128 : index - %c1024 = arith.constant 1024 : index - %c0_i64 = arith.constant 0 : i64 - %c1_i64 = arith.constant 1 : i64 - %c32_i64 = arith.constant 32 : i64 - %c128_i64 = arith.constant 128 : i64 - %cst = arith.constant -83 : i8 - - %ub_out = pto.castptr %c0_i64 : i64 -> !pto.ptr - - pto.vecscope { - %active = pto.pset_b8 "PAT_ALL" : !pto.mask - scf.for %offset = %c0 to %c1024 step %c128 { - %vec = pto.vdup %cst, %active : i8, !pto.mask -> !pto.vreg<256xsi8> - pto.vsts %vec, %ub_out[%offset], %active : !pto.vreg<256xsi8>, !pto.ptr, !pto.mask - } - } - - pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.mte_ub_gm %ub_out, %arg0, %c128_i64 - nburst(%c32_i64, %c128_i64, %c128_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64 - pto.barrier #pto.pipe - return - } -} diff --git a/test/vpto/cases/micro-op/materialization-predicate/vdup-scalar-i8/launch.cpp b/test/vpto/cases/micro-op/materialization-predicate/vdup-scalar-i8/launch.cpp deleted file mode 100644 index 5054b1777..000000000 --- a/test/vpto/cases/micro-op/materialization-predicate/vdup-scalar-i8/launch.cpp +++ /dev/null @@ -1,43 +0,0 @@ -// Copyright (c) 2026 Huawei Technologies Co., Ltd. -// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). -// Please refer to the License for details. You may not use this file except in compliance with the License. -// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -// See LICENSE in the root of the software repository for the full text of the License. 
- -#ifndef __VEC_SCOPE__ -#define __VEC_SCOPE__ -#endif - -#if defined(__CCE_AICORE__) && defined(__NPU_ARCH__) && (__NPU_ARCH__ == 2201) -typedef struct { unsigned char v; } hifloat8_t; -typedef struct { unsigned char v; } float8_e4m3_t; -typedef struct { unsigned char v; } float8_e5m2_t; -typedef struct { unsigned char v; } float8_e8m0_t; -typedef struct { unsigned char v; } float4_e1m2x2_t; -typedef struct { unsigned char v; } float4_e2m1x2_t; -#endif -#include - -#if defined(__CCE_AICORE__) && defined(PTOAS_ENABLE_CCE_PRINT) -#include -#endif - -#if !defined(__CCE_AICORE__) && !defined(TMRGSORT_HPP) -struct MrgSortExecutedNumList { - uint16_t mrgSortList0; - uint16_t mrgSortList1; - uint16_t mrgSortList2; - uint16_t mrgSortList3; -}; -#endif -#ifndef __CPU_SIM -#include "acl/acl.h" -#endif - -extern "C" __global__ [aicore] void vdup_scalar_i8_kernel_2d(__gm__ int8_t *v1); - -void LaunchVdup_scalar_i8_kernel_2d(int8_t *v1, void *stream) { - vdup_scalar_i8_kernel_2d<<<1, nullptr, stream>>>((__gm__ int8_t *)v1); -} diff --git a/test/vpto/cases/micro-op/materialization-predicate/vdup-scalar-i8/main.cpp b/test/vpto/cases/micro-op/materialization-predicate/vdup-scalar-i8/main.cpp deleted file mode 100644 index fde52caa7..000000000 --- a/test/vpto/cases/micro-op/materialization-predicate/vdup-scalar-i8/main.cpp +++ /dev/null @@ -1,111 +0,0 @@ -// Copyright (c) 2026 Huawei Technologies Co., Ltd. -// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). -// Please refer to the License for details. You may not use this file except in compliance with the License. -// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. 
-// See LICENSE in the root of the software repository for the full text of the License. - -/** -Copyright (c) 2025 Huawei Technologies Co., Ltd. -This program is free software, you can redistribute it and/or modify it under the terms and conditions of -CANN Open Software License Agreement Version 2.0 (the "License"). -Please refer to the License for details. You may not use this file except in compliance with the License. -THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -See LICENSE in the root of the software repository for the full text of the License. -*/ - -#include "test_common.h" -#include "acl/acl.h" -#include -#include -#include - -using namespace PtoTestCommon; - -#ifndef TMRGSORT_HPP -struct MrgSortExecutedNumList { - uint16_t mrgSortList0; - uint16_t mrgSortList1; - uint16_t mrgSortList2; - uint16_t mrgSortList3; -}; -#endif - -#define ACL_CHECK(expr) \ - do { \ - const aclError _ret = (expr); \ - if (_ret != ACL_SUCCESS) { \ - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", #expr, (int)_ret, __FILE__, __LINE__); \ - const char *_recent = aclGetRecentErrMsg(); \ - if (_recent != nullptr && _recent[0] != '\0') { \ - std::fprintf(stderr, "[ERROR] RecentErrMsg: %s\n", _recent); \ - } \ - rc = 1; \ - goto cleanup; \ - } \ - } while (0) - -void LaunchVdup_scalar_i8_kernel_2d(int8_t *v1, void *stream); - -int main() { - size_t elemCount_v1 = 1024; - size_t fileSize_v1 = elemCount_v1 * sizeof(int8_t); - int8_t *v1Host = nullptr; - int8_t *v1Device = nullptr; - - int rc = 0; - bool aclInited = false; - bool deviceSet = false; - int deviceId = 0; - aclrtStream stream = nullptr; - - ACL_CHECK(aclInit(nullptr)); - aclInited = true; - if (const char *envDevice = std::getenv("ACL_DEVICE_ID")) { - deviceId = std::atoi(envDevice); - } - ACL_CHECK(aclrtSetDevice(deviceId)); - deviceSet = true; - 
ACL_CHECK(aclrtCreateStream(&stream)); - - ACL_CHECK(aclrtMallocHost((void **)(&v1Host), fileSize_v1)); - ACL_CHECK(aclrtMalloc((void **)&v1Device, fileSize_v1, ACL_MEM_MALLOC_HUGE_FIRST)); - - ReadFile("./v1.bin", fileSize_v1, v1Host, fileSize_v1); - ACL_CHECK(aclrtMemcpy(v1Device, fileSize_v1, v1Host, fileSize_v1, ACL_MEMCPY_HOST_TO_DEVICE)); - LaunchVdup_scalar_i8_kernel_2d(v1Device, stream); - - ACL_CHECK(aclrtSynchronizeStream(stream)); - ACL_CHECK(aclrtMemcpy(v1Host, fileSize_v1, v1Device, fileSize_v1, ACL_MEMCPY_DEVICE_TO_HOST)); - WriteFile("./v1.bin", v1Host, fileSize_v1); - -cleanup: - aclrtFree(v1Device); - aclrtFreeHost(v1Host); - if (stream != nullptr) { - const aclError _ret = aclrtDestroyStream(stream); - if (_ret != ACL_SUCCESS) { - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", - "aclrtDestroyStream(stream)", (int)_ret, __FILE__, __LINE__); - } - stream = nullptr; - } - if (deviceSet) { - const aclError _ret = aclrtResetDevice(deviceId); - if (_ret != ACL_SUCCESS) { - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", - "aclrtResetDevice(deviceId)", (int)_ret, __FILE__, __LINE__); - } - } - if (aclInited) { - const aclError _ret = aclFinalize(); - if (_ret != ACL_SUCCESS) { - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", - "aclFinalize()", (int)_ret, __FILE__, __LINE__); - } - } - - return rc; -} diff --git a/test/vpto/cases/micro-op/materialization-predicate/vdup-scalar-u8/compare.py b/test/vpto/cases/micro-op/materialization-predicate/vdup-scalar-u8/compare.py deleted file mode 100644 index 54831ce84..000000000 --- a/test/vpto/cases/micro-op/materialization-predicate/vdup-scalar-u8/compare.py +++ /dev/null @@ -1,54 +0,0 @@ -#!/usr/bin/python3 -# Copyright (c) 2026 Huawei Technologies Co., Ltd. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. 
You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. - -# coding=utf-8 - -import os -import sys -import numpy as np - - -def compare_bin(golden_path, output_path, dtype): - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - dtype_np = np.dtype(dtype) - golden = np.fromfile(golden_path, dtype=dtype_np) - output = np.fromfile(output_path, dtype=dtype_np) - if golden.shape != output.shape: - print(f"[ERROR] Shape mismatch: {golden_path} {golden.shape} vs {output_path} {output.shape}") - return False - if not np.array_equal(golden, output): - diff = np.nonzero(golden != output)[0] - idx = int(diff[0]) if diff.size else 0 - print( - f"[ERROR] Mismatch: {golden_path} vs {output_path}, first diff at idx={idx} " - f"(golden={int(golden[idx])}, out={int(output[idx])}, dtype={dtype_np})" - ) - return False - return True - - -def main(): - strict = os.getenv("COMPARE_STRICT", "1") != "0" - ok = compare_bin("golden_v1.bin", "v1.bin", np.uint8) - if not ok: - if strict: - print("[ERROR] compare failed") - sys.exit(2) - print("[WARN] compare failed (non-gating)") - return - print("[INFO] compare passed") - - -if __name__ == "__main__": - main() diff --git a/test/vpto/cases/micro-op/materialization-predicate/vdup-scalar-u8/golden.py b/test/vpto/cases/micro-op/materialization-predicate/vdup-scalar-u8/golden.py deleted file mode 100644 index 20eed0a35..000000000 --- a/test/vpto/cases/micro-op/materialization-predicate/vdup-scalar-u8/golden.py +++ /dev/null @@ -1,45 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) 2026 
Huawei Technologies Co., Ltd. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. - -# coding=utf-8 - -import argparse -from pathlib import Path - -import numpy as np - - -ROWS = 32 -COLS = 32 -SEED = 19 -VALUE = np.uint8(173) - - -def generate(output_dir: Path, seed: int) -> None: - del seed - v1 = np.zeros((ROWS, COLS), dtype=np.uint8) - golden_v1 = np.full((ROWS, COLS), VALUE, dtype=np.uint8) - - output_dir.mkdir(parents=True, exist_ok=True) - v1.reshape(-1).tofile(output_dir / "v1.bin") - golden_v1.reshape(-1).tofile(output_dir / "golden_v1.bin") - - -def main() -> None: - parser = argparse.ArgumentParser( - description="Generate numpy-based inputs/golden for VPTO micro-op vdup-scalar-u8 validation." 
- ) - parser.add_argument("--output-dir", type=Path, default=Path(".")) - parser.add_argument("--seed", type=int, default=SEED) - args = parser.parse_args() - generate(args.output_dir, args.seed) - - -if __name__ == "__main__": - main() diff --git a/test/vpto/cases/micro-op/materialization-predicate/vdup-scalar-u8/kernel.pto b/test/vpto/cases/micro-op/materialization-predicate/vdup-scalar-u8/kernel.pto deleted file mode 100644 index 14e74bb52..000000000 --- a/test/vpto/cases/micro-op/materialization-predicate/vdup-scalar-u8/kernel.pto +++ /dev/null @@ -1,36 +0,0 @@ -// ----------------------------------------------------------------------------- -// case: micro-op/materialization-predicate/vdup-scalar-u8 -// family: materialization-predicate -// target_ops: pto.vdup -// scenarios: core-u8, scalar-broadcast-signless -// ----------------------------------------------------------------------------- -module attributes {pto.target_arch = "a5", pto.kernel_kind = #pto.kernel_kind} { - func.func @vdup_scalar_u8_kernel_2d(%arg0: !pto.ptr) attributes {pto.kernel} { - %c0 = arith.constant 0 : index - %c128 = arith.constant 128 : index - %c1024 = arith.constant 1024 : index - %c0_i64 = arith.constant 0 : i64 - %c1_i64 = arith.constant 1 : i64 - %c32_i64 = arith.constant 32 : i64 - %c128_i64 = arith.constant 128 : i64 - %cst = arith.constant -83 : i8 - - %ub_out = pto.castptr %c0_i64 : i64 -> !pto.ptr - - pto.vecscope { - %active = pto.pset_b8 "PAT_ALL" : !pto.mask - scf.for %offset = %c0 to %c1024 step %c128 { - %vec = pto.vdup %cst, %active : i8, !pto.mask -> !pto.vreg<256xui8> - pto.vsts %vec, %ub_out[%offset], %active : !pto.vreg<256xui8>, !pto.ptr, !pto.mask - } - } - - pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.mte_ub_gm %ub_out, %arg0, %c128_i64 - nburst(%c32_i64, %c128_i64, %c128_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64 - pto.barrier #pto.pipe - return - } -} diff --git 
a/test/vpto/cases/micro-op/materialization-predicate/vdup-scalar-u8/launch.cpp b/test/vpto/cases/micro-op/materialization-predicate/vdup-scalar-u8/launch.cpp deleted file mode 100644 index d25c635c9..000000000 --- a/test/vpto/cases/micro-op/materialization-predicate/vdup-scalar-u8/launch.cpp +++ /dev/null @@ -1,43 +0,0 @@ -// Copyright (c) 2026 Huawei Technologies Co., Ltd. -// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). -// Please refer to the License for details. You may not use this file except in compliance with the License. -// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -// See LICENSE in the root of the software repository for the full text of the License. - -#ifndef __VEC_SCOPE__ -#define __VEC_SCOPE__ -#endif - -#if defined(__CCE_AICORE__) && defined(__NPU_ARCH__) && (__NPU_ARCH__ == 2201) -typedef struct { unsigned char v; } hifloat8_t; -typedef struct { unsigned char v; } float8_e4m3_t; -typedef struct { unsigned char v; } float8_e5m2_t; -typedef struct { unsigned char v; } float8_e8m0_t; -typedef struct { unsigned char v; } float4_e1m2x2_t; -typedef struct { unsigned char v; } float4_e2m1x2_t; -#endif -#include - -#if defined(__CCE_AICORE__) && defined(PTOAS_ENABLE_CCE_PRINT) -#include -#endif - -#if !defined(__CCE_AICORE__) && !defined(TMRGSORT_HPP) -struct MrgSortExecutedNumList { - uint16_t mrgSortList0; - uint16_t mrgSortList1; - uint16_t mrgSortList2; - uint16_t mrgSortList3; -}; -#endif -#ifndef __CPU_SIM -#include "acl/acl.h" -#endif - -extern "C" __global__ [aicore] void vdup_scalar_u8_kernel_2d(__gm__ uint8_t *v1); - -void LaunchVdup_scalar_u8_kernel_2d(uint8_t *v1, void *stream) { - vdup_scalar_u8_kernel_2d<<<1, nullptr, stream>>>((__gm__ uint8_t 
*)v1); -} diff --git a/test/vpto/cases/micro-op/materialization-predicate/vdup-scalar-u8/main.cpp b/test/vpto/cases/micro-op/materialization-predicate/vdup-scalar-u8/main.cpp deleted file mode 100644 index 32dfff59a..000000000 --- a/test/vpto/cases/micro-op/materialization-predicate/vdup-scalar-u8/main.cpp +++ /dev/null @@ -1,101 +0,0 @@ -// Copyright (c) 2026 Huawei Technologies Co., Ltd. -// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). -// Please refer to the License for details. You may not use this file except in compliance with the License. -// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -// See LICENSE in the root of the software repository for the full text of the License. - -#include "test_common.h" -#include "acl/acl.h" -#include -#include -#include - -using namespace PtoTestCommon; - -#ifndef TMRGSORT_HPP -struct MrgSortExecutedNumList { - uint16_t mrgSortList0; - uint16_t mrgSortList1; - uint16_t mrgSortList2; - uint16_t mrgSortList3; -}; -#endif - -#define ACL_CHECK(expr) \ - do { \ - const aclError _ret = (expr); \ - if (_ret != ACL_SUCCESS) { \ - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", #expr, (int)_ret, __FILE__, __LINE__); \ - const char *_recent = aclGetRecentErrMsg(); \ - if (_recent != nullptr && _recent[0] != '\0') { \ - std::fprintf(stderr, "[ERROR] RecentErrMsg: %s\n", _recent); \ - } \ - rc = 1; \ - goto cleanup; \ - } \ - } while (0) - -void LaunchVdup_scalar_u8_kernel_2d(uint8_t *v1, void *stream); - -int main() { - size_t elemCount_v1 = 1024; - size_t fileSize_v1 = elemCount_v1 * sizeof(uint8_t); - uint8_t *v1Host = nullptr; - uint8_t *v1Device = nullptr; - - int rc = 0; - bool aclInited = false; - bool deviceSet = false; - int 
deviceId = 0; - aclrtStream stream = nullptr; - - ACL_CHECK(aclInit(nullptr)); - aclInited = true; - if (const char *envDevice = std::getenv("ACL_DEVICE_ID")) { - deviceId = std::atoi(envDevice); - } - ACL_CHECK(aclrtSetDevice(deviceId)); - deviceSet = true; - ACL_CHECK(aclrtCreateStream(&stream)); - - ACL_CHECK(aclrtMallocHost((void **)(&v1Host), fileSize_v1)); - ACL_CHECK(aclrtMalloc((void **)&v1Device, fileSize_v1, ACL_MEM_MALLOC_HUGE_FIRST)); - - ReadFile("./v1.bin", fileSize_v1, v1Host, fileSize_v1); - ACL_CHECK(aclrtMemcpy(v1Device, fileSize_v1, v1Host, fileSize_v1, ACL_MEMCPY_HOST_TO_DEVICE)); - LaunchVdup_scalar_u8_kernel_2d(v1Device, stream); - - ACL_CHECK(aclrtSynchronizeStream(stream)); - ACL_CHECK(aclrtMemcpy(v1Host, fileSize_v1, v1Device, fileSize_v1, ACL_MEMCPY_DEVICE_TO_HOST)); - WriteFile("./v1.bin", v1Host, fileSize_v1); - -cleanup: - aclrtFree(v1Device); - aclrtFreeHost(v1Host); - if (stream != nullptr) { - const aclError _ret = aclrtDestroyStream(stream); - if (_ret != ACL_SUCCESS) { - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", - "aclrtDestroyStream(stream)", (int)_ret, __FILE__, __LINE__); - } - stream = nullptr; - } - if (deviceSet) { - const aclError _ret = aclrtResetDevice(deviceId); - if (_ret != ACL_SUCCESS) { - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", - "aclrtResetDevice(deviceId)", (int)_ret, __FILE__, __LINE__); - } - } - if (aclInited) { - const aclError _ret = aclFinalize(); - if (_ret != ACL_SUCCESS) { - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", - "aclFinalize()", (int)_ret, __FILE__, __LINE__); - } - } - - return rc; -} diff --git a/test/vpto/cases/micro-op/materialization-predicate/vdup-scalar/kernel.pto b/test/vpto/cases/micro-op/materialization-predicate/vdup-scalar/kernel.pto index de419713b..d9d0470ea 100644 --- a/test/vpto/cases/micro-op/materialization-predicate/vdup-scalar/kernel.pto +++ b/test/vpto/cases/micro-op/materialization-predicate/vdup-scalar/kernel.pto @@ -1,49 +1,124 @@ -// 
============================================================================= -// abs_kernel_2d: Element-wise absolute value on a 32x32 f32 tile -// ============================================================================= -// This kernel computes abs(input) for a 32x32 float32 matrix (1024 elements). -// -// Memory Layout: -// - Input: arg0 -> GM (Global Memory) -// - Output: arg1 -> GM (Global Memory) -// - UB (Unified Buffer) at offset 0: input tile (4096 bytes = 32*32*4) -// - UB at offset 4096: output tile (4096 bytes = 32*32*4) -// -// Pipeline: -// 1. DMA: GM -> UB (MTE2 pipe) - copy input tile to UB -// 2. Sync: wait for MTE2 -> V pipe handoff -// 3. Compute: vabs on 64-element vectors (V pipe) - 16 iterations for 1024 elements -// 4. Sync: wait for V -> MTE3 pipe handoff -// 5. DMA: UB -> GM (MTE3 pipe) - copy result tile back to GM -// ============================================================================= - module attributes {pto.target_arch = "a5", pto.kernel_kind = #pto.kernel_kind} { - func.func @vdup_scalar_kernel_2d(%arg0: !pto.ptr) attributes {pto.kernel} { - %c0 = arith.constant 0 : index - %c64 = arith.constant 64 : index - %c1024 = arith.constant 1024 : index - %c0_i64 = arith.constant 0 : i64 - %c1_i64 = arith.constant 1 : i64 - %c32_i64 = arith.constant 32 : i64 - %c128_i64 = arith.constant 128 : i64 - %cst = arith.constant -2.500000e+00 : f32 - - %ub_out = pto.castptr %c0_i64 : i64 -> !pto.ptr + func.func @vdup_scalar_deep_merged_kernel(%arg0: !pto.ptr, %arg1: !pto.ptr, %arg2: !pto.ptr, %arg3: !pto.ptr) attributes {pto.kernel} { + %__deep_merge_guard = arith.constant false + // inactive merged from vdup_scalar_f16_kernel_2d + scf.if %__deep_merge_guard { + + %c0_m0 = arith.constant 0 : index + %c128_m0 = arith.constant 128 : index + %c1024_m0 = arith.constant 1024 : index + %c0_i64_m0 = arith.constant 0 : i64 + %c1_i64_m0 = arith.constant 1 : i64 + %c32_i64_m0 = arith.constant 32 : i64 + %c128_i64_m0 = arith.constant 128 : i64 + 
%cst_m0 = arith.constant 1.250000e+00 : f16 + + %ub_out_m0 = pto.castptr %c0_i64_m0 : i64 -> !pto.ptr + + pto.vecscope { + %active_m0 = pto.pset_b16 "PAT_ALL" : !pto.mask + scf.for %offset_m0 = %c0_m0 to %c1024_m0 step %c128_m0 { + %vec_m0 = pto.vdup %cst_m0, %active_m0 : f16, !pto.mask -> !pto.vreg<128xf16> + pto.vsts %vec_m0, %ub_out_m0[%offset_m0], %active_m0 : !pto.vreg<128xf16>, !pto.ptr, !pto.mask + } + } + + pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] + pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] + pto.mte_ub_gm %ub_out_m0, %arg0, %c128_i64_m0 + nburst(%c32_i64_m0, %c128_i64_m0, %c128_i64_m0) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64 + pto.barrier #pto.pipe + + } + // inactive merged from vdup_scalar_i8_kernel_2d + scf.if %__deep_merge_guard { + + %c0_m1 = arith.constant 0 : index + %c128_m1 = arith.constant 128 : index + %c1024_m1 = arith.constant 1024 : index + %c0_i64_m1 = arith.constant 0 : i64 + %c1_i64_m1 = arith.constant 1 : i64 + %c32_i64_m1 = arith.constant 32 : i64 + %c128_i64_m1 = arith.constant 128 : i64 + %cst_m1 = arith.constant -83 : i8 + + %ub_out_m1 = pto.castptr %c0_i64_m1 : i64 -> !pto.ptr + + pto.vecscope { + %active_m1 = pto.pset_b8 "PAT_ALL" : !pto.mask + scf.for %offset_m1 = %c0_m1 to %c1024_m1 step %c128_m1 { + %vec_m1 = pto.vdup %cst_m1, %active_m1 : i8, !pto.mask -> !pto.vreg<256xsi8> + pto.vsts %vec_m1, %ub_out_m1[%offset_m1], %active_m1 : !pto.vreg<256xsi8>, !pto.ptr, !pto.mask + } + } + + pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] + pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] + pto.mte_ub_gm %ub_out_m1, %arg1, %c128_i64_m1 + nburst(%c32_i64_m1, %c128_i64_m1, %c128_i64_m1) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64 + pto.barrier #pto.pipe + + } + // inactive merged from vdup_scalar_u8_kernel_2d + scf.if %__deep_merge_guard { + + %c0_m2 = arith.constant 0 : index + %c128_m2 = arith.constant 128 : index + %c1024_m2 = arith.constant 1024 : index + %c0_i64_m2 = arith.constant 0 : i64 + %c1_i64_m2 = 
arith.constant 1 : i64 + %c32_i64_m2 = arith.constant 32 : i64 + %c128_i64_m2 = arith.constant 128 : i64 + %cst_m2 = arith.constant -83 : i8 + + %ub_out_m2 = pto.castptr %c0_i64_m2 : i64 -> !pto.ptr + + pto.vecscope { + %active_m2 = pto.pset_b8 "PAT_ALL" : !pto.mask + scf.for %offset_m2 = %c0_m2 to %c1024_m2 step %c128_m2 { + %vec_m2 = pto.vdup %cst_m2, %active_m2 : i8, !pto.mask -> !pto.vreg<256xui8> + pto.vsts %vec_m2, %ub_out_m2[%offset_m2], %active_m2 : !pto.vreg<256xui8>, !pto.ptr, !pto.mask + } + } + + pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] + pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] + pto.mte_ub_gm %ub_out_m2, %arg2, %c128_i64_m2 + nburst(%c32_i64_m2, %c128_i64_m2, %c128_i64_m2) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64 + pto.barrier #pto.pipe + + } + // active merged from vdup_scalar_kernel_2d + + %c0_m3 = arith.constant 0 : index + %c64_m3 = arith.constant 64 : index + %c1024_m3 = arith.constant 1024 : index + %c0_i64_m3 = arith.constant 0 : i64 + %c1_i64_m3 = arith.constant 1 : i64 + %c32_i64_m3 = arith.constant 32 : i64 + %c128_i64_m3 = arith.constant 128 : i64 + %cst_m3 = arith.constant -2.500000e+00 : f32 + + %ub_out_m3 = pto.castptr %c0_i64_m3 : i64 -> !pto.ptr pto.vecscope { - %active = pto.pset_b32 "PAT_ALL" : !pto.mask - scf.for %offset = %c0 to %c1024 step %c64 { - %vec = pto.vdup %cst, %active : f32, !pto.mask -> !pto.vreg<64xf32> - pto.vsts %vec, %ub_out[%offset], %active : !pto.vreg<64xf32>, !pto.ptr, !pto.mask + %active_m3 = pto.pset_b32 "PAT_ALL" : !pto.mask + scf.for %offset_m3 = %c0_m3 to %c1024_m3 step %c64_m3 { + %vec_m3 = pto.vdup %cst_m3, %active_m3 : f32, !pto.mask -> !pto.vreg<64xf32> + pto.vsts %vec_m3, %ub_out_m3[%offset_m3], %active_m3 : !pto.vreg<64xf32>, !pto.ptr, !pto.mask } } pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.mte_ub_gm %ub_out, %arg0, %c128_i64 - nburst(%c32_i64, %c128_i64, %c128_i64) + pto.mte_ub_gm %ub_out_m3, %arg3, %c128_i64_m3 + 
nburst(%c32_i64_m3, %c128_i64_m3, %c128_i64_m3) : !pto.ptr, !pto.ptr, i64, i64, i64, i64 pto.barrier #pto.pipe + return } } diff --git a/test/vpto/cases/micro-op/materialization-predicate/vdup-scalar/launch.cpp b/test/vpto/cases/micro-op/materialization-predicate/vdup-scalar/launch.cpp index 02754e6d2..7e196793a 100644 --- a/test/vpto/cases/micro-op/materialization-predicate/vdup-scalar/launch.cpp +++ b/test/vpto/cases/micro-op/materialization-predicate/vdup-scalar/launch.cpp @@ -5,22 +5,9 @@ // THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, // INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. // See LICENSE in the root of the software repository for the full text of the License. - -// --------------------------------------------------------------------------- -// PTOAS compatibility layer -// -// The upstream pto-isa headers reference some FP8/FP4 types and the -// __VEC_SCOPE__ marker that are not available on every AICore arch/toolchain -// combination (e.g. __NPU_ARCH__==2201). -// -// For our PTOAS-generated kernels we don't rely on these types today, but the -// headers still mention them in templates/static_asserts. Provide minimal -// fallbacks to keep compilation working on dav-c220. -// --------------------------------------------------------------------------- #ifndef __VEC_SCOPE__ #define __VEC_SCOPE__ #endif - #if defined(__CCE_AICORE__) && defined(__NPU_ARCH__) && (__NPU_ARCH__ == 2201) typedef struct { unsigned char v; } hifloat8_t; typedef struct { unsigned char v; } float8_e4m3_t; @@ -30,32 +17,26 @@ typedef struct { unsigned char v; } float4_e1m2x2_t; typedef struct { unsigned char v; } float4_e2m1x2_t; #endif #include - -// AICore printf support is gated behind `--cce-enable-print` on some -// toolchains. When enabled, include the CCE print header so `cce::printf` -// resolves in device compilation. 
#if defined(__CCE_AICORE__) && defined(PTOAS_ENABLE_CCE_PRINT) #include #endif - -// Some PTO-ISA types are only available in the __CCE_AICORE__ compilation -// path, but `bisheng -xcce` still performs a host-side parse pass. -// Provide minimal fallbacks only when the corresponding header wasn't -// pulled in by the selected arch implementation. #if !defined(__CCE_AICORE__) && !defined(TMRGSORT_HPP) -struct MrgSortExecutedNumList { - uint16_t mrgSortList0; - uint16_t mrgSortList1; - uint16_t mrgSortList2; - uint16_t mrgSortList3; -}; +struct MrgSortExecutedNumList { uint16_t mrgSortList0,mrgSortList1,mrgSortList2,mrgSortList3; }; #endif #ifndef __CPU_SIM #include "acl/acl.h" #endif -extern "C" __global__ [aicore] void vdup_scalar_kernel_2d(__gm__ float *v1); +extern "C" __global__ [aicore] void vdup_scalar_deep_merged_kernel( + __gm__ half * arg0, + __gm__ int8_t * arg1, + __gm__ uint8_t * arg2, + __gm__ float * arg3); -void LaunchVdup_scalar_kernel_2d(float *v1, void *stream) { - vdup_scalar_kernel_2d<<<1, nullptr, stream>>>((__gm__ float *)v1); +void LaunchVdupScalarDeepMerged(float * p0, void *stream) { + vdup_scalar_deep_merged_kernel<<<1, nullptr, stream>>>( + (__gm__ half *)p0, + (__gm__ int8_t *)p0, + (__gm__ uint8_t *)p0, + (__gm__ float *)p0); } diff --git a/test/vpto/cases/micro-op/materialization-predicate/vdup-scalar/main.cpp b/test/vpto/cases/micro-op/materialization-predicate/vdup-scalar/main.cpp index 6aff66657..1fe52dac3 100644 --- a/test/vpto/cases/micro-op/materialization-predicate/vdup-scalar/main.cpp +++ b/test/vpto/cases/micro-op/materialization-predicate/vdup-scalar/main.cpp @@ -47,8 +47,8 @@ struct MrgSortExecutedNumList { } \ } while (0) -void LaunchVdup_scalar_kernel_2d(float *v1, void *stream); +void LaunchVdupScalarDeepMerged(float * p0, void *stream); int main() { size_t elemCount_v1 = 1024; size_t fileSize_v1 = elemCount_v1 * sizeof(float); @@ -75,7 +75,7 @@ int main() { ReadFile("./v1.bin", fileSize_v1, v1Host, fileSize_v1); 
ACL_CHECK(aclrtMemcpy(v1Device, fileSize_v1, v1Host, fileSize_v1, ACL_MEMCPY_HOST_TO_DEVICE)); - LaunchVdup_scalar_kernel_2d(v1Device, stream); + LaunchVdupScalarDeepMerged(v1Device, stream); ACL_CHECK(aclrtSynchronizeStream(stream)); ACL_CHECK(aclrtMemcpy(v1Host, fileSize_v1, v1Device, fileSize_v1, ACL_MEMCPY_DEVICE_TO_HOST)); diff --git a/test/vpto/cases/micro-op/predicate-load-store/pldi-norm/kernel.pto b/test/vpto/cases/micro-op/predicate-load-store/pldi-norm/kernel.pto index 9b16dd30f..7d8262093 100644 --- a/test/vpto/cases/micro-op/predicate-load-store/pldi-norm/kernel.pto +++ b/test/vpto/cases/micro-op/predicate-load-store/pldi-norm/kernel.pto @@ -54,6 +54,59 @@ module attributes {pto.target_arch = "a5", pto.kernel_kind = #pto.kernel_kind, !pto.ptr, i64, i64, i64, i64 pto.barrier #pto.pipe + %__case_merge_guard = arith.constant false + + // inactive cross-case merge from micro-op/predicate-load-store/plds-norm + scf.if %__case_merge_guard { + + %c0_cmg9_1 = arith.constant 0 : index + %c1_cmg9_1 = arith.constant 1 : index + %c256_cmg9_1 = arith.constant 256 : index + %c32_cmg9_1 = arith.constant 32 : index + %c288_cmg9_1 = arith.constant 288 : index + %c0_i64_cmg9_1 = arith.constant 0 : i64 + %c1_i64_cmg9_1 = arith.constant 1 : i64 + %c4_i64_cmg9_1 = arith.constant 4 : i64 + %c256_i64_cmg9_1 = arith.constant 256 : i64 + %c256_i32_cmg9_1 = arith.constant 256 : i32 + %c256_loop_i32_cmg9_1 = arith.constant 256 : i32 + %c8192_i64_cmg9_1 = arith.constant 8192 : i64 + %c10240_i64_cmg9_1 = arith.constant 10240 : i64 + %false_cmg9_1 = arith.constant false + + %ub_in_cmg9_1 = pto.castptr %c8192_i64_cmg9_1 : i64 -> !pto.ptr + %ub_out_cmg9_1 = pto.castptr %c10240_i64_cmg9_1 : i64 -> !pto.ptr + pto.mte_gm_ub %arg0, %ub_in_cmg9_1, %c0_i64_cmg9_1, %c256_i64_cmg9_1 + nburst(%c4_i64_cmg9_1, %c256_i64_cmg9_1, %c256_i64_cmg9_1) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 + + pto.set_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] + pto.wait_flag["PIPE_MTE2", "PIPE_V", 
"EVENT_ID0"] + + pto.vecscope { + %__cmg9_1:1 = scf.for %offset_cmg9_1 = %c0_cmg9_1 to %c256_cmg9_1 step %c256_cmg9_1 iter_args(%remaining_cmg9_1 = %c256_loop_i32_cmg9_1) -> (i32) { + %byte_offset_cmg9_1 = arith.addi %offset_cmg9_1, %c0_cmg9_1 : index + %loaded_cmg9_1 = pto.plds %ub_in_cmg9_1[%byte_offset_cmg9_1], "NORM" : !pto.ptr, index -> !pto.mask + %full_mask_cmg9_1, %next_remaining_cmg9_1 = pto.plt_b8 %remaining_cmg9_1 : i32 -> !pto.mask, i32 + %ones_offset_cmg9_1 = arith.addi %offset_cmg9_1, %c32_cmg9_1 : index + %zeros_offset_cmg9_1 = arith.addi %offset_cmg9_1, %c288_cmg9_1 : index + %ones_cmg9_1 = pto.vlds %ub_in_cmg9_1[%ones_offset_cmg9_1] : !pto.ptr -> !pto.vreg<256xui8> + %zeros_cmg9_1 = pto.vlds %ub_in_cmg9_1[%zeros_offset_cmg9_1] : !pto.ptr -> !pto.vreg<256xui8> + %out_cmg9_1 = pto.vsel %ones_cmg9_1, %zeros_cmg9_1, %loaded_cmg9_1 : !pto.vreg<256xui8>, !pto.vreg<256xui8>, !pto.mask -> !pto.vreg<256xui8> + pto.vsts %out_cmg9_1, %ub_out_cmg9_1[%offset_cmg9_1], %full_mask_cmg9_1 : !pto.vreg<256xui8>, !pto.ptr, !pto.mask + scf.yield %next_remaining_cmg9_1 : i32 + } + } + + pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] + pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] + pto.set_loop1_stride_ubtoout %c256_i64_cmg9_1, %c256_i64_cmg9_1 : i64, i64 + pto.set_loop2_stride_ubtoout %c256_i64_cmg9_1, %c256_i64_cmg9_1 : i64, i64 + pto.mte_ub_gm %ub_out_cmg9_1, %arg1, %c256_i64_cmg9_1 + nburst(%c4_i64_cmg9_1, %c256_i64_cmg9_1, %c256_i64_cmg9_1) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64 + pto.barrier #pto.pipe + } return } } diff --git a/test/vpto/cases/micro-op/predicate-load-store/plds-norm/compare.py b/test/vpto/cases/micro-op/predicate-load-store/plds-norm/compare.py deleted file mode 100644 index bd3820b2e..000000000 --- a/test/vpto/cases/micro-op/predicate-load-store/plds-norm/compare.py +++ /dev/null @@ -1,38 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) 2026 Huawei Technologies Co., Ltd. 
-# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. - -# case: micro-op/predicate-load-store/plds-norm -# family: predicate-load-store -# target_ops: pto.plds -# scenarios: packed-load, dynamic-offset, representative-logical-elements - -import numpy as np - - -def main() -> None: - golden = np.fromfile("golden_v2.bin", dtype=np.uint8) - output = np.fromfile("v2.bin", dtype=np.uint8) - if golden.size < 256 or output.size < 256: - print( - f"[ERROR] Packed buffer too small: golden={golden.size} out={output.size}" - ) - raise SystemExit(2) - if not np.array_equal(golden[:256], output[:256]): - diff = np.nonzero(golden[:256] != output[:256])[0] - idx = int(diff[0]) if diff.size else 0 - print( - f"[ERROR] Mismatch (plds NORM -> vsel): idx={idx} " - f"golden={int(golden[idx])} out={int(output[idx])}" - ) - raise SystemExit(2) - print("[INFO] compare passed") - - -if __name__ == "__main__": - main() diff --git a/test/vpto/cases/micro-op/predicate-load-store/plds-norm/golden.py b/test/vpto/cases/micro-op/predicate-load-store/plds-norm/golden.py deleted file mode 100644 index e6cf2fb1b..000000000 --- a/test/vpto/cases/micro-op/predicate-load-store/plds-norm/golden.py +++ /dev/null @@ -1,74 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) 2026 Huawei Technologies Co., Ltd. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). 
-# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. - -# case: micro-op/predicate-load-store/plds-norm -# family: predicate-load-store -# target_ops: pto.plds -# scenarios: packed-load, dynamic-offset, representative-logical-elements - -import argparse -from pathlib import Path - -import numpy as np - - -SEED = 19 -ACTIVE_BITS = 145 -OUTPUT_BYTES = 1024 -VECTOR_BYTES = 256 -PACKED_BYTES = 32 - - -def prefix_bits(active_bits: int) -> np.ndarray: - bits = np.zeros((256,), dtype=np.uint8) - bits[:active_bits] = 1 - return bits - - -def make_input_buffer(bits: np.ndarray) -> np.ndarray: - packed = np.packbits(bits.astype(np.uint8, copy=False), bitorder="little") - ones = np.ones((VECTOR_BYTES,), dtype=np.uint8) - zeros = np.zeros((VECTOR_BYTES,), dtype=np.uint8) - out = np.zeros((OUTPUT_BYTES,), dtype=np.uint8) - out[:PACKED_BYTES] = packed[:PACKED_BYTES] - out[PACKED_BYTES : PACKED_BYTES + VECTOR_BYTES] = ones - out[PACKED_BYTES + VECTOR_BYTES : PACKED_BYTES + 2 * VECTOR_BYTES] = zeros - return out - - -def expected_selected_bytes(bits: np.ndarray) -> np.ndarray: - out = np.zeros((OUTPUT_BYTES,), dtype=np.uint8) - out[:VECTOR_BYTES] = bits.astype(np.uint8, copy=False) - return out - - -def generate(output_dir: Path, seed: int) -> None: - del seed - bits = prefix_bits(ACTIVE_BITS) - input_buffer = make_input_buffer(bits) - golden = expected_selected_bytes(bits) - - output_dir.mkdir(parents=True, exist_ok=True) - input_buffer.tofile(output_dir / "v1.bin") - np.zeros((OUTPUT_BYTES,), dtype=np.uint8).tofile(output_dir / "v2.bin") - golden.tofile(output_dir / "golden_v2.bin") - - -def main() -> None: - parser = 
argparse.ArgumentParser( - description="Generate raw packed predicate input/golden for VPTO micro-op plds-norm validation." - ) - parser.add_argument("--output-dir", type=Path, default=Path(".")) - parser.add_argument("--seed", type=int, default=SEED) - args = parser.parse_args() - generate(args.output_dir, args.seed) - - -if __name__ == "__main__": - main() diff --git a/test/vpto/cases/micro-op/predicate-load-store/plds-norm/kernel.pto b/test/vpto/cases/micro-op/predicate-load-store/plds-norm/kernel.pto deleted file mode 100644 index d64654b22..000000000 --- a/test/vpto/cases/micro-op/predicate-load-store/plds-norm/kernel.pto +++ /dev/null @@ -1,58 +0,0 @@ -// ----------------------------------------------------------------------------- -// case: micro-op/predicate-load-store/plds-norm -// family: predicate-load-store -// target_ops: pto.plds -// scenarios: packed-load, dynamic-offset, representative-logical-elements -// ----------------------------------------------------------------------------- -module attributes {pto.target_arch = "a5", pto.kernel_kind = #pto.kernel_kind} { - func.func @plds_norm_kernel_2d(%arg0: !pto.ptr, %arg1: !pto.ptr) attributes {pto.kernel} { - %c0 = arith.constant 0 : index - %c1 = arith.constant 1 : index - %c256 = arith.constant 256 : index - %c32 = arith.constant 32 : index - %c288 = arith.constant 288 : index - %c0_i64 = arith.constant 0 : i64 - %c1_i64 = arith.constant 1 : i64 - %c4_i64 = arith.constant 4 : i64 - %c256_i64 = arith.constant 256 : i64 - %c256_i32 = arith.constant 256 : i32 - %c256_loop_i32 = arith.constant 256 : i32 - %c8192_i64 = arith.constant 8192 : i64 - %c10240_i64 = arith.constant 10240 : i64 - %false = arith.constant false - - %ub_in = pto.castptr %c8192_i64 : i64 -> !pto.ptr - %ub_out = pto.castptr %c10240_i64 : i64 -> !pto.ptr - pto.mte_gm_ub %arg0, %ub_in, %c0_i64, %c256_i64 - nburst(%c4_i64, %c256_i64, %c256_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 - - pto.set_flag["PIPE_MTE2", "PIPE_V", 
"EVENT_ID0"] - pto.wait_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] - - pto.vecscope { - %_:1 = scf.for %offset = %c0 to %c256 step %c256 iter_args(%remaining = %c256_loop_i32) -> (i32) { - %byte_offset = arith.addi %offset, %c0 : index - %loaded = pto.plds %ub_in[%byte_offset], "NORM" : !pto.ptr, index -> !pto.mask - %full_mask, %next_remaining = pto.plt_b8 %remaining : i32 -> !pto.mask, i32 - %ones_offset = arith.addi %offset, %c32 : index - %zeros_offset = arith.addi %offset, %c288 : index - %ones = pto.vlds %ub_in[%ones_offset] : !pto.ptr -> !pto.vreg<256xui8> - %zeros = pto.vlds %ub_in[%zeros_offset] : !pto.ptr -> !pto.vreg<256xui8> - %out = pto.vsel %ones, %zeros, %loaded : !pto.vreg<256xui8>, !pto.vreg<256xui8>, !pto.mask -> !pto.vreg<256xui8> - pto.vsts %out, %ub_out[%offset], %full_mask : !pto.vreg<256xui8>, !pto.ptr, !pto.mask - scf.yield %next_remaining : i32 - } - } - - pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.set_loop1_stride_ubtoout %c256_i64, %c256_i64 : i64, i64 - pto.set_loop2_stride_ubtoout %c256_i64, %c256_i64 : i64, i64 - pto.mte_ub_gm %ub_out, %arg1, %c256_i64 - nburst(%c4_i64, %c256_i64, %c256_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64 - pto.barrier #pto.pipe - return - } -} diff --git a/test/vpto/cases/micro-op/predicate-load-store/plds-norm/launch.cpp b/test/vpto/cases/micro-op/predicate-load-store/plds-norm/launch.cpp deleted file mode 100644 index 9a1e9de07..000000000 --- a/test/vpto/cases/micro-op/predicate-load-store/plds-norm/launch.cpp +++ /dev/null @@ -1,45 +0,0 @@ -// Copyright (c) 2026 Huawei Technologies Co., Ltd. -// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). -// Please refer to the License for details. You may not use this file except in compliance with the License. 
-// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -// See LICENSE in the root of the software repository for the full text of the License. - -#ifndef __VEC_SCOPE__ -#define __VEC_SCOPE__ -#endif - -#if defined(__CCE_AICORE__) && defined(__NPU_ARCH__) && (__NPU_ARCH__ == 2201) -typedef struct { unsigned char v; } hifloat8_t; -typedef struct { unsigned char v; } float8_e4m3_t; -typedef struct { unsigned char v; } float8_e5m2_t; -typedef struct { unsigned char v; } float8_e8m0_t; -typedef struct { unsigned char v; } float4_e1m2x2_t; -typedef struct { unsigned char v; } float4_e2m1x2_t; -#endif -#include - -#if defined(__CCE_AICORE__) && defined(PTOAS_ENABLE_CCE_PRINT) -#include -#endif - -#if !defined(__CCE_AICORE__) && !defined(TMRGSORT_HPP) -struct MrgSortExecutedNumList { - uint16_t mrgSortList0; - uint16_t mrgSortList1; - uint16_t mrgSortList2; - uint16_t mrgSortList3; -}; -#endif -#ifndef __CPU_SIM -#include "acl/acl.h" -#endif - -extern "C" __global__ [aicore] void plds_norm_kernel_2d(__gm__ unsigned char *v1, - __gm__ unsigned char *v2); - -void LaunchPlds_norm_kernel_2d(unsigned char *v1, unsigned char *v2, void *stream) { - plds_norm_kernel_2d<<<1, nullptr, stream>>>((__gm__ unsigned char *)v1, - (__gm__ unsigned char *)v2); -} diff --git a/test/vpto/cases/micro-op/predicate-load-store/plds-norm/main.cpp b/test/vpto/cases/micro-op/predicate-load-store/plds-norm/main.cpp deleted file mode 100644 index 30f136c67..000000000 --- a/test/vpto/cases/micro-op/predicate-load-store/plds-norm/main.cpp +++ /dev/null @@ -1,85 +0,0 @@ -// Copyright (c) 2026 Huawei Technologies Co., Ltd. -// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). -// Please refer to the License for details. 
You may not use this file except in compliance with the License. -// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -// See LICENSE in the root of the software repository for the full text of the License. - -// ----------------------------------------------------------------------------- -// case: micro-op/predicate-load-store/plds-norm -// ----------------------------------------------------------------------------- -#include "test_common.h" -#include "acl/acl.h" -#include -#include - -using namespace PtoTestCommon; - -#define ACL_CHECK(expr) \ - do { \ - const aclError _ret = (expr); \ - if (_ret != ACL_SUCCESS) { \ - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", #expr, \ - (int)_ret, __FILE__, __LINE__); \ - const char *_recent = aclGetRecentErrMsg(); \ - if (_recent != nullptr && _recent[0] != '\0') \ - std::fprintf(stderr, "[ERROR] RecentErrMsg: %s\n", _recent); \ - rc = 1; \ - goto cleanup; \ - } \ - } while (0) - -void LaunchPlds_norm_kernel_2d(unsigned char *v1, unsigned char *v2, void *stream); - -int main() { - size_t fileSize_v1 = 1024 * sizeof(unsigned char); - size_t fileSize_v2 = 1024 * sizeof(unsigned char); - unsigned char *v1Host = nullptr; - unsigned char *v1Device = nullptr; - unsigned char *v2Host = nullptr; - unsigned char *v2Device = nullptr; - - int rc = 0; - bool aclInited = false; - bool deviceSet = false; - int deviceId = 0; - aclrtStream stream = nullptr; - - ACL_CHECK(aclInit(nullptr)); - aclInited = true; - if (const char *envDevice = std::getenv("ACL_DEVICE_ID")) - deviceId = std::atoi(envDevice); - ACL_CHECK(aclrtSetDevice(deviceId)); - deviceSet = true; - ACL_CHECK(aclrtCreateStream(&stream)); - - ACL_CHECK(aclrtMallocHost((void **)(&v1Host), fileSize_v1)); - ACL_CHECK(aclrtMallocHost((void **)(&v2Host), fileSize_v2)); - ACL_CHECK(aclrtMalloc((void **)&v1Device, 
fileSize_v1, ACL_MEM_MALLOC_HUGE_FIRST)); - ACL_CHECK(aclrtMalloc((void **)&v2Device, fileSize_v2, ACL_MEM_MALLOC_HUGE_FIRST)); - - ReadFile("./v1.bin", fileSize_v1, v1Host, fileSize_v1); - ReadFile("./v2.bin", fileSize_v2, v2Host, fileSize_v2); - ACL_CHECK(aclrtMemcpy(v1Device, fileSize_v1, v1Host, fileSize_v1, ACL_MEMCPY_HOST_TO_DEVICE)); - ACL_CHECK(aclrtMemcpy(v2Device, fileSize_v2, v2Host, fileSize_v2, ACL_MEMCPY_HOST_TO_DEVICE)); - - LaunchPlds_norm_kernel_2d(v1Device, v2Device, stream); - - ACL_CHECK(aclrtSynchronizeStream(stream)); - ACL_CHECK(aclrtMemcpy(v2Host, fileSize_v2, v2Device, fileSize_v2, ACL_MEMCPY_DEVICE_TO_HOST)); - WriteFile("./v2.bin", v2Host, fileSize_v2); - -cleanup: - aclrtFree(v1Device); - aclrtFree(v2Device); - aclrtFreeHost(v1Host); - aclrtFreeHost(v2Host); - if (stream != nullptr) - aclrtDestroyStream(stream); - if (deviceSet) - aclrtResetDevice(deviceId); - if (aclInited) - aclFinalize(); - return rc; -} diff --git a/test/vpto/cases/micro-op/predicate-load-store/psti-norm-pldi-ds/kernel.pto b/test/vpto/cases/micro-op/predicate-load-store/psti-norm-pldi-ds/kernel.pto index e09d6999b..0d6f010b1 100644 --- a/test/vpto/cases/micro-op/predicate-load-store/psti-norm-pldi-ds/kernel.pto +++ b/test/vpto/cases/micro-op/predicate-load-store/psti-norm-pldi-ds/kernel.pto @@ -46,6 +46,56 @@ module attributes {pto.target_arch = "a5", pto.kernel_kind = #pto.kernel_kind, !pto.ptr, i64, i64, i64, i64 pto.barrier #pto.pipe + %__case_merge_guard = arith.constant false + + // inactive cross-case merge from micro-op/predicate-load-store/psts-norm-plds-ds + scf.if %__case_merge_guard { + + %c0_cmg10_1 = arith.constant 0 : index + %c1_cmg10_1 = arith.constant 1 : index + %c32_cmg10_1 = arith.constant 32 : index + %c64_cmg10_1 = arith.constant 64 : index + %c175_cmg10_1 = arith.constant 175 : i32 + %c0_i32_cmg10_1 = arith.constant 0 : i32 + %c0_i64_cmg10_1 = arith.constant 0 : i64 + %c1_i64_cmg10_1 = arith.constant 1 : i64 + %c32_i64_cmg10_1 = arith.constant 
32 : i64 + %c8192_i64_cmg10_1 = arith.constant 8192 : i64 + %c10240_i64_cmg10_1 = arith.constant 10240 : i64 + %false_cmg10_1 = arith.constant false + + %ub_mid_cmg10_1 = pto.castptr %c8192_i64_cmg10_1 : i64 -> !pto.ptr + %ub_out_cmg10_1 = pto.castptr %c10240_i64_cmg10_1 : i64 -> !pto.ptr + pto.mte_gm_ub %arg1, %ub_mid_cmg10_1, %c0_i64_cmg10_1, %c32_i64_cmg10_1 + nburst(%c32_i64_cmg10_1, %c32_i64_cmg10_1, %c32_i64_cmg10_1) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 + + pto.set_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] + pto.wait_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] + + pto.vecscope { + %__cmg10_1:1 = scf.for %iv_cmg10_1 = %c0_cmg10_1 to %c1_cmg10_1 step %c1_cmg10_1 iter_args(%remaining_cmg10_1 = %c175_cmg10_1) -> (i32) { + %byte_offset_cmg10_1 = arith.addi %iv_cmg10_1, %c32_cmg10_1 : index + %src_cmg10_1, %next_cmg10_1 = pto.plt_b8 %remaining_cmg10_1 : i32 -> !pto.mask, i32 + %zero_cmg10_1, %_unused_cmg10_1 = pto.plt_b8 %c0_i32_cmg10_1 : i32 -> !pto.mask, i32 + pto.psts %src_cmg10_1, %ub_mid_cmg10_1[%byte_offset_cmg10_1], "NORM" : !pto.mask, !pto.ptr, index + pto.psts %zero_cmg10_1, %ub_mid_cmg10_1[%c64_cmg10_1], "NORM" : !pto.mask, !pto.ptr, index + pto.mem_bar "VST_VLD" + %loaded_cmg10_1 = pto.plds %ub_mid_cmg10_1[%byte_offset_cmg10_1], "DS" : !pto.ptr, index -> !pto.mask + pto.psts %loaded_cmg10_1, %ub_out_cmg10_1[%c0_cmg10_1], "NORM" : !pto.mask, !pto.ptr, index + scf.yield %next_cmg10_1 : i32 + } + } + + pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] + pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] + pto.set_loop1_stride_ubtoout %c32_i64_cmg10_1, %c32_i64_cmg10_1 : i64, i64 + pto.set_loop2_stride_ubtoout %c32_i64_cmg10_1, %c32_i64_cmg10_1 : i64, i64 + pto.mte_ub_gm %ub_out_cmg10_1, %arg2, %c32_i64_cmg10_1 + nburst(%c32_i64_cmg10_1, %c32_i64_cmg10_1, %c32_i64_cmg10_1) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64 + pto.barrier #pto.pipe + } return } } diff --git a/test/vpto/cases/micro-op/predicate-load-store/psti-pk-pldi-us/compare.py 
b/test/vpto/cases/micro-op/predicate-load-store/psti-pk-pldi-us/compare.py deleted file mode 100644 index 5adbcb96c..000000000 --- a/test/vpto/cases/micro-op/predicate-load-store/psti-pk-pldi-us/compare.py +++ /dev/null @@ -1,37 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) 2026 Huawei Technologies Co., Ltd. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. - -# case: micro-op/predicate-load-store/psti-pk-pldi-us -# family: predicate-load-store -# target_ops: pto.pldi, pto.psti -# scenarios: predicate-load-store-composition, immediate-offset, load-store-pair-preservation, representative-logical-elements - -import os -import sys -from pathlib import Path - -sys.path.append(str(Path(__file__).resolve().parent.parent)) - -from _predicate_load_store_case import compare_norm_store - - -def main(): - strict = os.getenv("COMPARE_STRICT", "1") != "0" - ok = compare_norm_store("golden_v3.bin", "v3.bin") - if not ok: - if strict: - print("[ERROR] compare failed") - sys.exit(2) - print("[WARN] compare failed (non-gating)") - return - print("[INFO] compare passed") - - -if __name__ == "__main__": - main() diff --git a/test/vpto/cases/micro-op/predicate-load-store/psti-pk-pldi-us/golden.py b/test/vpto/cases/micro-op/predicate-load-store/psti-pk-pldi-us/golden.py deleted file mode 100644 index eb6a105ed..000000000 --- a/test/vpto/cases/micro-op/predicate-load-store/psti-pk-pldi-us/golden.py +++ /dev/null @@ -1,54 +0,0 @@ -#!/usr/bin/env python3 -# 
Copyright (c) 2026 Huawei Technologies Co., Ltd. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. - -# case: micro-op/predicate-load-store/psti-pk-pldi-us -# family: predicate-load-store -# target_ops: pto.pldi, pto.psti -# scenarios: predicate-load-store-composition, immediate-offset, load-store-pair-preservation, representative-logical-elements - -import argparse -from pathlib import Path -import sys - -sys.path.append(str(Path(__file__).resolve().parent.parent)) - -from _predicate_load_store_case import pk_us_compose, prefix_bits, write_case - - -SEED = 19 -ACTIVE_BITS = 145 - - -def generate(output_dir: Path, seed: int, src_elem_bytes: int) -> None: - del seed - del src_elem_bytes - write_case(output_dir, pk_us_compose(prefix_bits(ACTIVE_BITS))) - - -def main() -> None: - parser = argparse.ArgumentParser(description="Generate inputs/golden for psti-pk-pldi-us.") - parser.add_argument( - "--output-dir", - type=Path, - default=Path("."), - help="Directory where v1.bin/v2.bin/v3.bin/golden_v3.bin are written.", - ) - parser.add_argument("--seed", type=int, default=SEED, help="Numpy random seed.") - parser.add_argument( - "--src-elem-bytes", - type=int, - default=4, - help="Unused compatibility option kept for the shared runner surface.", - ) - args = parser.parse_args() - generate(args.output_dir, args.seed, args.src_elem_bytes) - - -if __name__ == "__main__": - main() diff --git 
a/test/vpto/cases/micro-op/predicate-load-store/psti-pk-pldi-us/kernel.pto b/test/vpto/cases/micro-op/predicate-load-store/psti-pk-pldi-us/kernel.pto deleted file mode 100644 index db2f693f0..000000000 --- a/test/vpto/cases/micro-op/predicate-load-store/psti-pk-pldi-us/kernel.pto +++ /dev/null @@ -1,51 +0,0 @@ -// ----------------------------------------------------------------------------- -// case: micro-op/predicate-load-store/psti-pk-pldi-us -// family: predicate-load-store -// target_ops: pto.pldi, pto.psti -// scenarios: predicate-load-store-composition, immediate-offset, load-store-pair-preservation, representative-logical-elements -// ----------------------------------------------------------------------------- -module attributes {pto.target_arch = "a5", pto.kernel_kind = #pto.kernel_kind} { - func.func @psti_pk_pldi_us_kernel_2d(%arg0: !pto.ptr, %arg1: !pto.ptr, - %arg2: !pto.ptr) attributes {pto.kernel} { - %c0 = arith.constant 0 : index - %c1 = arith.constant 1 : index - %c8 = arith.constant 8 : index - %c145 = arith.constant 145 : i32 - %c0_i64 = arith.constant 0 : i64 - %c1_i64 = arith.constant 1 : i64 - %c32_i64 = arith.constant 32 : i64 - %c8192_i64 = arith.constant 8192 : i64 - %c10240_i64 = arith.constant 10240 : i64 - %false = arith.constant false - - %ub_mid = pto.castptr %c8192_i64 : i64 -> !pto.ptr - %ub_out = pto.castptr %c10240_i64 : i64 -> !pto.ptr - pto.mte_gm_ub %arg1, %ub_mid, %c0_i64, %c32_i64 - nburst(%c32_i64, %c32_i64, %c32_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 - - pto.set_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] - pto.wait_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] - - pto.vecscope { - %_:1 = scf.for %iv = %c0 to %c1 step %c1 iter_args(%remaining = %c145) -> (i32) { - %src, %next = pto.plt_b8 %remaining : i32 -> !pto.mask, i32 - pto.psti %src, %ub_mid[%c8], "PK" : !pto.mask, !pto.ptr, index - pto.mem_bar "VST_VLD" - %loaded = pto.pldi %ub_mid[%c8], "US" : !pto.ptr, index -> !pto.mask - pto.psts %loaded, %ub_out[%c0], 
"NORM" : !pto.mask, !pto.ptr, index - scf.yield %next : i32 - } - } - - pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.set_loop1_stride_ubtoout %c32_i64, %c32_i64 : i64, i64 - pto.set_loop2_stride_ubtoout %c32_i64, %c32_i64 : i64, i64 - pto.mte_ub_gm %ub_out, %arg2, %c32_i64 - nburst(%c32_i64, %c32_i64, %c32_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64 - pto.barrier #pto.pipe - return - } -} diff --git a/test/vpto/cases/micro-op/predicate-load-store/psti-pk-pldi-us/launch.cpp b/test/vpto/cases/micro-op/predicate-load-store/psti-pk-pldi-us/launch.cpp deleted file mode 100644 index 40a6df38d..000000000 --- a/test/vpto/cases/micro-op/predicate-load-store/psti-pk-pldi-us/launch.cpp +++ /dev/null @@ -1,48 +0,0 @@ -// Copyright (c) 2026 Huawei Technologies Co., Ltd. -// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). -// Please refer to the License for details. You may not use this file except in compliance with the License. -// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -// See LICENSE in the root of the software repository for the full text of the License. 
- -#ifndef __VEC_SCOPE__ -#define __VEC_SCOPE__ -#endif - -#if defined(__CCE_AICORE__) && defined(__NPU_ARCH__) && (__NPU_ARCH__ == 2201) -typedef struct { unsigned char v; } hifloat8_t; -typedef struct { unsigned char v; } float8_e4m3_t; -typedef struct { unsigned char v; } float8_e5m2_t; -typedef struct { unsigned char v; } float8_e8m0_t; -typedef struct { unsigned char v; } float4_e1m2x2_t; -typedef struct { unsigned char v; } float4_e2m1x2_t; -#endif -#include - -#if defined(__CCE_AICORE__) && defined(PTOAS_ENABLE_CCE_PRINT) -#include -#endif - -#if !defined(__CCE_AICORE__) && !defined(TMRGSORT_HPP) -struct MrgSortExecutedNumList { - uint16_t mrgSortList0; - uint16_t mrgSortList1; - uint16_t mrgSortList2; - uint16_t mrgSortList3; -}; -#endif -#ifndef __CPU_SIM -#include "acl/acl.h" -#endif - -extern "C" __global__ [aicore] void psti_pk_pldi_us_kernel_2d(__gm__ float *v1, - __gm__ unsigned char *v2, - __gm__ unsigned char *v3); - -void LaunchPsti_pk_pldi_us_kernel_2d(float *v1, unsigned char *v2, - unsigned char *v3, void *stream) { - psti_pk_pldi_us_kernel_2d<<<1, nullptr, stream>>>((__gm__ float *)v1, - (__gm__ unsigned char *)v2, - (__gm__ unsigned char *)v3); -} diff --git a/test/vpto/cases/micro-op/predicate-load-store/psti-pk-pldi-us/main.cpp b/test/vpto/cases/micro-op/predicate-load-store/psti-pk-pldi-us/main.cpp deleted file mode 100644 index 0f2567d2c..000000000 --- a/test/vpto/cases/micro-op/predicate-load-store/psti-pk-pldi-us/main.cpp +++ /dev/null @@ -1,95 +0,0 @@ -// Copyright (c) 2026 Huawei Technologies Co., Ltd. -// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). -// Please refer to the License for details. You may not use this file except in compliance with the License. 
-// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -// See LICENSE in the root of the software repository for the full text of the License. - -// ----------------------------------------------------------------------------- -// case: micro-op/predicate-load-store/psti-pk-pldi-us -// ----------------------------------------------------------------------------- -#include "test_common.h" -#include "acl/acl.h" -#include -#include - -using namespace PtoTestCommon; - -#define ACL_CHECK(expr) \ - do { \ - const aclError _ret = (expr); \ - if (_ret != ACL_SUCCESS) { \ - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", #expr, \ - (int)_ret, __FILE__, __LINE__); \ - const char *_recent = aclGetRecentErrMsg(); \ - if (_recent != nullptr && _recent[0] != '\0') \ - std::fprintf(stderr, "[ERROR] RecentErrMsg: %s\n", _recent); \ - rc = 1; \ - goto cleanup; \ - } \ - } while (0) - -void LaunchPsti_pk_pldi_us_kernel_2d(float *v1, unsigned char *v2, - unsigned char *v3, void *stream); - -int main() { - size_t fileSize_v1 = 1024 * sizeof(float); - size_t fileSize_v2 = 1024 * sizeof(unsigned char); - size_t fileSize_v3 = 1024 * sizeof(unsigned char); - float *v1Host = nullptr; - float *v1Device = nullptr; - unsigned char *v2Host = nullptr; - unsigned char *v2Device = nullptr; - unsigned char *v3Host = nullptr; - unsigned char *v3Device = nullptr; - - int rc = 0; - bool aclInited = false; - bool deviceSet = false; - int deviceId = 0; - aclrtStream stream = nullptr; - - ACL_CHECK(aclInit(nullptr)); - aclInited = true; - if (const char *envDevice = std::getenv("ACL_DEVICE_ID")) - deviceId = std::atoi(envDevice); - ACL_CHECK(aclrtSetDevice(deviceId)); - deviceSet = true; - ACL_CHECK(aclrtCreateStream(&stream)); - - ACL_CHECK(aclrtMallocHost((void **)(&v1Host), fileSize_v1)); - ACL_CHECK(aclrtMallocHost((void 
**)(&v2Host), fileSize_v2)); - ACL_CHECK(aclrtMallocHost((void **)(&v3Host), fileSize_v3)); - ACL_CHECK(aclrtMalloc((void **)&v1Device, fileSize_v1, ACL_MEM_MALLOC_HUGE_FIRST)); - ACL_CHECK(aclrtMalloc((void **)&v2Device, fileSize_v2, ACL_MEM_MALLOC_HUGE_FIRST)); - ACL_CHECK(aclrtMalloc((void **)&v3Device, fileSize_v3, ACL_MEM_MALLOC_HUGE_FIRST)); - - ReadFile("./v1.bin", fileSize_v1, v1Host, fileSize_v1); - ReadFile("./v2.bin", fileSize_v2, v2Host, fileSize_v2); - ReadFile("./v3.bin", fileSize_v3, v3Host, fileSize_v3); - ACL_CHECK(aclrtMemcpy(v1Device, fileSize_v1, v1Host, fileSize_v1, ACL_MEMCPY_HOST_TO_DEVICE)); - ACL_CHECK(aclrtMemcpy(v2Device, fileSize_v2, v2Host, fileSize_v2, ACL_MEMCPY_HOST_TO_DEVICE)); - ACL_CHECK(aclrtMemcpy(v3Device, fileSize_v3, v3Host, fileSize_v3, ACL_MEMCPY_HOST_TO_DEVICE)); - - LaunchPsti_pk_pldi_us_kernel_2d(v1Device, v2Device, v3Device, stream); - - ACL_CHECK(aclrtSynchronizeStream(stream)); - ACL_CHECK(aclrtMemcpy(v3Host, fileSize_v3, v3Device, fileSize_v3, ACL_MEMCPY_DEVICE_TO_HOST)); - WriteFile("./v3.bin", v3Host, fileSize_v3); - -cleanup: - aclrtFree(v1Device); - aclrtFree(v2Device); - aclrtFree(v3Device); - aclrtFreeHost(v1Host); - aclrtFreeHost(v2Host); - aclrtFreeHost(v3Host); - if (stream != nullptr) - aclrtDestroyStream(stream); - if (deviceSet) - aclrtResetDevice(deviceId); - if (aclInited) - aclFinalize(); - return rc; -} diff --git a/test/vpto/cases/micro-op/predicate-load-store/psti-pk/kernel.pto b/test/vpto/cases/micro-op/predicate-load-store/psti-pk/kernel.pto index 3987d3e5e..368be6a71 100644 --- a/test/vpto/cases/micro-op/predicate-load-store/psti-pk/kernel.pto +++ b/test/vpto/cases/micro-op/predicate-load-store/psti-pk/kernel.pto @@ -1,34 +1,76 @@ -// ----------------------------------------------------------------------------- -// case: micro-op/predicate-load-store/psti-pk -// family: predicate-load-store -// target_ops: pto.psti -// scenarios: packed-store, immediate-offset, representative-logical-elements -// 
----------------------------------------------------------------------------- module attributes {pto.target_arch = "a5", pto.kernel_kind = #pto.kernel_kind} { - func.func @psti_pk_kernel_2d(%arg0: !pto.ptr) attributes {pto.kernel} { - %c0 = arith.constant 0 : index - %c1 = arith.constant 1 : index - %c145 = arith.constant 145 : i32 - %c0_i64 = arith.constant 0 : i64 - %c1_i64 = arith.constant 1 : i64 - %c32_i64 = arith.constant 32 : i64 + func.func @psti_pk_deep_merged_kernel(%arg0: !pto.ptr, %arg1: !pto.ptr, %arg2: !pto.ptr, %arg3: !pto.ptr) attributes {pto.kernel} { + %__deep_merge_guard = arith.constant false + // inactive merged from psti_pk_pldi_us_kernel_2d + scf.if %__deep_merge_guard { - %ub_out = pto.castptr %c0_i64 : i64 -> !pto.ptr - %gm_out = pto.castptr %arg0 : !pto.ptr -> !pto.ptr + %c0_m0 = arith.constant 0 : index + %c1_m0 = arith.constant 1 : index + %c8_m0 = arith.constant 8 : index + %c145_m0 = arith.constant 145 : i32 + %c0_i64_m0 = arith.constant 0 : i64 + %c1_i64_m0 = arith.constant 1 : i64 + %c32_i64_m0 = arith.constant 32 : i64 + %c8192_i64_m0 = arith.constant 8192 : i64 + %c10240_i64_m0 = arith.constant 10240 : i64 + %false_m0 = arith.constant false + + %ub_mid_m0 = pto.castptr %c8192_i64_m0 : i64 -> !pto.ptr + %ub_out_m0 = pto.castptr %c10240_i64_m0 : i64 -> !pto.ptr + pto.mte_gm_ub %arg1, %ub_mid_m0, %c0_i64_m0, %c32_i64_m0 + nburst(%c32_i64_m0, %c32_i64_m0, %c32_i64_m0) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 + + pto.set_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] + pto.wait_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] + + pto.vecscope { + %__m0:1 = scf.for %iv_m0 = %c0_m0 to %c1_m0 step %c1_m0 iter_args(%remaining_m0 = %c145_m0) -> (i32) { + %src_m0, %next_m0 = pto.plt_b8 %remaining_m0 : i32 -> !pto.mask, i32 + pto.psti %src_m0, %ub_mid_m0[%c8_m0], "PK" : !pto.mask, !pto.ptr, index + pto.mem_bar "VST_VLD" + %loaded_m0 = pto.pldi %ub_mid_m0[%c8_m0], "US" : !pto.ptr, index -> !pto.mask + pto.psts %loaded_m0, %ub_out_m0[%c0_m0], "NORM" : 
!pto.mask, !pto.ptr, index + scf.yield %next_m0 : i32 + } + } + + pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] + pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] + pto.set_loop1_stride_ubtoout %c32_i64_m0, %c32_i64_m0 : i64, i64 + pto.set_loop2_stride_ubtoout %c32_i64_m0, %c32_i64_m0 : i64, i64 + pto.mte_ub_gm %ub_out_m0, %arg2, %c32_i64_m0 + nburst(%c32_i64_m0, %c32_i64_m0, %c32_i64_m0) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64 + pto.barrier #pto.pipe + + } + // active merged from psti_pk_kernel_2d + + %c0_m1 = arith.constant 0 : index + %c1_m1 = arith.constant 1 : index + %c145_m1 = arith.constant 145 : i32 + %c0_i64_m1 = arith.constant 0 : i64 + %c1_i64_m1 = arith.constant 1 : i64 + %c32_i64_m1 = arith.constant 32 : i64 + + %ub_out_m1 = pto.castptr %c0_i64_m1 : i64 -> !pto.ptr + %gm_out_m1 = pto.castptr %arg3 : !pto.ptr -> !pto.ptr pto.vecscope { - scf.for %iter = %c0 to %c1 step %c1 { - %src, %next = pto.plt_b8 %c145 : i32 -> !pto.mask, i32 - pto.psti %src, %ub_out[%c0], "PK" : !pto.mask, !pto.ptr, index + scf.for %iter_m1 = %c0_m1 to %c1_m1 step %c1_m1 { + %src_m1, %next_m1 = pto.plt_b8 %c145_m1 : i32 -> !pto.mask, i32 + pto.psti %src_m1, %ub_out_m1[%c0_m1], "PK" : !pto.mask, !pto.ptr, index } } pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.mte_ub_gm %ub_out, %gm_out, %c32_i64 - nburst(%c1_i64, %c32_i64, %c32_i64) + pto.mte_ub_gm %ub_out_m1, %gm_out_m1, %c32_i64_m1 + nburst(%c1_i64_m1, %c32_i64_m1, %c32_i64_m1) : !pto.ptr, !pto.ptr, i64, i64, i64, i64 pto.barrier #pto.pipe + return } } diff --git a/test/vpto/cases/micro-op/predicate-load-store/psti-pk/launch.cpp b/test/vpto/cases/micro-op/predicate-load-store/psti-pk/launch.cpp index 5be1e518d..510cb6d18 100644 --- a/test/vpto/cases/micro-op/predicate-load-store/psti-pk/launch.cpp +++ b/test/vpto/cases/micro-op/predicate-load-store/psti-pk/launch.cpp @@ -5,11 +5,9 @@ // THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY 
KIND, EITHER EXPRESS OR IMPLIED, // INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. // See LICENSE in the root of the software repository for the full text of the License. - #ifndef __VEC_SCOPE__ #define __VEC_SCOPE__ #endif - #if defined(__CCE_AICORE__) && defined(__NPU_ARCH__) && (__NPU_ARCH__ == 2201) typedef struct { unsigned char v; } hifloat8_t; typedef struct { unsigned char v; } float8_e4m3_t; @@ -19,25 +17,26 @@ typedef struct { unsigned char v; } float4_e1m2x2_t; typedef struct { unsigned char v; } float4_e2m1x2_t; #endif #include - #if defined(__CCE_AICORE__) && defined(PTOAS_ENABLE_CCE_PRINT) #include #endif - #if !defined(__CCE_AICORE__) && !defined(TMRGSORT_HPP) -struct MrgSortExecutedNumList { - uint16_t mrgSortList0; - uint16_t mrgSortList1; - uint16_t mrgSortList2; - uint16_t mrgSortList3; -}; +struct MrgSortExecutedNumList { uint16_t mrgSortList0,mrgSortList1,mrgSortList2,mrgSortList3; }; #endif #ifndef __CPU_SIM #include "acl/acl.h" #endif -extern "C" __global__ [aicore] void psti_pk_kernel_2d(__gm__ uint32_t *v1); +extern "C" __global__ [aicore] void psti_pk_deep_merged_kernel( + __gm__ float * arg0, + __gm__ uint8_t * arg1, + __gm__ uint8_t * arg2, + __gm__ uint32_t * arg3); -void LaunchPsti_pk_kernel_2d(uint32_t *v1, void *stream) { - psti_pk_kernel_2d<<<1, nullptr, stream>>>((__gm__ uint32_t *)v1); +void LaunchPstiPkDeepMerged(uint32_t * p0, void *stream) { + psti_pk_deep_merged_kernel<<<1, nullptr, stream>>>( + (__gm__ float *)p0, + (__gm__ uint8_t *)p0, + (__gm__ uint8_t *)p0, + (__gm__ uint32_t *)p0); } diff --git a/test/vpto/cases/micro-op/predicate-load-store/psti-pk/main.cpp b/test/vpto/cases/micro-op/predicate-load-store/psti-pk/main.cpp index 8be1b45c4..49f420afa 100644 --- a/test/vpto/cases/micro-op/predicate-load-store/psti-pk/main.cpp +++ b/test/vpto/cases/micro-op/predicate-load-store/psti-pk/main.cpp @@ -31,8 +31,8 @@ using namespace PtoTestCommon; } \ } while (0) -void 
LaunchPsti_pk_kernel_2d(uint32_t *v1, void *stream); +void LaunchPstiPkDeepMerged(uint32_t * p0, void *stream); int main() { size_t elemCount_v1 = 8; size_t fileSize_v1 = elemCount_v1 * sizeof(uint32_t); @@ -59,7 +59,7 @@ int main() { ReadFile("./v1.bin", fileSize_v1, v1Host, fileSize_v1); ACL_CHECK(aclrtMemcpy(v1Device, fileSize_v1, v1Host, fileSize_v1, ACL_MEMCPY_HOST_TO_DEVICE)); - LaunchPsti_pk_kernel_2d(v1Device, stream); + LaunchPstiPkDeepMerged(v1Device, stream); ACL_CHECK(aclrtSynchronizeStream(stream)); ACL_CHECK(aclrtMemcpy(v1Host, fileSize_v1, v1Device, fileSize_v1, ACL_MEMCPY_DEVICE_TO_HOST)); diff --git a/test/vpto/cases/micro-op/predicate-load-store/psts-norm-plds-ds/compare.py b/test/vpto/cases/micro-op/predicate-load-store/psts-norm-plds-ds/compare.py deleted file mode 100644 index 39299f639..000000000 --- a/test/vpto/cases/micro-op/predicate-load-store/psts-norm-plds-ds/compare.py +++ /dev/null @@ -1,37 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) 2026 Huawei Technologies Co., Ltd. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. 
- -# case: micro-op/predicate-load-store/psts-norm-plds-ds -# family: predicate-load-store -# target_ops: pto.plds, pto.psts -# scenarios: predicate-load-store-composition, dynamic-offset, load-store-pair-preservation, representative-logical-elements - -import os -import sys -from pathlib import Path - -sys.path.append(str(Path(__file__).resolve().parent.parent)) - -from _predicate_load_store_case import compare_norm_store - - -def main(): - strict = os.getenv("COMPARE_STRICT", "1") != "0" - ok = compare_norm_store("golden_v3.bin", "v3.bin") - if not ok: - if strict: - print("[ERROR] compare failed") - sys.exit(2) - print("[WARN] compare failed (non-gating)") - return - print("[INFO] compare passed") - - -if __name__ == "__main__": - main() diff --git a/test/vpto/cases/micro-op/predicate-load-store/psts-norm-plds-ds/golden.py b/test/vpto/cases/micro-op/predicate-load-store/psts-norm-plds-ds/golden.py deleted file mode 100644 index 19a16409e..000000000 --- a/test/vpto/cases/micro-op/predicate-load-store/psts-norm-plds-ds/golden.py +++ /dev/null @@ -1,42 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) 2026 Huawei Technologies Co., Ltd. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. 
- -# case: micro-op/predicate-load-store/psts-norm-plds-ds -# family: predicate-load-store -# target_ops: pto.plds, pto.psts -# scenarios: predicate-load-store-composition, dynamic-offset, load-store-pair-preservation, representative-logical-elements - -import argparse -from pathlib import Path -import sys - -sys.path.append(str(Path(__file__).resolve().parent.parent)) - -from _predicate_load_store_case import norm_ds_compose, prefix_bits, write_case - - -SEED = 19 -ACTIVE_BITS = 175 - - -def generate(output_dir: Path, seed: int) -> None: - del seed - write_case(output_dir, norm_ds_compose(prefix_bits(ACTIVE_BITS))) - - -def main() -> None: - parser = argparse.ArgumentParser(description="Generate inputs/golden for psts-norm-plds-ds.") - parser.add_argument("--output-dir", type=Path, default=Path(".")) - parser.add_argument("--seed", type=int, default=SEED) - args = parser.parse_args() - generate(args.output_dir, args.seed) - - -if __name__ == "__main__": - main() diff --git a/test/vpto/cases/micro-op/predicate-load-store/psts-norm-plds-ds/kernel.pto b/test/vpto/cases/micro-op/predicate-load-store/psts-norm-plds-ds/kernel.pto deleted file mode 100644 index dac3c026f..000000000 --- a/test/vpto/cases/micro-op/predicate-load-store/psts-norm-plds-ds/kernel.pto +++ /dev/null @@ -1,56 +0,0 @@ -// ----------------------------------------------------------------------------- -// case: micro-op/predicate-load-store/psts-norm-plds-ds -// family: predicate-load-store -// target_ops: pto.plds, pto.psts -// scenarios: predicate-load-store-composition, dynamic-offset, load-store-pair-preservation, representative-logical-elements -// ----------------------------------------------------------------------------- -module attributes {pto.target_arch = "a5", pto.kernel_kind = #pto.kernel_kind} { - func.func @psts_norm_plds_ds_kernel_2d(%arg0: !pto.ptr, %arg1: !pto.ptr, - %arg2: !pto.ptr) attributes {pto.kernel} { - %c0 = arith.constant 0 : index - %c1 = arith.constant 1 : index - %c32 
= arith.constant 32 : index - %c64 = arith.constant 64 : index - %c175 = arith.constant 175 : i32 - %c0_i32 = arith.constant 0 : i32 - %c0_i64 = arith.constant 0 : i64 - %c1_i64 = arith.constant 1 : i64 - %c32_i64 = arith.constant 32 : i64 - %c8192_i64 = arith.constant 8192 : i64 - %c10240_i64 = arith.constant 10240 : i64 - %false = arith.constant false - - %ub_mid = pto.castptr %c8192_i64 : i64 -> !pto.ptr - %ub_out = pto.castptr %c10240_i64 : i64 -> !pto.ptr - pto.mte_gm_ub %arg1, %ub_mid, %c0_i64, %c32_i64 - nburst(%c32_i64, %c32_i64, %c32_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 - - pto.set_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] - pto.wait_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] - - pto.vecscope { - %_:1 = scf.for %iv = %c0 to %c1 step %c1 iter_args(%remaining = %c175) -> (i32) { - %byte_offset = arith.addi %iv, %c32 : index - %src, %next = pto.plt_b8 %remaining : i32 -> !pto.mask, i32 - %zero, %_unused = pto.plt_b8 %c0_i32 : i32 -> !pto.mask, i32 - pto.psts %src, %ub_mid[%byte_offset], "NORM" : !pto.mask, !pto.ptr, index - pto.psts %zero, %ub_mid[%c64], "NORM" : !pto.mask, !pto.ptr, index - pto.mem_bar "VST_VLD" - %loaded = pto.plds %ub_mid[%byte_offset], "DS" : !pto.ptr, index -> !pto.mask - pto.psts %loaded, %ub_out[%c0], "NORM" : !pto.mask, !pto.ptr, index - scf.yield %next : i32 - } - } - - pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.set_loop1_stride_ubtoout %c32_i64, %c32_i64 : i64, i64 - pto.set_loop2_stride_ubtoout %c32_i64, %c32_i64 : i64, i64 - pto.mte_ub_gm %ub_out, %arg2, %c32_i64 - nburst(%c32_i64, %c32_i64, %c32_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64 - pto.barrier #pto.pipe - return - } -} diff --git a/test/vpto/cases/micro-op/predicate-load-store/psts-norm-plds-ds/launch.cpp b/test/vpto/cases/micro-op/predicate-load-store/psts-norm-plds-ds/launch.cpp deleted file mode 100644 index 7c9a920cc..000000000 --- 
a/test/vpto/cases/micro-op/predicate-load-store/psts-norm-plds-ds/launch.cpp +++ /dev/null @@ -1,48 +0,0 @@ -// Copyright (c) 2026 Huawei Technologies Co., Ltd. -// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). -// Please refer to the License for details. You may not use this file except in compliance with the License. -// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -// See LICENSE in the root of the software repository for the full text of the License. - -#ifndef __VEC_SCOPE__ -#define __VEC_SCOPE__ -#endif - -#if defined(__CCE_AICORE__) && defined(__NPU_ARCH__) && (__NPU_ARCH__ == 2201) -typedef struct { unsigned char v; } hifloat8_t; -typedef struct { unsigned char v; } float8_e4m3_t; -typedef struct { unsigned char v; } float8_e5m2_t; -typedef struct { unsigned char v; } float8_e8m0_t; -typedef struct { unsigned char v; } float4_e1m2x2_t; -typedef struct { unsigned char v; } float4_e2m1x2_t; -#endif -#include - -#if defined(__CCE_AICORE__) && defined(PTOAS_ENABLE_CCE_PRINT) -#include -#endif - -#if !defined(__CCE_AICORE__) && !defined(TMRGSORT_HPP) -struct MrgSortExecutedNumList { - uint16_t mrgSortList0; - uint16_t mrgSortList1; - uint16_t mrgSortList2; - uint16_t mrgSortList3; -}; -#endif -#ifndef __CPU_SIM -#include "acl/acl.h" -#endif - -extern "C" __global__ [aicore] void psts_norm_plds_ds_kernel_2d(__gm__ float *v1, - __gm__ unsigned char *v2, - __gm__ unsigned char *v3); - -void LaunchPsts_norm_plds_ds_kernel_2d(float *v1, unsigned char *v2, - unsigned char *v3, void *stream) { - psts_norm_plds_ds_kernel_2d<<<1, nullptr, stream>>>((__gm__ float *)v1, - (__gm__ unsigned char *)v2, - (__gm__ unsigned char *)v3); -} diff --git 
a/test/vpto/cases/micro-op/predicate-load-store/psts-norm-plds-ds/main.cpp b/test/vpto/cases/micro-op/predicate-load-store/psts-norm-plds-ds/main.cpp deleted file mode 100644 index 2f0be35a7..000000000 --- a/test/vpto/cases/micro-op/predicate-load-store/psts-norm-plds-ds/main.cpp +++ /dev/null @@ -1,95 +0,0 @@ -// Copyright (c) 2026 Huawei Technologies Co., Ltd. -// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). -// Please refer to the License for details. You may not use this file except in compliance with the License. -// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -// See LICENSE in the root of the software repository for the full text of the License. - -// ----------------------------------------------------------------------------- -// case: micro-op/predicate-load-store/psts-norm-plds-ds -// ----------------------------------------------------------------------------- -#include "test_common.h" -#include "acl/acl.h" -#include -#include - -using namespace PtoTestCommon; - -#define ACL_CHECK(expr) \ - do { \ - const aclError _ret = (expr); \ - if (_ret != ACL_SUCCESS) { \ - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", #expr, \ - (int)_ret, __FILE__, __LINE__); \ - const char *_recent = aclGetRecentErrMsg(); \ - if (_recent != nullptr && _recent[0] != '\0') \ - std::fprintf(stderr, "[ERROR] RecentErrMsg: %s\n", _recent); \ - rc = 1; \ - goto cleanup; \ - } \ - } while (0) - -void LaunchPsts_norm_plds_ds_kernel_2d(float *v1, unsigned char *v2, - unsigned char *v3, void *stream); - -int main() { - size_t fileSize_v1 = 1024 * sizeof(float); - size_t fileSize_v2 = 1024 * sizeof(unsigned char); - size_t fileSize_v3 = 1024 * sizeof(unsigned char); - float *v1Host = 
nullptr; - float *v1Device = nullptr; - unsigned char *v2Host = nullptr; - unsigned char *v2Device = nullptr; - unsigned char *v3Host = nullptr; - unsigned char *v3Device = nullptr; - - int rc = 0; - bool aclInited = false; - bool deviceSet = false; - int deviceId = 0; - aclrtStream stream = nullptr; - - ACL_CHECK(aclInit(nullptr)); - aclInited = true; - if (const char *envDevice = std::getenv("ACL_DEVICE_ID")) - deviceId = std::atoi(envDevice); - ACL_CHECK(aclrtSetDevice(deviceId)); - deviceSet = true; - ACL_CHECK(aclrtCreateStream(&stream)); - - ACL_CHECK(aclrtMallocHost((void **)(&v1Host), fileSize_v1)); - ACL_CHECK(aclrtMallocHost((void **)(&v2Host), fileSize_v2)); - ACL_CHECK(aclrtMallocHost((void **)(&v3Host), fileSize_v3)); - ACL_CHECK(aclrtMalloc((void **)&v1Device, fileSize_v1, ACL_MEM_MALLOC_HUGE_FIRST)); - ACL_CHECK(aclrtMalloc((void **)&v2Device, fileSize_v2, ACL_MEM_MALLOC_HUGE_FIRST)); - ACL_CHECK(aclrtMalloc((void **)&v3Device, fileSize_v3, ACL_MEM_MALLOC_HUGE_FIRST)); - - ReadFile("./v1.bin", fileSize_v1, v1Host, fileSize_v1); - ReadFile("./v2.bin", fileSize_v2, v2Host, fileSize_v2); - ReadFile("./v3.bin", fileSize_v3, v3Host, fileSize_v3); - ACL_CHECK(aclrtMemcpy(v1Device, fileSize_v1, v1Host, fileSize_v1, ACL_MEMCPY_HOST_TO_DEVICE)); - ACL_CHECK(aclrtMemcpy(v2Device, fileSize_v2, v2Host, fileSize_v2, ACL_MEMCPY_HOST_TO_DEVICE)); - ACL_CHECK(aclrtMemcpy(v3Device, fileSize_v3, v3Host, fileSize_v3, ACL_MEMCPY_HOST_TO_DEVICE)); - - LaunchPsts_norm_plds_ds_kernel_2d(v1Device, v2Device, v3Device, stream); - - ACL_CHECK(aclrtSynchronizeStream(stream)); - ACL_CHECK(aclrtMemcpy(v3Host, fileSize_v3, v3Device, fileSize_v3, ACL_MEMCPY_DEVICE_TO_HOST)); - WriteFile("./v3.bin", v3Host, fileSize_v3); - -cleanup: - aclrtFree(v1Device); - aclrtFree(v2Device); - aclrtFree(v3Device); - aclrtFreeHost(v1Host); - aclrtFreeHost(v2Host); - aclrtFreeHost(v3Host); - if (stream != nullptr) - aclrtDestroyStream(stream); - if (deviceSet) - aclrtResetDevice(deviceId); - if 
(aclInited) - aclFinalize(); - return rc; -} diff --git a/test/vpto/cases/micro-op/predicate-load-store/psts-pk-plds-us-prefix-boundary/compare.py b/test/vpto/cases/micro-op/predicate-load-store/psts-pk-plds-us-prefix-boundary/compare.py deleted file mode 100644 index fe48e2bdb..000000000 --- a/test/vpto/cases/micro-op/predicate-load-store/psts-pk-plds-us-prefix-boundary/compare.py +++ /dev/null @@ -1,37 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) 2026 Huawei Technologies Co., Ltd. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. 
- -# case: micro-op/predicate-load-store/psts-pk-plds-us-prefix-boundary -# family: predicate-load-store -# target_ops: pto.plds, pto.psts -# scenarios: predicate-load-store-composition, dynamic-offset, load-store-pair-preservation, representative-logical-elements - -import os -import sys -from pathlib import Path - -sys.path.append(str(Path(__file__).resolve().parent.parent)) - -from _predicate_load_store_case import compare_norm_store - - -def main(): - strict = os.getenv("COMPARE_STRICT", "1") != "0" - ok = compare_norm_store("golden_v3.bin", "v3.bin") - if not ok: - if strict: - print("[ERROR] compare failed") - sys.exit(2) - print("[WARN] compare failed (non-gating)") - return - print("[INFO] compare passed") - - -if __name__ == "__main__": - main() diff --git a/test/vpto/cases/micro-op/predicate-load-store/psts-pk-plds-us-prefix-boundary/golden.py b/test/vpto/cases/micro-op/predicate-load-store/psts-pk-plds-us-prefix-boundary/golden.py deleted file mode 100644 index f2ab5e6e3..000000000 --- a/test/vpto/cases/micro-op/predicate-load-store/psts-pk-plds-us-prefix-boundary/golden.py +++ /dev/null @@ -1,44 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) 2026 Huawei Technologies Co., Ltd. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. 
- -# case: micro-op/predicate-load-store/psts-pk-plds-us-prefix-boundary -# family: predicate-load-store -# target_ops: pto.plds, pto.psts -# scenarios: predicate-load-store-composition, dynamic-offset, load-store-pair-preservation, representative-logical-elements - -import argparse -from pathlib import Path -import sys - -sys.path.append(str(Path(__file__).resolve().parent.parent)) - -from _predicate_load_store_case import pk_us_compose, prefix_bits, write_case - - -SEED = 19 -ACTIVE_BITS = 173 - - -def generate(output_dir: Path, seed: int) -> None: - del seed - write_case(output_dir, pk_us_compose(prefix_bits(ACTIVE_BITS))) - - -def main() -> None: - parser = argparse.ArgumentParser( - description="Generate inputs/golden for psts-pk-plds-us-prefix-boundary." - ) - parser.add_argument("--output-dir", type=Path, default=Path(".")) - parser.add_argument("--seed", type=int, default=SEED) - args = parser.parse_args() - generate(args.output_dir, args.seed) - - -if __name__ == "__main__": - main() diff --git a/test/vpto/cases/micro-op/predicate-load-store/psts-pk-plds-us-prefix-boundary/kernel.pto b/test/vpto/cases/micro-op/predicate-load-store/psts-pk-plds-us-prefix-boundary/kernel.pto deleted file mode 100644 index c6f014273..000000000 --- a/test/vpto/cases/micro-op/predicate-load-store/psts-pk-plds-us-prefix-boundary/kernel.pto +++ /dev/null @@ -1,53 +0,0 @@ -// ----------------------------------------------------------------------------- -// case: micro-op/predicate-load-store/psts-pk-plds-us-prefix-boundary -// family: predicate-load-store -// target_ops: pto.plds, pto.psts -// scenarios: predicate-load-store-composition, dynamic-offset, load-store-pair-preservation, representative-logical-elements -// ----------------------------------------------------------------------------- -module attributes {pto.target_arch = "a5", pto.kernel_kind = #pto.kernel_kind} { - func.func @psts_pk_plds_us_prefix_boundary_kernel_2d(%arg0: !pto.ptr, - %arg1: !pto.ptr, - %arg2: 
!pto.ptr) attributes {pto.kernel} { - %c0 = arith.constant 0 : index - %c1 = arith.constant 1 : index - %c16 = arith.constant 16 : index - %c173 = arith.constant 173 : i32 - %c0_i64 = arith.constant 0 : i64 - %c1_i64 = arith.constant 1 : i64 - %c32_i64 = arith.constant 32 : i64 - %c8192_i64 = arith.constant 8192 : i64 - %c10240_i64 = arith.constant 10240 : i64 - %false = arith.constant false - - %ub_mid = pto.castptr %c8192_i64 : i64 -> !pto.ptr - %ub_out = pto.castptr %c10240_i64 : i64 -> !pto.ptr - pto.mte_gm_ub %arg1, %ub_mid, %c0_i64, %c32_i64 - nburst(%c32_i64, %c32_i64, %c32_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 - - pto.set_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] - pto.wait_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] - - pto.vecscope { - %_:1 = scf.for %iv = %c0 to %c1 step %c1 iter_args(%remaining = %c173) -> (i32) { - %byte_offset = arith.addi %iv, %c16 : index - %src, %next = pto.plt_b8 %remaining : i32 -> !pto.mask, i32 - pto.psts %src, %ub_mid[%byte_offset], "PK" : !pto.mask, !pto.ptr, index - pto.mem_bar "VST_VLD" - %loaded = pto.plds %ub_mid[%byte_offset], "US" : !pto.ptr, index -> !pto.mask - pto.psts %loaded, %ub_out[%c0], "NORM" : !pto.mask, !pto.ptr, index - scf.yield %next : i32 - } - } - - pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.set_loop1_stride_ubtoout %c32_i64, %c32_i64 : i64, i64 - pto.set_loop2_stride_ubtoout %c32_i64, %c32_i64 : i64, i64 - pto.mte_ub_gm %ub_out, %arg2, %c32_i64 - nburst(%c32_i64, %c32_i64, %c32_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64 - pto.barrier #pto.pipe - return - } -} diff --git a/test/vpto/cases/micro-op/predicate-load-store/psts-pk-plds-us-prefix-boundary/launch.cpp b/test/vpto/cases/micro-op/predicate-load-store/psts-pk-plds-us-prefix-boundary/launch.cpp deleted file mode 100644 index a2d8377c7..000000000 --- a/test/vpto/cases/micro-op/predicate-load-store/psts-pk-plds-us-prefix-boundary/launch.cpp +++ /dev/null @@ -1,47 +0,0 @@ 
-// Copyright (c) 2026 Huawei Technologies Co., Ltd. -// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). -// Please refer to the License for details. You may not use this file except in compliance with the License. -// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -// See LICENSE in the root of the software repository for the full text of the License. - -#ifndef __VEC_SCOPE__ -#define __VEC_SCOPE__ -#endif - -#if defined(__CCE_AICORE__) && defined(__NPU_ARCH__) && (__NPU_ARCH__ == 2201) -typedef struct { unsigned char v; } hifloat8_t; -typedef struct { unsigned char v; } float8_e4m3_t; -typedef struct { unsigned char v; } float8_e5m2_t; -typedef struct { unsigned char v; } float8_e8m0_t; -typedef struct { unsigned char v; } float4_e1m2x2_t; -typedef struct { unsigned char v; } float4_e2m1x2_t; -#endif -#include - -#if defined(__CCE_AICORE__) && defined(PTOAS_ENABLE_CCE_PRINT) -#include -#endif - -#if !defined(__CCE_AICORE__) && !defined(TMRGSORT_HPP) -struct MrgSortExecutedNumList { - uint16_t mrgSortList0; - uint16_t mrgSortList1; - uint16_t mrgSortList2; - uint16_t mrgSortList3; -}; -#endif -#ifndef __CPU_SIM -#include "acl/acl.h" -#endif - -extern "C" __global__ [aicore] void psts_pk_plds_us_prefix_boundary_kernel_2d(__gm__ float *v1, - __gm__ unsigned char *v2, - __gm__ unsigned char *v3); - -void LaunchPsts_pk_plds_us_prefix_boundary_kernel_2d(float *v1, unsigned char *v2, - unsigned char *v3, void *stream) { - psts_pk_plds_us_prefix_boundary_kernel_2d<<<1, nullptr, stream>>>( - (__gm__ float *)v1, (__gm__ unsigned char *)v2, (__gm__ unsigned char *)v3); -} diff --git a/test/vpto/cases/micro-op/predicate-load-store/psts-pk-plds-us-prefix-boundary/main.cpp 
b/test/vpto/cases/micro-op/predicate-load-store/psts-pk-plds-us-prefix-boundary/main.cpp deleted file mode 100644 index d76ca6ac1..000000000 --- a/test/vpto/cases/micro-op/predicate-load-store/psts-pk-plds-us-prefix-boundary/main.cpp +++ /dev/null @@ -1,95 +0,0 @@ -// Copyright (c) 2026 Huawei Technologies Co., Ltd. -// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). -// Please refer to the License for details. You may not use this file except in compliance with the License. -// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -// See LICENSE in the root of the software repository for the full text of the License. - -// ----------------------------------------------------------------------------- -// case: micro-op/predicate-load-store/psts-pk-plds-us-prefix-boundary -// ----------------------------------------------------------------------------- -#include "test_common.h" -#include "acl/acl.h" -#include -#include - -using namespace PtoTestCommon; - -#define ACL_CHECK(expr) \ - do { \ - const aclError _ret = (expr); \ - if (_ret != ACL_SUCCESS) { \ - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", #expr, \ - (int)_ret, __FILE__, __LINE__); \ - const char *_recent = aclGetRecentErrMsg(); \ - if (_recent != nullptr && _recent[0] != '\0') \ - std::fprintf(stderr, "[ERROR] RecentErrMsg: %s\n", _recent); \ - rc = 1; \ - goto cleanup; \ - } \ - } while (0) - -void LaunchPsts_pk_plds_us_prefix_boundary_kernel_2d(float *v1, unsigned char *v2, - unsigned char *v3, void *stream); - -int main() { - size_t fileSize_v1 = 1024 * sizeof(float); - size_t fileSize_v2 = 1024 * sizeof(unsigned char); - size_t fileSize_v3 = 1024 * sizeof(unsigned char); - float *v1Host = nullptr; - float 
*v1Device = nullptr; - unsigned char *v2Host = nullptr; - unsigned char *v2Device = nullptr; - unsigned char *v3Host = nullptr; - unsigned char *v3Device = nullptr; - - int rc = 0; - bool aclInited = false; - bool deviceSet = false; - int deviceId = 0; - aclrtStream stream = nullptr; - - ACL_CHECK(aclInit(nullptr)); - aclInited = true; - if (const char *envDevice = std::getenv("ACL_DEVICE_ID")) - deviceId = std::atoi(envDevice); - ACL_CHECK(aclrtSetDevice(deviceId)); - deviceSet = true; - ACL_CHECK(aclrtCreateStream(&stream)); - - ACL_CHECK(aclrtMallocHost((void **)(&v1Host), fileSize_v1)); - ACL_CHECK(aclrtMallocHost((void **)(&v2Host), fileSize_v2)); - ACL_CHECK(aclrtMallocHost((void **)(&v3Host), fileSize_v3)); - ACL_CHECK(aclrtMalloc((void **)&v1Device, fileSize_v1, ACL_MEM_MALLOC_HUGE_FIRST)); - ACL_CHECK(aclrtMalloc((void **)&v2Device, fileSize_v2, ACL_MEM_MALLOC_HUGE_FIRST)); - ACL_CHECK(aclrtMalloc((void **)&v3Device, fileSize_v3, ACL_MEM_MALLOC_HUGE_FIRST)); - - ReadFile("./v1.bin", fileSize_v1, v1Host, fileSize_v1); - ReadFile("./v2.bin", fileSize_v2, v2Host, fileSize_v2); - ReadFile("./v3.bin", fileSize_v3, v3Host, fileSize_v3); - ACL_CHECK(aclrtMemcpy(v1Device, fileSize_v1, v1Host, fileSize_v1, ACL_MEMCPY_HOST_TO_DEVICE)); - ACL_CHECK(aclrtMemcpy(v2Device, fileSize_v2, v2Host, fileSize_v2, ACL_MEMCPY_HOST_TO_DEVICE)); - ACL_CHECK(aclrtMemcpy(v3Device, fileSize_v3, v3Host, fileSize_v3, ACL_MEMCPY_HOST_TO_DEVICE)); - - LaunchPsts_pk_plds_us_prefix_boundary_kernel_2d(v1Device, v2Device, v3Device, stream); - - ACL_CHECK(aclrtSynchronizeStream(stream)); - ACL_CHECK(aclrtMemcpy(v3Host, fileSize_v3, v3Device, fileSize_v3, ACL_MEMCPY_DEVICE_TO_HOST)); - WriteFile("./v3.bin", v3Host, fileSize_v3); - -cleanup: - aclrtFree(v1Device); - aclrtFree(v2Device); - aclrtFree(v3Device); - aclrtFreeHost(v1Host); - aclrtFreeHost(v2Host); - aclrtFreeHost(v3Host); - if (stream != nullptr) - aclrtDestroyStream(stream); - if (deviceSet) - aclrtResetDevice(deviceId); - if 
(aclInited) - aclFinalize(); - return rc; -} diff --git a/test/vpto/cases/micro-op/predicate-load-store/psts-pk-plds-us/kernel.pto b/test/vpto/cases/micro-op/predicate-load-store/psts-pk-plds-us/kernel.pto index 6af9f336a..22181c10b 100644 --- a/test/vpto/cases/micro-op/predicate-load-store/psts-pk-plds-us/kernel.pto +++ b/test/vpto/cases/micro-op/predicate-load-store/psts-pk-plds-us/kernel.pto @@ -1,52 +1,94 @@ -// ----------------------------------------------------------------------------- -// case: micro-op/predicate-load-store/psts-pk-plds-us -// family: predicate-load-store -// target_ops: pto.plds, pto.psts -// scenarios: predicate-load-store-composition, dynamic-offset, load-store-pair-preservation, representative-logical-elements -// ----------------------------------------------------------------------------- module attributes {pto.target_arch = "a5", pto.kernel_kind = #pto.kernel_kind} { - func.func @psts_pk_plds_us_kernel_2d(%arg0: !pto.ptr, %arg1: !pto.ptr, - %arg2: !pto.ptr) attributes {pto.kernel} { - %c0 = arith.constant 0 : index - %c1 = arith.constant 1 : index - %c16 = arith.constant 16 : index - %c171 = arith.constant 171 : i32 - %c0_i64 = arith.constant 0 : i64 - %c1_i64 = arith.constant 1 : i64 - %c32_i64 = arith.constant 32 : i64 - %c8192_i64 = arith.constant 8192 : i64 - %c10240_i64 = arith.constant 10240 : i64 - %false = arith.constant false - - %ub_mid = pto.castptr %c8192_i64 : i64 -> !pto.ptr - %ub_out = pto.castptr %c10240_i64 : i64 -> !pto.ptr - pto.mte_gm_ub %arg1, %ub_mid, %c0_i64, %c32_i64 - nburst(%c32_i64, %c32_i64, %c32_i64) + func.func @psts_pk_plds_us_deep_merged_kernel(%arg0: !pto.ptr, %arg1: !pto.ptr, %arg2: !pto.ptr, %arg3: !pto.ptr, %arg4: !pto.ptr, %arg5: !pto.ptr) attributes {pto.kernel} { + %__deep_merge_guard = arith.constant false + // inactive merged from psts_pk_plds_us_prefix_boundary_kernel_2d + scf.if %__deep_merge_guard { + + %c0_m0 = arith.constant 0 : index + %c1_m0 = arith.constant 1 : index + %c16_m0 = 
arith.constant 16 : index + %c173_m0 = arith.constant 173 : i32 + %c0_i64_m0 = arith.constant 0 : i64 + %c1_i64_m0 = arith.constant 1 : i64 + %c32_i64_m0 = arith.constant 32 : i64 + %c8192_i64_m0 = arith.constant 8192 : i64 + %c10240_i64_m0 = arith.constant 10240 : i64 + %false_m0 = arith.constant false + + %ub_mid_m0 = pto.castptr %c8192_i64_m0 : i64 -> !pto.ptr + %ub_out_m0 = pto.castptr %c10240_i64_m0 : i64 -> !pto.ptr + pto.mte_gm_ub %arg1, %ub_mid_m0, %c0_i64_m0, %c32_i64_m0 + nburst(%c32_i64_m0, %c32_i64_m0, %c32_i64_m0) : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 pto.set_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] pto.wait_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] pto.vecscope { - %_:1 = scf.for %iv = %c0 to %c1 step %c1 iter_args(%remaining = %c171) -> (i32) { - %byte_offset = arith.addi %iv, %c16 : index - %src, %next = pto.plt_b8 %remaining : i32 -> !pto.mask, i32 - pto.psts %src, %ub_mid[%byte_offset], "PK" : !pto.mask, !pto.ptr, index + %__m0:1 = scf.for %iv_m0 = %c0_m0 to %c1_m0 step %c1_m0 iter_args(%remaining_m0 = %c173_m0) -> (i32) { + %byte_offset_m0 = arith.addi %iv_m0, %c16_m0 : index + %src_m0, %next_m0 = pto.plt_b8 %remaining_m0 : i32 -> !pto.mask, i32 + pto.psts %src_m0, %ub_mid_m0[%byte_offset_m0], "PK" : !pto.mask, !pto.ptr, index pto.mem_bar "VST_VLD" - %loaded = pto.plds %ub_mid[%byte_offset], "US" : !pto.ptr, index -> !pto.mask - pto.psts %loaded, %ub_out[%c0], "NORM" : !pto.mask, !pto.ptr, index - scf.yield %next : i32 + %loaded_m0 = pto.plds %ub_mid_m0[%byte_offset_m0], "US" : !pto.ptr, index -> !pto.mask + pto.psts %loaded_m0, %ub_out_m0[%c0_m0], "NORM" : !pto.mask, !pto.ptr, index + scf.yield %next_m0 : i32 } } pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.set_loop1_stride_ubtoout %c32_i64, %c32_i64 : i64, i64 - pto.set_loop2_stride_ubtoout %c32_i64, %c32_i64 : i64, i64 - pto.mte_ub_gm %ub_out, %arg2, %c32_i64 - nburst(%c32_i64, %c32_i64, %c32_i64) + pto.set_loop1_stride_ubtoout 
%c32_i64_m0, %c32_i64_m0 : i64, i64 + pto.set_loop2_stride_ubtoout %c32_i64_m0, %c32_i64_m0 : i64, i64 + pto.mte_ub_gm %ub_out_m0, %arg2, %c32_i64_m0 + nburst(%c32_i64_m0, %c32_i64_m0, %c32_i64_m0) : !pto.ptr, !pto.ptr, i64, i64, i64, i64 pto.barrier #pto.pipe + + } + // active merged from psts_pk_plds_us_kernel_2d + + %c0_m1 = arith.constant 0 : index + %c1_m1 = arith.constant 1 : index + %c16_m1 = arith.constant 16 : index + %c171_m1 = arith.constant 171 : i32 + %c0_i64_m1 = arith.constant 0 : i64 + %c1_i64_m1 = arith.constant 1 : i64 + %c32_i64_m1 = arith.constant 32 : i64 + %c8192_i64_m1 = arith.constant 8192 : i64 + %c10240_i64_m1 = arith.constant 10240 : i64 + %false_m1 = arith.constant false + + %ub_mid_m1 = pto.castptr %c8192_i64_m1 : i64 -> !pto.ptr + %ub_out_m1 = pto.castptr %c10240_i64_m1 : i64 -> !pto.ptr + pto.mte_gm_ub %arg4, %ub_mid_m1, %c0_i64_m1, %c32_i64_m1 + nburst(%c32_i64_m1, %c32_i64_m1, %c32_i64_m1) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 + + pto.set_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] + pto.wait_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] + + pto.vecscope { + %__m1:1 = scf.for %iv_m1 = %c0_m1 to %c1_m1 step %c1_m1 iter_args(%remaining_m1 = %c171_m1) -> (i32) { + %byte_offset_m1 = arith.addi %iv_m1, %c16_m1 : index + %src_m1, %next_m1 = pto.plt_b8 %remaining_m1 : i32 -> !pto.mask, i32 + pto.psts %src_m1, %ub_mid_m1[%byte_offset_m1], "PK" : !pto.mask, !pto.ptr, index + pto.mem_bar "VST_VLD" + %loaded_m1 = pto.plds %ub_mid_m1[%byte_offset_m1], "US" : !pto.ptr, index -> !pto.mask + pto.psts %loaded_m1, %ub_out_m1[%c0_m1], "NORM" : !pto.mask, !pto.ptr, index + scf.yield %next_m1 : i32 + } + } + + pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] + pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] + pto.set_loop1_stride_ubtoout %c32_i64_m1, %c32_i64_m1 : i64, i64 + pto.set_loop2_stride_ubtoout %c32_i64_m1, %c32_i64_m1 : i64, i64 + pto.mte_ub_gm %ub_out_m1, %arg5, %c32_i64_m1 + nburst(%c32_i64_m1, %c32_i64_m1, %c32_i64_m1) + : !pto.ptr, 
!pto.ptr, i64, i64, i64, i64 + pto.barrier #pto.pipe + return } } diff --git a/test/vpto/cases/micro-op/predicate-load-store/psts-pk-plds-us/launch.cpp b/test/vpto/cases/micro-op/predicate-load-store/psts-pk-plds-us/launch.cpp index acc8dfb7b..fef5559d1 100644 --- a/test/vpto/cases/micro-op/predicate-load-store/psts-pk-plds-us/launch.cpp +++ b/test/vpto/cases/micro-op/predicate-load-store/psts-pk-plds-us/launch.cpp @@ -5,11 +5,9 @@ // THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, // INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. // See LICENSE in the root of the software repository for the full text of the License. - #ifndef __VEC_SCOPE__ #define __VEC_SCOPE__ #endif - #if defined(__CCE_AICORE__) && defined(__NPU_ARCH__) && (__NPU_ARCH__ == 2201) typedef struct { unsigned char v; } hifloat8_t; typedef struct { unsigned char v; } float8_e4m3_t; @@ -19,30 +17,30 @@ typedef struct { unsigned char v; } float4_e1m2x2_t; typedef struct { unsigned char v; } float4_e2m1x2_t; #endif #include - #if defined(__CCE_AICORE__) && defined(PTOAS_ENABLE_CCE_PRINT) #include #endif - #if !defined(__CCE_AICORE__) && !defined(TMRGSORT_HPP) -struct MrgSortExecutedNumList { - uint16_t mrgSortList0; - uint16_t mrgSortList1; - uint16_t mrgSortList2; - uint16_t mrgSortList3; -}; +struct MrgSortExecutedNumList { uint16_t mrgSortList0,mrgSortList1,mrgSortList2,mrgSortList3; }; #endif #ifndef __CPU_SIM #include "acl/acl.h" #endif -extern "C" __global__ [aicore] void psts_pk_plds_us_kernel_2d(__gm__ float *v1, - __gm__ unsigned char *v2, - __gm__ unsigned char *v3); +extern "C" __global__ [aicore] void psts_pk_plds_us_deep_merged_kernel( + __gm__ float * arg0, + __gm__ uint8_t * arg1, + __gm__ uint8_t * arg2, + __gm__ float * arg3, + __gm__ uint8_t * arg4, + __gm__ uint8_t * arg5); -void LaunchPsts_pk_plds_us_kernel_2d(float *v1, unsigned char *v2, - unsigned char *v3, void *stream) 
{ - psts_pk_plds_us_kernel_2d<<<1, nullptr, stream>>>((__gm__ float *)v1, - (__gm__ unsigned char *)v2, - (__gm__ unsigned char *)v3); +void LaunchPstsPkPldsUsDeepMerged(float * p0, unsigned char * p1, unsigned char * p2, void *stream) { + psts_pk_plds_us_deep_merged_kernel<<<1, nullptr, stream>>>( + (__gm__ float *)p0, + (__gm__ uint8_t *)p0, + (__gm__ uint8_t *)p0, + (__gm__ float *)p0, + (__gm__ uint8_t *)p1, + (__gm__ uint8_t *)p2); } diff --git a/test/vpto/cases/micro-op/predicate-load-store/psts-pk-plds-us/main.cpp b/test/vpto/cases/micro-op/predicate-load-store/psts-pk-plds-us/main.cpp index 1462814f4..54a9f2a13 100644 --- a/test/vpto/cases/micro-op/predicate-load-store/psts-pk-plds-us/main.cpp +++ b/test/vpto/cases/micro-op/predicate-load-store/psts-pk-plds-us/main.cpp @@ -30,9 +30,8 @@ using namespace PtoTestCommon; } \ } while (0) -void LaunchPsts_pk_plds_us_kernel_2d(float *v1, unsigned char *v2, - unsigned char *v3, void *stream); +void LaunchPstsPkPldsUsDeepMerged(float * p0, unsigned char * p1, unsigned char * p2, void *stream); int main() { size_t fileSize_v1 = 1024 * sizeof(float); size_t fileSize_v2 = 1024 * sizeof(unsigned char); @@ -72,7 +71,7 @@ int main() { ACL_CHECK(aclrtMemcpy(v2Device, fileSize_v2, v2Host, fileSize_v2, ACL_MEMCPY_HOST_TO_DEVICE)); ACL_CHECK(aclrtMemcpy(v3Device, fileSize_v3, v3Host, fileSize_v3, ACL_MEMCPY_HOST_TO_DEVICE)); - LaunchPsts_pk_plds_us_kernel_2d(v1Device, v2Device, v3Device, stream); + LaunchPstsPkPldsUsDeepMerged(v1Device, v2Device, v3Device, stream); ACL_CHECK(aclrtSynchronizeStream(stream)); ACL_CHECK(aclrtMemcpy(v3Host, fileSize_v3, v3Device, fileSize_v3, ACL_MEMCPY_DEVICE_TO_HOST)); diff --git a/test/vpto/cases/micro-op/predicate-load-store/pstu-init-align-outside-loop/compare.py b/test/vpto/cases/micro-op/predicate-load-store/pstu-init-align-outside-loop/compare.py deleted file mode 100644 index 845f5233e..000000000 --- a/test/vpto/cases/micro-op/predicate-load-store/pstu-init-align-outside-loop/compare.py 
+++ /dev/null @@ -1,50 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) 2026 Huawei Technologies Co., Ltd. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. - -# case: micro-op/predicate-load-store/pstu-init-align-outside-loop -# family: predicate-load-store -# target_ops: pto.pstu -# scenarios: unaligned-predicate-store, state-update, representative-logical-elements, init-align-outside-loop - -import os -import sys -import numpy as np - -EXPECTED_WORDS = 8 - - -def compare_packed_pred_mask(golden_path, output_path): - if not os.path.exists(golden_path) or not os.path.exists(output_path): - return False - golden = np.fromfile(golden_path, dtype=np.uint32) - output = np.fromfile(output_path, dtype=np.uint32) - if golden.size != EXPECTED_WORDS or output.size != EXPECTED_WORDS: - return False - if not np.array_equal(golden, output): - diff = np.nonzero(golden != output)[0] - idx = int(diff[0]) if diff.size else 0 - print(f"[ERROR] Mismatch (packed mask words): idx={idx} golden={int(golden[idx])} out={int(output[idx])}") - return False - return True - - -def main(): - strict = os.getenv("COMPARE_STRICT", "1") != "0" - ok = compare_packed_pred_mask("golden_v3.bin", "v3.bin") - if not ok: - if strict: - print("[ERROR] compare failed") - sys.exit(2) - print("[WARN] compare failed (non-gating)") - return - print("[INFO] compare passed") - - -if __name__ == "__main__": - main() diff --git 
a/test/vpto/cases/micro-op/predicate-load-store/pstu-init-align-outside-loop/golden.py b/test/vpto/cases/micro-op/predicate-load-store/pstu-init-align-outside-loop/golden.py deleted file mode 100644 index bf0e5a2ab..000000000 --- a/test/vpto/cases/micro-op/predicate-load-store/pstu-init-align-outside-loop/golden.py +++ /dev/null @@ -1,83 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) 2026 Huawei Technologies Co., Ltd. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. 
- -# case: micro-op/predicate-load-store/pstu-init-align-outside-loop -# family: predicate-load-store -# target_ops: pto.pstu -# scenarios: unaligned-predicate-store, state-update, representative-logical-elements, init-align-outside-loop -# coding=utf-8 - -import argparse -from pathlib import Path - -import numpy as np - - -ROWS = 32 -COLS = 32 -SEED = 19 -PACKED_BYTES_PER_STORE = 8 -OUTPUT_WORDS = 8 - - -def _pack_mask_b32(active_lanes: int) -> np.ndarray: - if active_lanes < 0 or active_lanes > 64: - raise ValueError(f"active_lanes must be in [0, 64], got {active_lanes}") - logical = np.zeros((64,), dtype=np.uint8) - logical[:active_lanes] = 1 - packed = np.packbits(logical, bitorder="little") - out = np.zeros((PACKED_BYTES_PER_STORE,), dtype=np.uint8) - out[: packed.size] = packed - return out - - -def generate(output_dir: Path, seed: int) -> None: - rng = np.random.default_rng(seed) - v1 = rng.uniform(-3.0, 3.0, size=(ROWS, COLS)).astype(np.float32) - v2 = rng.uniform(-1.0, 1.0, size=(ROWS, COLS)).astype(np.float32) - - first = _pack_mask_b32(13) - second = _pack_mask_b32(7) - packed = np.zeros((OUTPUT_WORDS * np.dtype(np.uint32).itemsize,), dtype=np.uint8) - packed[:PACKED_BYTES_PER_STORE] = first - packed[PACKED_BYTES_PER_STORE : 2 * PACKED_BYTES_PER_STORE] = second - packed[2 * PACKED_BYTES_PER_STORE : 3 * PACKED_BYTES_PER_STORE] = first - packed[3 * PACKED_BYTES_PER_STORE : 4 * PACKED_BYTES_PER_STORE] = second - output_init = np.zeros((OUTPUT_WORDS,), dtype=np.uint32) - - output_dir.mkdir(parents=True, exist_ok=True) - v1.reshape(-1).tofile(output_dir / "v1.bin") - v2.reshape(-1).tofile(output_dir / "v2.bin") - output_init.tofile(output_dir / "v3.bin") - packed.view(np.uint32).tofile(output_dir / "golden_v3.bin") - - -def main() -> None: - parser = argparse.ArgumentParser( - description="Generate numpy-based inputs/golden for VPTO micro-op pstu-init-align-outside-loop validation." 
- ) - parser.add_argument( - "--output-dir", - type=Path, - default=Path("."), - help="Directory where v1.bin/v2.bin/v3.bin/golden_v3.bin are written.", - ) - parser.add_argument( - "--seed", - type=int, - default=SEED, - help="Numpy random seed.", - ) - args = parser.parse_args() - - generate(args.output_dir, args.seed) - - -if __name__ == "__main__": - main() diff --git a/test/vpto/cases/micro-op/predicate-load-store/pstu-init-align-outside-loop/kernel.pto b/test/vpto/cases/micro-op/predicate-load-store/pstu-init-align-outside-loop/kernel.pto deleted file mode 100644 index 474a9011f..000000000 --- a/test/vpto/cases/micro-op/predicate-load-store/pstu-init-align-outside-loop/kernel.pto +++ /dev/null @@ -1,46 +0,0 @@ -// ----------------------------------------------------------------------------- -// case: micro-op/predicate-load-store/pstu-init-align-outside-loop -// family: predicate-load-store -// target_ops: pto.pstu -// scenarios: unaligned-predicate-store, state-update, representative-logical-elements, init-align-outside-loop -// ----------------------------------------------------------------------------- -module attributes {pto.target_arch = "a5", pto.kernel_kind = #pto.kernel_kind} { - func.func @pstu_init_align_outside_loop_kernel_2d(%arg0: !pto.ptr, - %arg1: !pto.ptr, - %arg2: !pto.ptr) attributes {pto.kernel} { - %c0 = arith.constant 0 : index - %c1 = arith.constant 1 : index - %c2 = arith.constant 2 : index - %c0_i64 = arith.constant 0 : i64 - %c1_i64 = arith.constant 1 : i64 - %c8_i64 = arith.constant 8 : i64 - %c32_i64 = arith.constant 32 : i64 - %c0_i32 = arith.constant 0 : i32 - %c13 = arith.constant 13 : i32 - %c7 = arith.constant 7 : i32 - - %ub_mask = pto.castptr %c0_i64 : i64 -> !pto.ptr - - pto.vecscope { - %align_init = pto.init_align : !pto.align - %align_final, %base_final = scf.for %iter = %c0 to %c2 step %c1 - iter_args(%align_iter = %align_init, %base_iter = %ub_mask) - -> (!pto.align, !pto.ptr) { - %value, %next = pto.plt_b32 %c13 : i32 
-> !pto.mask, i32 - %align_out, %base_out = pto.pstu %align_iter, %value, %base_iter : !pto.align, !pto.mask, !pto.ptr -> !pto.align, !pto.ptr - %value_tail, %next_tail = pto.plt_b32 %c7 : i32 -> !pto.mask, i32 - %align_tail, %base_tail = pto.pstu %align_out, %value_tail, %base_out : !pto.align, !pto.mask, !pto.ptr -> !pto.align, !pto.ptr - scf.yield %align_tail, %base_tail : !pto.align, !pto.ptr - } - pto.vstas %align_final, %base_final, %c0_i32 : !pto.align, !pto.ptr, i32 - } - - pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.mte_ub_gm %ub_mask, %arg2, %c32_i64 - nburst(%c1_i64, %c32_i64, %c32_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64 - pto.barrier #pto.pipe - return - } -} diff --git a/test/vpto/cases/micro-op/predicate-load-store/pstu-init-align-outside-loop/launch.cpp b/test/vpto/cases/micro-op/predicate-load-store/pstu-init-align-outside-loop/launch.cpp deleted file mode 100644 index 1d97093b5..000000000 --- a/test/vpto/cases/micro-op/predicate-load-store/pstu-init-align-outside-loop/launch.cpp +++ /dev/null @@ -1,55 +0,0 @@ -// Copyright (c) 2026 Huawei Technologies Co., Ltd. -// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). -// Please refer to the License for details. You may not use this file except in compliance with the License. -// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -// See LICENSE in the root of the software repository for the full text of the License. 
- -// ----------------------------------------------------------------------------- -// case: micro-op/predicate-load-store/pstu-init-align-outside-loop -// family: predicate-load-store -// target_ops: pto.pstu -// scenarios: unaligned-predicate-store, state-update, representative-logical-elements, init-align-outside-loop -// NOTE: bulk-generated coverage skeleton. Parser/verifier/lowering failure is -// still a valid test conclusion in the current coverage-first phase. -// ----------------------------------------------------------------------------- -#ifndef __VEC_SCOPE__ -#define __VEC_SCOPE__ -#endif - -#if defined(__CCE_AICORE__) && defined(__NPU_ARCH__) && (__NPU_ARCH__ == 2201) -typedef struct { unsigned char v; } hifloat8_t; -typedef struct { unsigned char v; } float8_e4m3_t; -typedef struct { unsigned char v; } float8_e5m2_t; -typedef struct { unsigned char v; } float8_e8m0_t; -typedef struct { unsigned char v; } float4_e1m2x2_t; -typedef struct { unsigned char v; } float4_e2m1x2_t; -#endif -#include - -#if defined(__CCE_AICORE__) && defined(PTOAS_ENABLE_CCE_PRINT) -#include -#endif - -#if !defined(__CCE_AICORE__) && !defined(TMRGSORT_HPP) -struct MrgSortExecutedNumList { - uint16_t mrgSortList0; - uint16_t mrgSortList1; - uint16_t mrgSortList2; - uint16_t mrgSortList3; -}; -#endif -#ifndef __CPU_SIM -#include "acl/acl.h" -#endif - -extern "C" __global__ [aicore] void -pstu_init_align_outside_loop_kernel_2d(__gm__ float *v1, __gm__ float *v2, - __gm__ uint32_t *v3); - -void LaunchPstu_init_align_outside_loop_kernel_2d(float *v1, float *v2, - uint32_t *v3, void *stream) { - pstu_init_align_outside_loop_kernel_2d<<<1, nullptr, stream>>>( - (__gm__ float *)v1, (__gm__ float *)v2, (__gm__ uint32_t *)v3); -} diff --git a/test/vpto/cases/micro-op/predicate-load-store/pstu-init-align-outside-loop/main.cpp b/test/vpto/cases/micro-op/predicate-load-store/pstu-init-align-outside-loop/main.cpp deleted file mode 100644 index ad4157e2f..000000000 --- 
a/test/vpto/cases/micro-op/predicate-load-store/pstu-init-align-outside-loop/main.cpp +++ /dev/null @@ -1,112 +0,0 @@ -// Copyright (c) 2026 Huawei Technologies Co., Ltd. -// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). -// Please refer to the License for details. You may not use this file except in compliance with the License. -// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -// See LICENSE in the root of the software repository for the full text of the License. - -// ----------------------------------------------------------------------------- -// case: micro-op/predicate-load-store/pstu-init-align-outside-loop -// family: predicate-load-store -// target_ops: pto.pstu -// scenarios: unaligned-predicate-store, state-update, representative-logical-elements, init-align-outside-loop -// NOTE: bulk-generated coverage skeleton. Parser/verifier/lowering failure is -// still a valid test conclusion in the current coverage-first phase. -// ----------------------------------------------------------------------------- -/** -Copyright (c) 2025 Huawei Technologies Co., Ltd. 
-*/ - -#include "test_common.h" -#include "acl/acl.h" -#include -#include - -using namespace PtoTestCommon; - -#define ACL_CHECK(expr) \ - do { \ - const aclError _ret = (expr); \ - if (_ret != ACL_SUCCESS) { \ - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", #expr, \ - (int)_ret, __FILE__, __LINE__); \ - const char *_recent = aclGetRecentErrMsg(); \ - if (_recent != nullptr && _recent[0] != '\0') \ - std::fprintf(stderr, "[ERROR] RecentErrMsg: %s\n", _recent); \ - rc = 1; \ - goto cleanup; \ - } \ - } while (0) - -void LaunchPstu_init_align_outside_loop_kernel_2d(float *v1, float *v2, - uint32_t *v3, void *stream); - -int main() { - size_t elemCount_v1 = 1024; - size_t fileSize_v1 = elemCount_v1 * sizeof(float); - size_t elemCount_v2 = 1024; - size_t fileSize_v2 = elemCount_v2 * sizeof(float); - size_t elemCount_v3 = 8; - size_t fileSize_v3 = elemCount_v3 * sizeof(uint32_t); - float *v1Host = nullptr; - float *v1Device = nullptr; - float *v2Host = nullptr; - float *v2Device = nullptr; - uint32_t *v3Host = nullptr; - uint32_t *v3Device = nullptr; - - int rc = 0; - bool aclInited = false; - bool deviceSet = false; - int deviceId = 0; - aclrtStream stream = nullptr; - - ACL_CHECK(aclInit(nullptr)); - aclInited = true; - if (const char *envDevice = std::getenv("ACL_DEVICE_ID")) - deviceId = std::atoi(envDevice); - ACL_CHECK(aclrtSetDevice(deviceId)); - deviceSet = true; - ACL_CHECK(aclrtCreateStream(&stream)); - - ACL_CHECK(aclrtMallocHost((void **)(&v1Host), fileSize_v1)); - ACL_CHECK(aclrtMallocHost((void **)(&v2Host), fileSize_v2)); - ACL_CHECK(aclrtMallocHost((void **)(&v3Host), fileSize_v3)); - ACL_CHECK(aclrtMalloc((void **)&v1Device, fileSize_v1, ACL_MEM_MALLOC_HUGE_FIRST)); - ACL_CHECK(aclrtMalloc((void **)&v2Device, fileSize_v2, ACL_MEM_MALLOC_HUGE_FIRST)); - ACL_CHECK(aclrtMalloc((void **)&v3Device, fileSize_v3, ACL_MEM_MALLOC_HUGE_FIRST)); - - ReadFile("./v1.bin", fileSize_v1, v1Host, fileSize_v1); - ReadFile("./v2.bin", fileSize_v2, v2Host, 
fileSize_v2); - ReadFile("./v3.bin", fileSize_v3, v3Host, fileSize_v3); - ACL_CHECK(aclrtMemcpy(v1Device, fileSize_v1, v1Host, fileSize_v1, - ACL_MEMCPY_HOST_TO_DEVICE)); - ACL_CHECK(aclrtMemcpy(v2Device, fileSize_v2, v2Host, fileSize_v2, - ACL_MEMCPY_HOST_TO_DEVICE)); - ACL_CHECK(aclrtMemcpy(v3Device, fileSize_v3, v3Host, fileSize_v3, - ACL_MEMCPY_HOST_TO_DEVICE)); - - LaunchPstu_init_align_outside_loop_kernel_2d(v1Device, v2Device, v3Device, - stream); - - ACL_CHECK(aclrtSynchronizeStream(stream)); - ACL_CHECK(aclrtMemcpy(v3Host, fileSize_v3, v3Device, fileSize_v3, - ACL_MEMCPY_DEVICE_TO_HOST)); - WriteFile("./v3.bin", v3Host, fileSize_v3); - -cleanup: - aclrtFree(v1Device); - aclrtFree(v2Device); - aclrtFree(v3Device); - aclrtFreeHost(v1Host); - aclrtFreeHost(v2Host); - aclrtFreeHost(v3Host); - if (stream != nullptr) - aclrtDestroyStream(stream); - if (deviceSet) - aclrtResetDevice(deviceId); - if (aclInited) - aclFinalize(); - return rc; -} diff --git a/test/vpto/cases/micro-op/predicate-load-store/pstu-state-advance-boundary/compare.py b/test/vpto/cases/micro-op/predicate-load-store/pstu-state-advance-boundary/compare.py deleted file mode 100755 index bf213031a..000000000 --- a/test/vpto/cases/micro-op/predicate-load-store/pstu-state-advance-boundary/compare.py +++ /dev/null @@ -1,50 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) 2026 Huawei Technologies Co., Ltd. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. 
- -# case: micro-op/predicate-load-store/pstu-state-advance-boundary -# family: predicate-load-store -# target_ops: pto.pstu -# scenarios: unaligned-predicate-store, state-update, boundary, b16-mask, typed-ptr-b16 - -import os -import sys -import numpy as np - -EXPECTED_WORDS = 16 - - -def compare_packed_pred_mask(golden_path, output_path): - if not os.path.exists(golden_path) or not os.path.exists(output_path): - return False - golden = np.fromfile(golden_path, dtype=np.uint16) - output = np.fromfile(output_path, dtype=np.uint16) - if golden.size != EXPECTED_WORDS or output.size != EXPECTED_WORDS: - return False - if not np.array_equal(golden, output): - diff = np.nonzero(golden != output)[0] - idx = int(diff[0]) if diff.size else 0 - print(f"[ERROR] Mismatch (packed mask words): idx={idx} golden={int(golden[idx])} out={int(output[idx])}") - return False - return True - - -def main(): - strict = os.getenv("COMPARE_STRICT", "1") != "0" - ok = compare_packed_pred_mask("golden_v3.bin", "v3.bin") - if not ok: - if strict: - print("[ERROR] compare failed") - sys.exit(2) - print("[WARN] compare failed (non-gating)") - return - print("[INFO] compare passed") - - -if __name__ == "__main__": - main() diff --git a/test/vpto/cases/micro-op/predicate-load-store/pstu-state-advance-boundary/golden.py b/test/vpto/cases/micro-op/predicate-load-store/pstu-state-advance-boundary/golden.py deleted file mode 100755 index f9db13980..000000000 --- a/test/vpto/cases/micro-op/predicate-load-store/pstu-state-advance-boundary/golden.py +++ /dev/null @@ -1,80 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) 2026 Huawei Technologies Co., Ltd. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. 
-# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. - -# case: micro-op/predicate-load-store/pstu-state-advance-boundary -# family: predicate-load-store -# target_ops: pto.pstu -# scenarios: unaligned-predicate-store, state-update, boundary, b16-mask, typed-ptr-b16 -# coding=utf-8 - -import argparse -from pathlib import Path - -import numpy as np - - -ROWS = 32 -COLS = 32 -SEED = 19 -PACKED_BYTES_PER_STORE = 16 -OUTPUT_WORDS = 16 - - -def _pack_mask_b16(active_lanes: int) -> np.ndarray: - if active_lanes < 0 or active_lanes > 128: - raise ValueError(f"active_lanes must be in [0, 128], got {active_lanes}") - logical = np.zeros((128,), dtype=np.uint8) - logical[:active_lanes] = 1 - packed = np.packbits(logical, bitorder="little") - out = np.zeros((PACKED_BYTES_PER_STORE,), dtype=np.uint8) - out[: packed.size] = packed - return out - - -def generate(output_dir: Path, seed: int) -> None: - rng = np.random.default_rng(seed) - - v1 = rng.uniform(-3.0, 3.0, size=(ROWS, COLS)).astype(np.float32) - v2 = rng.uniform(-1.0, 1.0, size=(ROWS, COLS)).astype(np.float32) - - first = _pack_mask_b16(1) - second = _pack_mask_b16(127) - packed = np.concatenate([first, second]).astype(np.uint8, copy=False) - output_init = np.zeros((OUTPUT_WORDS,), dtype=np.uint16) - - output_dir.mkdir(parents=True, exist_ok=True) - v1.reshape(-1).tofile(output_dir / "v1.bin") - v2.reshape(-1).tofile(output_dir / "v2.bin") - output_init.tofile(output_dir / "v3.bin") - packed.view(np.uint16).tofile(output_dir / "golden_v3.bin") - - -def main() -> None: - parser = argparse.ArgumentParser( - description="Generate numpy-based inputs/golden for VPTO micro-op pstu-state-advance-boundary validation." 
- ) - parser.add_argument( - "--output-dir", - type=Path, - default=Path("."), - help="Directory where v1.bin/v2.bin/v3.bin/golden_v3.bin are written.", - ) - parser.add_argument( - "--seed", - type=int, - default=SEED, - help="Numpy random seed.", - ) - args = parser.parse_args() - - generate(args.output_dir, args.seed) - - -if __name__ == "__main__": - main() diff --git a/test/vpto/cases/micro-op/predicate-load-store/pstu-state-advance-boundary/kernel.pto b/test/vpto/cases/micro-op/predicate-load-store/pstu-state-advance-boundary/kernel.pto deleted file mode 100644 index 69ab6c300..000000000 --- a/test/vpto/cases/micro-op/predicate-load-store/pstu-state-advance-boundary/kernel.pto +++ /dev/null @@ -1,41 +0,0 @@ -// ----------------------------------------------------------------------------- -// case: micro-op/predicate-load-store/pstu-state-advance-boundary -// family: predicate-load-store -// target_ops: pto.pstu -// scenarios: unaligned-predicate-store, state-update, boundary, b16-mask, typed-ptr-b16 -// ----------------------------------------------------------------------------- -module attributes {pto.target_arch = "a5", pto.kernel_kind = #pto.kernel_kind} { - func.func @pstu_state_advance_kernel_2d(%arg0: !pto.ptr, - %arg1: !pto.ptr, - %arg2: !pto.ptr) attributes {pto.kernel} { - %c0 = arith.constant 0 : index - %c1 = arith.constant 1 : index - %c0_i64 = arith.constant 0 : i64 - %c1_i64 = arith.constant 1 : i64 - %c32_i64 = arith.constant 32 : i64 - %c0_i32 = arith.constant 0 : i32 - %c1_i32 = arith.constant 1 : i32 - %c127 = arith.constant 127 : i32 - - %ub_mask = pto.castptr %c0_i64 : i64 -> !pto.ptr - - pto.vecscope { - scf.for %iter = %c0 to %c1 step %c1 { - %align0 = pto.init_align : !pto.align - %value0, %next0 = pto.plt_b16 %c1_i32 : i32 -> !pto.mask, i32 - %align1, %base1 = pto.pstu %align0, %value0, %ub_mask : !pto.align, !pto.mask, !pto.ptr -> !pto.align, !pto.ptr - %value1, %next1 = pto.plt_b16 %c127 : i32 -> !pto.mask, i32 - %align2, %base2 = 
pto.pstu %align1, %value1, %base1 : !pto.align, !pto.mask, !pto.ptr -> !pto.align, !pto.ptr - pto.vstas %align2, %base2, %c0_i32 : !pto.align, !pto.ptr, i32 - } - } - - pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.mte_ub_gm %ub_mask, %arg2, %c32_i64 - nburst(%c1_i64, %c32_i64, %c32_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64 - pto.barrier #pto.pipe - return - } -} diff --git a/test/vpto/cases/micro-op/predicate-load-store/pstu-state-advance-boundary/launch.cpp b/test/vpto/cases/micro-op/predicate-load-store/pstu-state-advance-boundary/launch.cpp deleted file mode 100644 index 2c01b6ceb..000000000 --- a/test/vpto/cases/micro-op/predicate-load-store/pstu-state-advance-boundary/launch.cpp +++ /dev/null @@ -1,58 +0,0 @@ -// Copyright (c) 2026 Huawei Technologies Co., Ltd. -// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). -// Please refer to the License for details. You may not use this file except in compliance with the License. -// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -// See LICENSE in the root of the software repository for the full text of the License. - -// ----------------------------------------------------------------------------- -// case: micro-op/predicate-load-store/pstu-state-advance-boundary -// family: predicate-load-store -// target_ops: pto.pstu -// scenarios: unaligned-predicate-store, state-update, boundary, b16-mask, typed-ptr-b16 -// NOTE: bulk-generated coverage skeleton. Parser/verifier/lowering failure is -// still a valid test conclusion in the current coverage-first phase. 
-// ----------------------------------------------------------------------------- -// --------------------------------------------------------------------------- -// PTOAS compatibility layer -// --------------------------------------------------------------------------- -#ifndef __VEC_SCOPE__ -#define __VEC_SCOPE__ -#endif - -#if defined(__CCE_AICORE__) && defined(__NPU_ARCH__) && (__NPU_ARCH__ == 2201) -typedef struct { unsigned char v; } hifloat8_t; -typedef struct { unsigned char v; } float8_e4m3_t; -typedef struct { unsigned char v; } float8_e5m2_t; -typedef struct { unsigned char v; } float8_e8m0_t; -typedef struct { unsigned char v; } float4_e1m2x2_t; -typedef struct { unsigned char v; } float4_e2m1x2_t; -#endif -#include - -#if defined(__CCE_AICORE__) && defined(PTOAS_ENABLE_CCE_PRINT) -#include -#endif - -#if !defined(__CCE_AICORE__) && !defined(TMRGSORT_HPP) -struct MrgSortExecutedNumList { - uint16_t mrgSortList0; - uint16_t mrgSortList1; - uint16_t mrgSortList2; - uint16_t mrgSortList3; -}; -#endif -#ifndef __CPU_SIM -#include "acl/acl.h" -#endif - -extern "C" __global__ [aicore] void pstu_state_advance_kernel_2d(__gm__ float *v1, - __gm__ float *v2, - __gm__ uint16_t *v3); - -void LaunchPstu_state_advance_kernel_2d(float *v1, float *v2, uint16_t *v3, - void *stream) { - pstu_state_advance_kernel_2d<<<1, nullptr, stream>>>( - (__gm__ float *)v1, (__gm__ float *)v2, (__gm__ uint16_t *)v3); -} diff --git a/test/vpto/cases/micro-op/predicate-load-store/pstu-state-advance-boundary/main.cpp b/test/vpto/cases/micro-op/predicate-load-store/pstu-state-advance-boundary/main.cpp deleted file mode 100644 index 1f97fcc70..000000000 --- a/test/vpto/cases/micro-op/predicate-load-store/pstu-state-advance-boundary/main.cpp +++ /dev/null @@ -1,111 +0,0 @@ -// Copyright (c) 2026 Huawei Technologies Co., Ltd. 
-// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). -// Please refer to the License for details. You may not use this file except in compliance with the License. -// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -// See LICENSE in the root of the software repository for the full text of the License. - -// ----------------------------------------------------------------------------- -// case: micro-op/predicate-load-store/pstu-state-advance-boundary -// family: predicate-load-store -// target_ops: pto.pstu -// scenarios: unaligned-predicate-store, state-update, boundary, b16-mask, typed-ptr-b16 -// NOTE: bulk-generated coverage skeleton. Parser/verifier/lowering failure is -// still a valid test conclusion in the current coverage-first phase. -// ----------------------------------------------------------------------------- -/** -Copyright (c) 2025 Huawei Technologies Co., Ltd. 
-*/ - -#include "test_common.h" -#include "acl/acl.h" -#include -#include - -using namespace PtoTestCommon; - -#define ACL_CHECK(expr) \ - do { \ - const aclError _ret = (expr); \ - if (_ret != ACL_SUCCESS) { \ - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", #expr, \ - (int)_ret, __FILE__, __LINE__); \ - const char *_recent = aclGetRecentErrMsg(); \ - if (_recent != nullptr && _recent[0] != '\0') \ - std::fprintf(stderr, "[ERROR] RecentErrMsg: %s\n", _recent); \ - rc = 1; \ - goto cleanup; \ - } \ - } while (0) - -void LaunchPstu_state_advance_kernel_2d(float *v1, float *v2, uint16_t *v3, - void *stream); - -int main() { - size_t elemCount_v1 = 1024; - size_t fileSize_v1 = elemCount_v1 * sizeof(float); - size_t elemCount_v2 = 1024; - size_t fileSize_v2 = elemCount_v2 * sizeof(float); - size_t elemCount_v3 = 16; - size_t fileSize_v3 = elemCount_v3 * sizeof(uint16_t); - float *v1Host = nullptr; - float *v1Device = nullptr; - float *v2Host = nullptr; - float *v2Device = nullptr; - uint16_t *v3Host = nullptr; - uint16_t *v3Device = nullptr; - - int rc = 0; - bool aclInited = false; - bool deviceSet = false; - int deviceId = 0; - aclrtStream stream = nullptr; - - ACL_CHECK(aclInit(nullptr)); - aclInited = true; - if (const char *envDevice = std::getenv("ACL_DEVICE_ID")) - deviceId = std::atoi(envDevice); - ACL_CHECK(aclrtSetDevice(deviceId)); - deviceSet = true; - ACL_CHECK(aclrtCreateStream(&stream)); - - ACL_CHECK(aclrtMallocHost((void **)(&v1Host), fileSize_v1)); - ACL_CHECK(aclrtMallocHost((void **)(&v2Host), fileSize_v2)); - ACL_CHECK(aclrtMallocHost((void **)(&v3Host), fileSize_v3)); - ACL_CHECK(aclrtMalloc((void **)&v1Device, fileSize_v1, ACL_MEM_MALLOC_HUGE_FIRST)); - ACL_CHECK(aclrtMalloc((void **)&v2Device, fileSize_v2, ACL_MEM_MALLOC_HUGE_FIRST)); - ACL_CHECK(aclrtMalloc((void **)&v3Device, fileSize_v3, ACL_MEM_MALLOC_HUGE_FIRST)); - - ReadFile("./v1.bin", fileSize_v1, v1Host, fileSize_v1); - ReadFile("./v2.bin", fileSize_v2, v2Host, fileSize_v2); - 
ReadFile("./v3.bin", fileSize_v3, v3Host, fileSize_v3); - ACL_CHECK(aclrtMemcpy(v1Device, fileSize_v1, v1Host, fileSize_v1, - ACL_MEMCPY_HOST_TO_DEVICE)); - ACL_CHECK(aclrtMemcpy(v2Device, fileSize_v2, v2Host, fileSize_v2, - ACL_MEMCPY_HOST_TO_DEVICE)); - ACL_CHECK(aclrtMemcpy(v3Device, fileSize_v3, v3Host, fileSize_v3, - ACL_MEMCPY_HOST_TO_DEVICE)); - - LaunchPstu_state_advance_kernel_2d(v1Device, v2Device, v3Device, stream); - - ACL_CHECK(aclrtSynchronizeStream(stream)); - ACL_CHECK(aclrtMemcpy(v3Host, fileSize_v3, v3Device, fileSize_v3, - ACL_MEMCPY_DEVICE_TO_HOST)); - WriteFile("./v3.bin", v3Host, fileSize_v3); - -cleanup: - aclrtFree(v1Device); - aclrtFree(v2Device); - aclrtFree(v3Device); - aclrtFreeHost(v1Host); - aclrtFreeHost(v2Host); - aclrtFreeHost(v3Host); - if (stream != nullptr) - aclrtDestroyStream(stream); - if (deviceSet) - aclrtResetDevice(deviceId); - if (aclInited) - aclFinalize(); - return rc; -} diff --git a/test/vpto/cases/micro-op/predicate-load-store/pstu/kernel.pto b/test/vpto/cases/micro-op/predicate-load-store/pstu/kernel.pto index f879fec94..65d53c4a0 100644 --- a/test/vpto/cases/micro-op/predicate-load-store/pstu/kernel.pto +++ b/test/vpto/cases/micro-op/predicate-load-store/pstu/kernel.pto @@ -1,42 +1,109 @@ -// ----------------------------------------------------------------------------- -// case: micro-op/predicate-load-store/pstu -// family: predicate-load-store -// target_ops: pto.pstu -// scenarios: unaligned-predicate-store, state-update, representative-logical-elements -// ----------------------------------------------------------------------------- module attributes {pto.target_arch = "a5", pto.kernel_kind = #pto.kernel_kind} { - func.func @pstu_kernel_2d(%arg0: !pto.ptr, - %arg1: !pto.ptr, - %arg2: !pto.ptr) attributes {pto.kernel} { - %c0 = arith.constant 0 : index - %c1 = arith.constant 1 : index - %c0_i64 = arith.constant 0 : i64 - %c1_i64 = arith.constant 1 : i64 - %c8_i64 = arith.constant 8 : i64 - %c32_i64 = 
arith.constant 32 : i64 - %c0_i32 = arith.constant 0 : i32 - %c13 = arith.constant 13 : i32 - %c7 = arith.constant 7 : i32 - - %ub_mask = pto.castptr %c0_i64 : i64 -> !pto.ptr + func.func @pstu_deep_merged_kernel(%arg0: !pto.ptr, %arg1: !pto.ptr, %arg2: !pto.ptr, %arg3: !pto.ptr, %arg4: !pto.ptr, %arg5: !pto.ptr, %arg6: !pto.ptr, %arg7: !pto.ptr, %arg8: !pto.ptr) attributes {pto.kernel} { + %__deep_merge_guard = arith.constant false + // inactive merged from pstu_init_align_outside_loop_kernel_2d + scf.if %__deep_merge_guard { + + %c0_m0 = arith.constant 0 : index + %c1_m0 = arith.constant 1 : index + %c2_m0 = arith.constant 2 : index + %c0_i64_m0 = arith.constant 0 : i64 + %c1_i64_m0 = arith.constant 1 : i64 + %c8_i64_m0 = arith.constant 8 : i64 + %c32_i64_m0 = arith.constant 32 : i64 + %c0_i32_m0 = arith.constant 0 : i32 + %c13_m0 = arith.constant 13 : i32 + %c7_m0 = arith.constant 7 : i32 + + %ub_mask_m0 = pto.castptr %c0_i64_m0 : i64 -> !pto.ptr + + pto.vecscope { + %align_init_m0 = pto.init_align : !pto.align + %align_final_m0, %base_final_m0 = scf.for %iter_m0 = %c0_m0 to %c2_m0 step %c1_m0 + iter_args(%align_iter_m0 = %align_init_m0, %base_iter_m0 = %ub_mask_m0) + -> (!pto.align, !pto.ptr) { + %value_m0, %next_m0 = pto.plt_b32 %c13_m0 : i32 -> !pto.mask, i32 + %align_out_m0, %base_out_m0 = pto.pstu %align_iter_m0, %value_m0, %base_iter_m0 : !pto.align, !pto.mask, !pto.ptr -> !pto.align, !pto.ptr + %value_tail_m0, %next_tail_m0 = pto.plt_b32 %c7_m0 : i32 -> !pto.mask, i32 + %align_tail_m0, %base_tail_m0 = pto.pstu %align_out_m0, %value_tail_m0, %base_out_m0 : !pto.align, !pto.mask, !pto.ptr -> !pto.align, !pto.ptr + scf.yield %align_tail_m0, %base_tail_m0 : !pto.align, !pto.ptr + } + pto.vstas %align_final_m0, %base_final_m0, %c0_i32_m0 : !pto.align, !pto.ptr, i32 + } + + pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] + pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] + pto.mte_ub_gm %ub_mask_m0, %arg2, %c32_i64_m0 + nburst(%c1_i64_m0, %c32_i64_m0, 
%c32_i64_m0) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64 + pto.barrier #pto.pipe + + } + // inactive merged from pstu_state_advance_kernel_2d + scf.if %__deep_merge_guard { + + %c0_m1 = arith.constant 0 : index + %c1_m1 = arith.constant 1 : index + %c0_i64_m1 = arith.constant 0 : i64 + %c1_i64_m1 = arith.constant 1 : i64 + %c32_i64_m1 = arith.constant 32 : i64 + %c0_i32_m1 = arith.constant 0 : i32 + %c1_i32_m1 = arith.constant 1 : i32 + %c127_m1 = arith.constant 127 : i32 + + %ub_mask_m1 = pto.castptr %c0_i64_m1 : i64 -> !pto.ptr + + pto.vecscope { + scf.for %iter_m1 = %c0_m1 to %c1_m1 step %c1_m1 { + %align0_m1 = pto.init_align : !pto.align + %value0_m1, %next0_m1 = pto.plt_b16 %c1_i32_m1 : i32 -> !pto.mask, i32 + %align1_m1, %base1_m1 = pto.pstu %align0_m1, %value0_m1, %ub_mask_m1 : !pto.align, !pto.mask, !pto.ptr -> !pto.align, !pto.ptr + %value1_m1, %next1_m1 = pto.plt_b16 %c127_m1 : i32 -> !pto.mask, i32 + %align2_m1, %base2_m1 = pto.pstu %align1_m1, %value1_m1, %base1_m1 : !pto.align, !pto.mask, !pto.ptr -> !pto.align, !pto.ptr + pto.vstas %align2_m1, %base2_m1, %c0_i32_m1 : !pto.align, !pto.ptr, i32 + } + } + + pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] + pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] + pto.mte_ub_gm %ub_mask_m1, %arg5, %c32_i64_m1 + nburst(%c1_i64_m1, %c32_i64_m1, %c32_i64_m1) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64 + pto.barrier #pto.pipe + + } + // active merged from pstu_kernel_2d + + %c0_m2 = arith.constant 0 : index + %c1_m2 = arith.constant 1 : index + %c0_i64_m2 = arith.constant 0 : i64 + %c1_i64_m2 = arith.constant 1 : i64 + %c8_i64_m2 = arith.constant 8 : i64 + %c32_i64_m2 = arith.constant 32 : i64 + %c0_i32_m2 = arith.constant 0 : i32 + %c13_m2 = arith.constant 13 : i32 + %c7_m2 = arith.constant 7 : i32 + + %ub_mask_m2 = pto.castptr %c0_i64_m2 : i64 -> !pto.ptr pto.vecscope { - scf.for %iter = %c0 to %c1 step %c1 { - %align = pto.init_align : !pto.align - %value, %next = pto.plt_b32 %c13 : i32 -> !pto.mask, i32 - 
%align_out, %base_out = pto.pstu %align, %value, %ub_mask : !pto.align, !pto.mask, !pto.ptr -> !pto.align, !pto.ptr - %value_tail, %next_tail = pto.plt_b32 %c7 : i32 -> !pto.mask, i32 - %align_tail, %base_tail = pto.pstu %align_out, %value_tail, %base_out : !pto.align, !pto.mask, !pto.ptr -> !pto.align, !pto.ptr - pto.vstas %align_tail, %base_tail, %c0_i32 : !pto.align, !pto.ptr, i32 + scf.for %iter_m2 = %c0_m2 to %c1_m2 step %c1_m2 { + %align_m2 = pto.init_align : !pto.align + %value_m2, %next_m2 = pto.plt_b32 %c13_m2 : i32 -> !pto.mask, i32 + %align_out_m2, %base_out_m2 = pto.pstu %align_m2, %value_m2, %ub_mask_m2 : !pto.align, !pto.mask, !pto.ptr -> !pto.align, !pto.ptr + %value_tail_m2, %next_tail_m2 = pto.plt_b32 %c7_m2 : i32 -> !pto.mask, i32 + %align_tail_m2, %base_tail_m2 = pto.pstu %align_out_m2, %value_tail_m2, %base_out_m2 : !pto.align, !pto.mask, !pto.ptr -> !pto.align, !pto.ptr + pto.vstas %align_tail_m2, %base_tail_m2, %c0_i32_m2 : !pto.align, !pto.ptr, i32 } } pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.mte_ub_gm %ub_mask, %arg2, %c32_i64 - nburst(%c1_i64, %c32_i64, %c32_i64) + pto.mte_ub_gm %ub_mask_m2, %arg8, %c32_i64_m2 + nburst(%c1_i64_m2, %c32_i64_m2, %c32_i64_m2) : !pto.ptr, !pto.ptr, i64, i64, i64, i64 pto.barrier #pto.pipe + return } } diff --git a/test/vpto/cases/micro-op/predicate-load-store/pstu/launch.cpp b/test/vpto/cases/micro-op/predicate-load-store/pstu/launch.cpp index 977155180..edfb72328 100644 --- a/test/vpto/cases/micro-op/predicate-load-store/pstu/launch.cpp +++ b/test/vpto/cases/micro-op/predicate-load-store/pstu/launch.cpp @@ -5,22 +5,9 @@ // THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, // INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. // See LICENSE in the root of the software repository for the full text of the License. 
- -// ----------------------------------------------------------------------------- -// case: micro-op/predicate-load-store/pstu -// family: predicate-load-store -// target_ops: pto.pstu -// scenarios: unaligned-predicate-store, state-update, representative-logical-elements -// NOTE: bulk-generated coverage skeleton. Parser/verifier/lowering failure is -// still a valid test conclusion in the current coverage-first phase. -// ----------------------------------------------------------------------------- -// --------------------------------------------------------------------------- -// PTOAS compatibility layer -// --------------------------------------------------------------------------- #ifndef __VEC_SCOPE__ #define __VEC_SCOPE__ #endif - #if defined(__CCE_AICORE__) && defined(__NPU_ARCH__) && (__NPU_ARCH__ == 2201) typedef struct { unsigned char v; } hifloat8_t; typedef struct { unsigned char v; } float8_e4m3_t; @@ -30,28 +17,36 @@ typedef struct { unsigned char v; } float4_e1m2x2_t; typedef struct { unsigned char v; } float4_e2m1x2_t; #endif #include - #if defined(__CCE_AICORE__) && defined(PTOAS_ENABLE_CCE_PRINT) #include #endif - #if !defined(__CCE_AICORE__) && !defined(TMRGSORT_HPP) -struct MrgSortExecutedNumList { - uint16_t mrgSortList0; - uint16_t mrgSortList1; - uint16_t mrgSortList2; - uint16_t mrgSortList3; -}; +struct MrgSortExecutedNumList { uint16_t mrgSortList0,mrgSortList1,mrgSortList2,mrgSortList3; }; #endif #ifndef __CPU_SIM #include "acl/acl.h" #endif -extern "C" __global__ [aicore] void pstu_kernel_2d(__gm__ float *v1, - __gm__ float *v2, - __gm__ uint32_t *v3); +extern "C" __global__ [aicore] void pstu_deep_merged_kernel( + __gm__ float * arg0, + __gm__ float * arg1, + __gm__ uint32_t * arg2, + __gm__ float * arg3, + __gm__ float * arg4, + __gm__ uint16_t * arg5, + __gm__ float * arg6, + __gm__ float * arg7, + __gm__ uint32_t * arg8); -void LaunchPstu_kernel_2d(float *v1, float *v2, uint32_t *v3, void *stream) { - pstu_kernel_2d<<<1, nullptr, 
stream>>>((__gm__ float *)v1, (__gm__ float *)v2, - (__gm__ uint32_t *)v3); +void LaunchPstuDeepMerged(float * p0, float * p1, uint32_t * p2, void *stream) { + pstu_deep_merged_kernel<<<1, nullptr, stream>>>( + (__gm__ float *)p0, + (__gm__ float *)p0, + (__gm__ uint32_t *)p0, + (__gm__ float *)p0, + (__gm__ float *)p0, + (__gm__ uint16_t *)p0, + (__gm__ float *)p0, + (__gm__ float *)p1, + (__gm__ uint32_t *)p2); } diff --git a/test/vpto/cases/micro-op/predicate-load-store/pstu/main.cpp b/test/vpto/cases/micro-op/predicate-load-store/pstu/main.cpp index 5c31f323d..37d34159d 100644 --- a/test/vpto/cases/micro-op/predicate-load-store/pstu/main.cpp +++ b/test/vpto/cases/micro-op/predicate-load-store/pstu/main.cpp @@ -39,8 +39,8 @@ using namespace PtoTestCommon; } \ } while (0) -void LaunchPstu_kernel_2d(float *v1, float *v2, uint32_t *v3, void *stream); +void LaunchPstuDeepMerged(float * p0, float * p1, uint32_t * p2, void *stream); int main() { size_t elemCount_v1 = 1024; size_t fileSize_v1 = elemCount_v1 * sizeof(float); @@ -86,7 +86,7 @@ int main() { ACL_CHECK(aclrtMemcpy(v3Device, fileSize_v3, v3Host, fileSize_v3, ACL_MEMCPY_HOST_TO_DEVICE)); - LaunchPstu_kernel_2d(v1Device, v2Device, v3Device, stream); + LaunchPstuDeepMerged(v1Device, v2Device, v3Device, stream); ACL_CHECK(aclrtSynchronizeStream(stream)); ACL_CHECK(aclrtMemcpy(v3Host, fileSize_v3, v3Device, fileSize_v3, diff --git a/test/vpto/cases/micro-op/rearrangement/vintlv-vdintlv-lane-boundary/compare.py b/test/vpto/cases/micro-op/rearrangement/vintlv-vdintlv-lane-boundary/compare.py deleted file mode 100755 index f2a3e0459..000000000 --- a/test/vpto/cases/micro-op/rearrangement/vintlv-vdintlv-lane-boundary/compare.py +++ /dev/null @@ -1,209 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) 2026 Huawei Technologies Co., Ltd. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). 
-# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. - -# case: micro-op/rearrangement/vintlv-vdintlv-lane-boundary -# family: rearrangement -# target_ops: pto.vdintlv, pto.vintlv -# scenarios: paired-roundtrip, lane-order -# NOTE: bulk-generated coverage skeleton. -# coding=utf-8 - -import os -import sys -import numpy as np - - -REPEAT_BYTES = 256 - - -def _ceil_div(x, y): - return (x + y - 1) // y - - -def _packed_pred_storage_bytes(logical_elems, src_elem_bytes): - logical_elems = int(logical_elems) - src_elem_bytes = int(src_elem_bytes) - if logical_elems <= 0: - raise ValueError(f"logical_elems must be > 0, got {logical_elems}") - if src_elem_bytes not in (1, 2, 4): - raise ValueError(f"unsupported packed predicate source size: {src_elem_bytes}") - - repeat_elems = REPEAT_BYTES // src_elem_bytes - if src_elem_bytes == 4: - repeat_times = _ceil_div(logical_elems, repeat_elems) + 1 - loop_count = repeat_times // 2 - return loop_count * 16 - - repeat_times = _ceil_div(logical_elems, repeat_elems) - return repeat_times * (repeat_elems // 8) - - -def compare_bin(golden_path, output_path, dtype, eps): - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - dtype_np = np.dtype(dtype) - golden = np.fromfile(golden_path, dtype=dtype_np) - output = np.fromfile(output_path, dtype=dtype_np) - if golden.shape != output.shape: - print(f"[ERROR] Shape mismatch: {golden_path} {golden.shape} vs {output_path} {output.shape}") - return False - if not np.allclose(golden, output, 
atol=eps, rtol=eps, equal_nan=True): - if golden.size: - if np.issubdtype(dtype_np, np.floating): - g = golden.astype(np.float64, copy=False) - o = output.astype(np.float64, copy=False) - elif np.issubdtype(dtype_np, np.integer) or np.issubdtype(dtype_np, np.unsignedinteger): - g = golden.astype(np.int64, copy=False) - o = output.astype(np.int64, copy=False) - else: - g = golden.astype(np.float64, copy=False) - o = output.astype(np.float64, copy=False) - abs_diff = np.abs(g - o) - idx = int(np.argmax(abs_diff)) - diff = float(abs_diff[idx]) - print( - f"[ERROR] Mismatch: {golden_path} vs {output_path}, max diff={diff} at idx={idx} " - f"(golden={g[idx]}, out={o[idx]}, dtype={dtype_np})" - ) - else: - print(f"[ERROR] Mismatch: {golden_path} vs {output_path}, empty buffers, dtype={dtype_np}") - return False - return True - - -def compare_bin_prefix(golden_path, output_path, dtype, eps, count): - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - try: - count = int(count) - except Exception: - print(f"[ERROR] Invalid prefix count: {count}") - return False - if count <= 0: - print(f"[ERROR] Invalid prefix count: {count}") - return False - - dtype_np = np.dtype(dtype) - golden = np.fromfile(golden_path, dtype=dtype_np, count=count) - output = np.fromfile(output_path, dtype=dtype_np, count=count) - - if golden.size != count or output.size != count: - print( - f"[ERROR] Prefix read too small: need={count} elems, " - f"golden={golden.size}, out={output.size}" - ) - return False - - if not np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True): - if golden.size: - if np.issubdtype(dtype_np, np.floating): - g = golden.astype(np.float64, copy=False) - o = output.astype(np.float64, copy=False) - elif np.issubdtype(dtype_np, np.integer) or np.issubdtype(dtype_np, np.unsignedinteger): - g = golden.astype(np.int64, 
copy=False) - o = output.astype(np.int64, copy=False) - else: - g = golden.astype(np.float64, copy=False) - o = output.astype(np.float64, copy=False) - abs_diff = np.abs(g - o) - idx = int(np.argmax(abs_diff)) - diff = float(abs_diff[idx]) - print( - f"[ERROR] Mismatch (prefix): {golden_path} vs {output_path}, max diff={diff} at idx={idx} " - f"(golden={g[idx]}, out={o[idx]}, dtype={dtype_np}, count={count})" - ) - else: - print(f"[ERROR] Mismatch (prefix): {golden_path} vs {output_path}, empty buffers, dtype={dtype_np}") - return False - return True - - -def compare_packed_pred_mask(golden_path, output_path, logical_elems, src_elem_bytes): - """ - Compare outputs of pto.tcmp / pto.tcmps. - - PTO-ISA stores packed predicate results as a linear PK byte stream via - `psts`, with the exact written prefix length determined by the typed - TCMP/TCMPS repeat schedule. Compare only that semantic prefix. - """ - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - try: - logical_elems = int(logical_elems) - src_elem_bytes = int(src_elem_bytes) - except Exception: - print( - "[ERROR] Invalid packed mask compare arguments: " - f"logical_elems={logical_elems} src_elem_bytes={src_elem_bytes}" - ) - return False - if logical_elems <= 0 or src_elem_bytes <= 0: - print( - "[ERROR] Invalid packed mask compare arguments: " - f"logical_elems={logical_elems} src_elem_bytes={src_elem_bytes}" - ) - return False - - golden = np.fromfile(golden_path, dtype=np.uint8) - output = np.fromfile(output_path, dtype=np.uint8) - try: - prefix_bytes = _packed_pred_storage_bytes(logical_elems, src_elem_bytes) - except ValueError as exc: - print(f"[ERROR] {exc}") - return False - - if golden.size < prefix_bytes or output.size < prefix_bytes: - print( - f"[ERROR] Packed mask buffer too small: need={prefix_bytes} bytes, " - f"golden={golden.size}, 
out={output.size}" - ) - return False - - golden_sel = golden[:prefix_bytes] - output_sel = output[:prefix_bytes] - - if not np.array_equal(golden_sel, output_sel): - diff = np.nonzero(golden_sel != output_sel)[0] - idx = int(diff[0]) if diff.size else 0 - print( - f"[ERROR] Mismatch (packed mask): {golden_path} vs {output_path}, first diff at idx={idx} " - f"(golden={int(golden_sel[idx])}, out={int(output_sel[idx])}, " - f"logical_elems={logical_elems}, src_elem_bytes={src_elem_bytes}, prefix_bytes={prefix_bytes})" - ) - return False - return True - - -def main(): - strict = os.getenv("COMPARE_STRICT", "1") != "0" - ok = True - ok = compare_bin("golden_v2.bin", "v2.bin", np.float32, 0.0001) and ok - if not ok: - if strict: - print("[ERROR] compare failed") - sys.exit(2) - print("[WARN] compare failed (non-gating)") - return - print("[INFO] compare passed") - - -if __name__ == "__main__": - main() diff --git a/test/vpto/cases/micro-op/rearrangement/vintlv-vdintlv-lane-boundary/golden.py b/test/vpto/cases/micro-op/rearrangement/vintlv-vdintlv-lane-boundary/golden.py deleted file mode 100755 index bc2746fab..000000000 --- a/test/vpto/cases/micro-op/rearrangement/vintlv-vdintlv-lane-boundary/golden.py +++ /dev/null @@ -1,54 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) 2026 Huawei Technologies Co., Ltd. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. 
- -# case: micro-op/rearrangement/vintlv-vdintlv-lane-boundary -# family: rearrangement -# target_ops: pto.vdintlv, pto.vintlv -# scenarios: paired-roundtrip, lane-order -# NOTE: paired vintlv+vdintlv roundtrip should recover the original input, including lane-boundary patterns. -# coding=utf-8 - -import argparse -from pathlib import Path - -import numpy as np - - -ROWS = 32 -COLS = 32 -SEED = 19 - - -def generate(output_dir: Path, seed: int) -> None: - rng = np.random.default_rng(seed) - flat = rng.uniform(-8.0, 8.0, size=ROWS * COLS).astype(np.float32) - for base in range(0, flat.size, 128): - flat[base + 62 : base + 66] = np.array([-62.0, -1.0, 1.0, 62.0], dtype=np.float32) - v1 = flat.reshape(ROWS, COLS) - v2 = np.zeros((ROWS, COLS), dtype=np.float32) - golden_v2 = v1.astype(np.float32, copy=True) - - output_dir.mkdir(parents=True, exist_ok=True) - v1.reshape(-1).tofile(output_dir / "v1.bin") - v2.reshape(-1).tofile(output_dir / "v2.bin") - golden_v2.reshape(-1).tofile(output_dir / "golden_v2.bin") - - -def main() -> None: - parser = argparse.ArgumentParser( - description="Generate numpy-based inputs/golden for VPTO micro-op vintlv+vdintlv lane-boundary validation." 
- ) - parser.add_argument("--output-dir", type=Path, default=Path(".")) - parser.add_argument("--seed", type=int, default=SEED) - args = parser.parse_args() - generate(args.output_dir, args.seed) - - -if __name__ == "__main__": - main() diff --git a/test/vpto/cases/micro-op/rearrangement/vintlv-vdintlv-lane-boundary/kernel.pto b/test/vpto/cases/micro-op/rearrangement/vintlv-vdintlv-lane-boundary/kernel.pto deleted file mode 100644 index 370672fc3..000000000 --- a/test/vpto/cases/micro-op/rearrangement/vintlv-vdintlv-lane-boundary/kernel.pto +++ /dev/null @@ -1,54 +0,0 @@ -// ----------------------------------------------------------------------------- -// case: micro-op/rearrangement/vintlv-vdintlv-lane-boundary -// family: rearrangement -// target_ops: pto.vdintlv, pto.vintlv -// scenarios: paired-roundtrip, lane-order, boundary -// ----------------------------------------------------------------------------- -module attributes {pto.target_arch = "a5", pto.kernel_kind = #pto.kernel_kind} { - func.func @vintlv_vdintlv_boundary_kernel_2d(%arg0: !pto.ptr, %arg1: !pto.ptr) attributes {pto.kernel} { - %c0 = arith.constant 0 : index - %c1 = arith.constant 1 : index - %c64 = arith.constant 64 : index - %c128 = arith.constant 128 : index - %c1024 = arith.constant 1024 : index - %c0_i64 = arith.constant 0 : i64 - %c1_i64 = arith.constant 1 : i64 - %c32_i64 = arith.constant 32 : i64 - %c128_i64 = arith.constant 128 : i64 - %c4096_i64 = arith.constant 4096 : i64 - %c1024_i32 = arith.constant 1024 : i32 - - %ub_in = pto.castptr %c0_i64 : i64 -> !pto.ptr - %ub_out = pto.castptr %c4096_i64 : i64 -> !pto.ptr - - %false = arith.constant false - pto.mte_gm_ub %arg0, %ub_in, %c0_i64, %c128_i64 - nburst(%c32_i64, %c128_i64, %c128_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 - - pto.set_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] - pto.wait_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] - - pto.vecscope { - %_:1 = scf.for %offset = %c0 to %c1024 step %c128 iter_args(%remaining = 
%c1024_i32) -> (i32) { - %mask, %next_remaining = pto.plt_b32 %remaining : i32 -> !pto.mask, i32 - %rhs_offset = arith.addi %offset, %c64 : index - %lhs = pto.vlds %ub_in[%offset] : !pto.ptr -> !pto.vreg<64xf32> - %rhs = pto.vlds %ub_in[%rhs_offset] : !pto.ptr -> !pto.vreg<64xf32> - %ilow, %ihigh = pto.vintlv %lhs, %rhs : !pto.vreg<64xf32>, !pto.vreg<64xf32> -> !pto.vreg<64xf32>, !pto.vreg<64xf32> - %dlow, %dhigh = pto.vdintlv %ilow, %ihigh : !pto.vreg<64xf32>, !pto.vreg<64xf32> -> !pto.vreg<64xf32>, !pto.vreg<64xf32> - pto.vsts %dlow, %ub_out[%offset], %mask : !pto.vreg<64xf32>, !pto.ptr, !pto.mask - pto.vsts %dhigh, %ub_out[%rhs_offset], %mask : !pto.vreg<64xf32>, !pto.ptr, !pto.mask - scf.yield %next_remaining : i32 - } - } - - pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.mte_ub_gm %ub_out, %arg1, %c128_i64 - nburst(%c32_i64, %c128_i64, %c128_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64 - pto.barrier #pto.pipe - return - } -} diff --git a/test/vpto/cases/micro-op/rearrangement/vintlv-vdintlv-lane-boundary/launch.cpp b/test/vpto/cases/micro-op/rearrangement/vintlv-vdintlv-lane-boundary/launch.cpp deleted file mode 100644 index f7bb8bf5a..000000000 --- a/test/vpto/cases/micro-op/rearrangement/vintlv-vdintlv-lane-boundary/launch.cpp +++ /dev/null @@ -1,71 +0,0 @@ -// Copyright (c) 2026 Huawei Technologies Co., Ltd. -// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). -// Please refer to the License for details. You may not use this file except in compliance with the License. -// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -// See LICENSE in the root of the software repository for the full text of the License. 
- -// ----------------------------------------------------------------------------- -// case: micro-op/rearrangement/vintlv-vdintlv-lane-boundary -// family: rearrangement -// target_ops: pto.vdintlv, pto.vintlv -// scenarios: paired-roundtrip, lane-order -// NOTE: bulk-generated coverage skeleton. Parser/verifier/lowering failure is -// still a valid test conclusion in the current coverage-first phase. -// ----------------------------------------------------------------------------- -// --------------------------------------------------------------------------- -// PTOAS compatibility layer -// -// The upstream pto-isa headers reference some FP8/FP4 types and the -// __VEC_SCOPE__ marker that are not available on every AICore arch/toolchain -// combination (e.g. __NPU_ARCH__==2201). -// -// For our PTOAS-generated kernels we don't rely on these types today, but the -// headers still mention them in templates/static_asserts. Provide minimal -// fallbacks to keep compilation working on dav-c220. -// --------------------------------------------------------------------------- -#ifndef __VEC_SCOPE__ -#define __VEC_SCOPE__ -#endif - -#if defined(__CCE_AICORE__) && defined(__NPU_ARCH__) && (__NPU_ARCH__ == 2201) -typedef struct { unsigned char v; } hifloat8_t; -typedef struct { unsigned char v; } float8_e4m3_t; -typedef struct { unsigned char v; } float8_e5m2_t; -typedef struct { unsigned char v; } float8_e8m0_t; -typedef struct { unsigned char v; } float4_e1m2x2_t; -typedef struct { unsigned char v; } float4_e2m1x2_t; -#endif -#include - -// AICore printf support is gated behind `--cce-enable-print` on some -// toolchains. When enabled, include the CCE print header so `cce::printf` -// resolves in device compilation. -#if defined(__CCE_AICORE__) && defined(PTOAS_ENABLE_CCE_PRINT) -#include -#endif - -// Some PTO-ISA types are only available in the __CCE_AICORE__ compilation -// path, but `bisheng -xcce` still performs a host-side parse pass. 
-// Provide minimal fallbacks only when the corresponding header wasn't -// pulled in by the selected arch implementation. -#if !defined(__CCE_AICORE__) && !defined(TMRGSORT_HPP) -struct MrgSortExecutedNumList { - uint16_t mrgSortList0; - uint16_t mrgSortList1; - uint16_t mrgSortList2; - uint16_t mrgSortList3; -}; -#endif -#ifndef __CPU_SIM -#include "acl/acl.h" -#endif - -extern "C" __global__ [aicore] void vintlv_vdintlv_boundary_kernel_2d(__gm__ float *v1, - __gm__ float *v2); - -void LaunchVintlv_vdintlv_boundary_kernel_2d(float *v1, float *v2, void *stream) { - vintlv_vdintlv_boundary_kernel_2d<<<1, nullptr, stream>>>((__gm__ float *)v1, - (__gm__ float *)v2); -} diff --git a/test/vpto/cases/micro-op/rearrangement/vintlv-vdintlv-lane-boundary/main.cpp b/test/vpto/cases/micro-op/rearrangement/vintlv-vdintlv-lane-boundary/main.cpp deleted file mode 100644 index f6fb0606b..000000000 --- a/test/vpto/cases/micro-op/rearrangement/vintlv-vdintlv-lane-boundary/main.cpp +++ /dev/null @@ -1,130 +0,0 @@ -// Copyright (c) 2026 Huawei Technologies Co., Ltd. -// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). -// Please refer to the License for details. You may not use this file except in compliance with the License. -// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -// See LICENSE in the root of the software repository for the full text of the License. - -// ----------------------------------------------------------------------------- -// case: micro-op/rearrangement/vintlv-vdintlv-lane-boundary -// family: rearrangement -// target_ops: pto.vdintlv, pto.vintlv -// scenarios: paired-roundtrip, lane-order -// NOTE: bulk-generated coverage skeleton. 
Parser/verifier/lowering failure is -// still a valid test conclusion in the current coverage-first phase. -// ----------------------------------------------------------------------------- -/** -Copyright (c) 2025 Huawei Technologies Co., Ltd. -This program is free software, you can redistribute it and/or modify it under the terms and conditions of -CANN Open Software License Agreement Version 2.0 (the "License"). -Please refer to the License for details. You may not use this file except in compliance with the License. -THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -See LICENSE in the root of the software repository for the full text of the License. -*/ - -#include "test_common.h" -#include "acl/acl.h" -#include -#include -#include - -using namespace PtoTestCommon; - -#ifndef TMRGSORT_HPP -struct MrgSortExecutedNumList { - uint16_t mrgSortList0; - uint16_t mrgSortList1; - uint16_t mrgSortList2; - uint16_t mrgSortList3; -}; -#endif - -#define ACL_CHECK(expr) \ - do { \ - const aclError _ret = (expr); \ - if (_ret != ACL_SUCCESS) { \ - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", #expr, (int)_ret, __FILE__, __LINE__); \ - const char *_recent = aclGetRecentErrMsg(); \ - if (_recent != nullptr && _recent[0] != '\0') { \ - std::fprintf(stderr, "[ERROR] RecentErrMsg: %s\n", _recent); \ - } \ - rc = 1; \ - goto cleanup; \ - } \ - } while (0) - -void LaunchVintlv_vdintlv_boundary_kernel_2d(float *v1, float *v2, void *stream); - -int main() { - size_t elemCount_v1 = 1024; - size_t fileSize_v1 = elemCount_v1 * sizeof(float); - size_t elemCount_v2 = 1024; - size_t fileSize_v2 = elemCount_v2 * sizeof(float); - float *v1Host = nullptr; - float *v1Device = nullptr; - float *v2Host = nullptr; - float *v2Device = nullptr; - - int rc = 0; - bool aclInited = false; - bool deviceSet = false; - int deviceId = 0; - 
aclrtStream stream = nullptr; - - ACL_CHECK(aclInit(nullptr)); - aclInited = true; - if (const char *envDevice = std::getenv("ACL_DEVICE_ID")) { - deviceId = std::atoi(envDevice); - } - ACL_CHECK(aclrtSetDevice(deviceId)); - deviceSet = true; - ACL_CHECK(aclrtCreateStream(&stream)); - - ACL_CHECK(aclrtMallocHost((void **)(&v1Host), fileSize_v1)); - ACL_CHECK(aclrtMallocHost((void **)(&v2Host), fileSize_v2)); - ACL_CHECK(aclrtMalloc((void **)&v1Device, fileSize_v1, ACL_MEM_MALLOC_HUGE_FIRST)); - ACL_CHECK(aclrtMalloc((void **)&v2Device, fileSize_v2, ACL_MEM_MALLOC_HUGE_FIRST)); - - ReadFile("./v1.bin", fileSize_v1, v1Host, fileSize_v1); - ReadFile("./v2.bin", fileSize_v2, v2Host, fileSize_v2); - ACL_CHECK(aclrtMemcpy(v1Device, fileSize_v1, v1Host, fileSize_v1, ACL_MEMCPY_HOST_TO_DEVICE)); - ACL_CHECK(aclrtMemcpy(v2Device, fileSize_v2, v2Host, fileSize_v2, ACL_MEMCPY_HOST_TO_DEVICE)); - LaunchVintlv_vdintlv_boundary_kernel_2d(v1Device, v2Device, stream); - - ACL_CHECK(aclrtSynchronizeStream(stream)); - ACL_CHECK(aclrtMemcpy(v2Host, fileSize_v2, v2Device, fileSize_v2, ACL_MEMCPY_DEVICE_TO_HOST)); - - WriteFile("./v2.bin", v2Host, fileSize_v2); - -cleanup: - aclrtFree(v1Device); - aclrtFree(v2Device); - aclrtFreeHost(v1Host); - aclrtFreeHost(v2Host); - if (stream != nullptr) { - const aclError _ret = aclrtDestroyStream(stream); - if (_ret != ACL_SUCCESS) { - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", - "aclrtDestroyStream(stream)", (int)_ret, __FILE__, __LINE__); - } - stream = nullptr; - } - if (deviceSet) { - const aclError _ret = aclrtResetDevice(deviceId); - if (_ret != ACL_SUCCESS) { - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", - "aclrtResetDevice(deviceId)", (int)_ret, __FILE__, __LINE__); - } - } - if (aclInited) { - const aclError _ret = aclFinalize(); - if (_ret != ACL_SUCCESS) { - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", - "aclFinalize()", (int)_ret, __FILE__, __LINE__); - } - } - - return rc; -} diff --git 
a/test/vpto/cases/micro-op/rearrangement/vintlv-vdintlv/kernel.pto b/test/vpto/cases/micro-op/rearrangement/vintlv-vdintlv/kernel.pto index 4dc0cc115..e7be144ea 100644 --- a/test/vpto/cases/micro-op/rearrangement/vintlv-vdintlv/kernel.pto +++ b/test/vpto/cases/micro-op/rearrangement/vintlv-vdintlv/kernel.pto @@ -1,54 +1,197 @@ -// ----------------------------------------------------------------------------- -// case: micro-op/rearrangement/vintlv-vdintlv -// family: rearrangement -// target_ops: pto.vdintlv, pto.vintlv -// scenarios: paired-roundtrip, lane-order -// ----------------------------------------------------------------------------- module attributes {pto.target_arch = "a5", pto.kernel_kind = #pto.kernel_kind} { - func.func @vintlv_vdintlv_kernel_2d(%arg0: !pto.ptr, %arg1: !pto.ptr) attributes {pto.kernel} { - %c0 = arith.constant 0 : index - %c1 = arith.constant 1 : index - %c64 = arith.constant 64 : index - %c128 = arith.constant 128 : index - %c1024 = arith.constant 1024 : index - %c0_i64 = arith.constant 0 : i64 - %c1_i64 = arith.constant 1 : i64 - %c32_i64 = arith.constant 32 : i64 - %c128_i64 = arith.constant 128 : i64 - %c4096_i64 = arith.constant 4096 : i64 - %c1024_i32 = arith.constant 1024 : i32 - - %ub_in = pto.castptr %c0_i64 : i64 -> !pto.ptr - %ub_out = pto.castptr %c4096_i64 : i64 -> !pto.ptr - - %false = arith.constant false - pto.mte_gm_ub %arg0, %ub_in, %c0_i64, %c128_i64 - nburst(%c32_i64, %c128_i64, %c128_i64) + func.func @vintlv_vdintlv_deep_merged_kernel(%arg0: !pto.ptr, %arg1: !pto.ptr, %arg2: !pto.ptr, %arg3: !pto.ptr) attributes {pto.kernel} { + %__deep_merge_guard = arith.constant false + // inactive merged from vintlv_vdintlv_boundary_kernel_2d + scf.if %__deep_merge_guard { + + %c0_m0 = arith.constant 0 : index + %c1_m0 = arith.constant 1 : index + %c64_m0 = arith.constant 64 : index + %c128_m0 = arith.constant 128 : index + %c1024_m0 = arith.constant 1024 : index + %c0_i64_m0 = arith.constant 0 : i64 + %c1_i64_m0 = 
arith.constant 1 : i64 + %c32_i64_m0 = arith.constant 32 : i64 + %c128_i64_m0 = arith.constant 128 : i64 + %c4096_i64_m0 = arith.constant 4096 : i64 + %c1024_i32_m0 = arith.constant 1024 : i32 + + %ub_in_m0 = pto.castptr %c0_i64_m0 : i64 -> !pto.ptr + %ub_out_m0 = pto.castptr %c4096_i64_m0 : i64 -> !pto.ptr + + %false_m0 = arith.constant false + pto.mte_gm_ub %arg0, %ub_in_m0, %c0_i64_m0, %c128_i64_m0 + nburst(%c32_i64_m0, %c128_i64_m0, %c128_i64_m0) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 + + pto.set_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] + pto.wait_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] + + pto.vecscope { + %__m0:1 = scf.for %offset_m0 = %c0_m0 to %c1024_m0 step %c128_m0 iter_args(%remaining_m0 = %c1024_i32_m0) -> (i32) { + %mask_m0, %next_remaining_m0 = pto.plt_b32 %remaining_m0 : i32 -> !pto.mask, i32 + %rhs_offset_m0 = arith.addi %offset_m0, %c64_m0 : index + %lhs_m0 = pto.vlds %ub_in_m0[%offset_m0] : !pto.ptr -> !pto.vreg<64xf32> + %rhs_m0 = pto.vlds %ub_in_m0[%rhs_offset_m0] : !pto.ptr -> !pto.vreg<64xf32> + %ilow_m0, %ihigh_m0 = pto.vintlv %lhs_m0, %rhs_m0 : !pto.vreg<64xf32>, !pto.vreg<64xf32> -> !pto.vreg<64xf32>, !pto.vreg<64xf32> + %dlow_m0, %dhigh_m0 = pto.vdintlv %ilow_m0, %ihigh_m0 : !pto.vreg<64xf32>, !pto.vreg<64xf32> -> !pto.vreg<64xf32>, !pto.vreg<64xf32> + pto.vsts %dlow_m0, %ub_out_m0[%offset_m0], %mask_m0 : !pto.vreg<64xf32>, !pto.ptr, !pto.mask + pto.vsts %dhigh_m0, %ub_out_m0[%rhs_offset_m0], %mask_m0 : !pto.vreg<64xf32>, !pto.ptr, !pto.mask + scf.yield %next_remaining_m0 : i32 + } + } + + pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] + pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] + pto.mte_ub_gm %ub_out_m0, %arg1, %c128_i64_m0 + nburst(%c32_i64_m0, %c128_i64_m0, %c128_i64_m0) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64 + pto.barrier #pto.pipe + + } + // active merged from vintlv_vdintlv_kernel_2d + + %c0_m1 = arith.constant 0 : index + %c1_m1 = arith.constant 1 : index + %c64_m1 = arith.constant 64 : index + %c128_m1 = 
arith.constant 128 : index + %c1024_m1 = arith.constant 1024 : index + %c0_i64_m1 = arith.constant 0 : i64 + %c1_i64_m1 = arith.constant 1 : i64 + %c32_i64_m1 = arith.constant 32 : i64 + %c128_i64_m1 = arith.constant 128 : i64 + %c4096_i64_m1 = arith.constant 4096 : i64 + %c1024_i32_m1 = arith.constant 1024 : i32 + + %ub_in_m1 = pto.castptr %c0_i64_m1 : i64 -> !pto.ptr + %ub_out_m1 = pto.castptr %c4096_i64_m1 : i64 -> !pto.ptr + + %false_m1 = arith.constant false + pto.mte_gm_ub %arg2, %ub_in_m1, %c0_i64_m1, %c128_i64_m1 + nburst(%c32_i64_m1, %c128_i64_m1, %c128_i64_m1) : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 pto.set_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] pto.wait_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] pto.vecscope { - %_:1 = scf.for %offset = %c0 to %c1024 step %c128 iter_args(%remaining = %c1024_i32) -> (i32) { - %mask, %next_remaining = pto.plt_b32 %remaining : i32 -> !pto.mask, i32 - %rhs_offset = arith.addi %offset, %c64 : index - %lhs = pto.vlds %ub_in[%offset] : !pto.ptr -> !pto.vreg<64xf32> - %rhs = pto.vlds %ub_in[%rhs_offset] : !pto.ptr -> !pto.vreg<64xf32> - %ilow, %ihigh = pto.vintlv %lhs, %rhs : !pto.vreg<64xf32>, !pto.vreg<64xf32> -> !pto.vreg<64xf32>, !pto.vreg<64xf32> - %dlow, %dhigh = pto.vdintlv %ilow, %ihigh : !pto.vreg<64xf32>, !pto.vreg<64xf32> -> !pto.vreg<64xf32>, !pto.vreg<64xf32> - pto.vsts %dlow, %ub_out[%offset], %mask : !pto.vreg<64xf32>, !pto.ptr, !pto.mask - pto.vsts %dhigh, %ub_out[%rhs_offset], %mask : !pto.vreg<64xf32>, !pto.ptr, !pto.mask - scf.yield %next_remaining : i32 + %__m1:1 = scf.for %offset_m1 = %c0_m1 to %c1024_m1 step %c128_m1 iter_args(%remaining_m1 = %c1024_i32_m1) -> (i32) { + %mask_m1, %next_remaining_m1 = pto.plt_b32 %remaining_m1 : i32 -> !pto.mask, i32 + %rhs_offset_m1 = arith.addi %offset_m1, %c64_m1 : index + %lhs_m1 = pto.vlds %ub_in_m1[%offset_m1] : !pto.ptr -> !pto.vreg<64xf32> + %rhs_m1 = pto.vlds %ub_in_m1[%rhs_offset_m1] : !pto.ptr -> !pto.vreg<64xf32> + %ilow_m1, %ihigh_m1 = pto.vintlv %lhs_m1, 
%rhs_m1 : !pto.vreg<64xf32>, !pto.vreg<64xf32> -> !pto.vreg<64xf32>, !pto.vreg<64xf32> + %dlow_m1, %dhigh_m1 = pto.vdintlv %ilow_m1, %ihigh_m1 : !pto.vreg<64xf32>, !pto.vreg<64xf32> -> !pto.vreg<64xf32>, !pto.vreg<64xf32> + pto.vsts %dlow_m1, %ub_out_m1[%offset_m1], %mask_m1 : !pto.vreg<64xf32>, !pto.ptr, !pto.mask + pto.vsts %dhigh_m1, %ub_out_m1[%rhs_offset_m1], %mask_m1 : !pto.vreg<64xf32>, !pto.ptr, !pto.mask + scf.yield %next_remaining_m1 : i32 } } pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.mte_ub_gm %ub_out, %arg1, %c128_i64 - nburst(%c32_i64, %c128_i64, %c128_i64) + pto.mte_ub_gm %ub_out_m1, %arg3, %c128_i64_m1 + nburst(%c32_i64_m1, %c128_i64_m1, %c128_i64_m1) : !pto.ptr, !pto.ptr, i64, i64, i64, i64 pto.barrier #pto.pipe + %__case_merge_guard = arith.constant false + + // inactive cross-case merge from micro-op/rearrangement/vsqz + scf.if %__case_merge_guard { + + %__deep_merge_guard_cmg11_1 = arith.constant false + // inactive merged from vsqz_nontrivial_mask_kernel_2d + scf.if %__deep_merge_guard_cmg11_1 { + + %c0_m0_cmg11_1 = arith.constant 0 : index + %c1_m0_cmg11_1 = arith.constant 1 : index + %c64_m0_cmg11_1 = arith.constant 64 : index + %c1024_m0_cmg11_1 = arith.constant 1024 : index + %c0_i64_m0_cmg11_1 = arith.constant 0 : i64 + %c1_i64_m0_cmg11_1 = arith.constant 1 : i64 + %c64_i32_m0_cmg11_1 = arith.constant 64 : i32 + %c32_i64_m0_cmg11_1 = arith.constant 32 : i64 + %c128_i64_m0_cmg11_1 = arith.constant 128 : i64 + %c4096_i64_m0_cmg11_1 = arith.constant 4096 : i64 + %c8192_i64_m0_cmg11_1 = arith.constant 8192 : i64 + %zero_f32_m0_cmg11_1 = arith.constant 0.0 : f32 + + %ub_in_m0_cmg11_1 = pto.castptr %c0_i64_m0_cmg11_1 : i64 -> !pto.ptr + %ub_mask_seed_m0_cmg11_1 = pto.castptr %c4096_i64_m0_cmg11_1 : i64 -> !pto.ptr + %ub_out_m0_cmg11_1 = pto.castptr %c8192_i64_m0_cmg11_1 : i64 -> !pto.ptr + + %false_m0_cmg11_1 = arith.constant false + pto.mte_gm_ub %arg0, %ub_in_m0_cmg11_1, 
%c0_i64_m0_cmg11_1, %c128_i64_m0_cmg11_1 + nburst(%c32_i64_m0_cmg11_1, %c128_i64_m0_cmg11_1, %c128_i64_m0_cmg11_1) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 + pto.mte_gm_ub %arg1, %ub_mask_seed_m0_cmg11_1, %c0_i64_m0_cmg11_1, %c128_i64_m0_cmg11_1 + nburst(%c32_i64_m0_cmg11_1, %c128_i64_m0_cmg11_1, %c128_i64_m0_cmg11_1) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 + + pto.set_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] + pto.wait_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] + + pto.vecscope { + scf.for %offset_m0_cmg11_1 = %c0_m0_cmg11_1 to %c1024_m0_cmg11_1 step %c64_m0_cmg11_1 { + %store_mask_m0_cmg11_1, %unused_m0_cmg11_1 = pto.plt_b32 %c64_i32_m0_cmg11_1 : i32 -> !pto.mask, i32 + %vec_m0_cmg11_1 = pto.vlds %ub_in_m0_cmg11_1[%offset_m0_cmg11_1] : !pto.ptr -> !pto.vreg<64xf32> + %mask_seed_m0_cmg11_1 = pto.vlds %ub_mask_seed_m0_cmg11_1[%offset_m0_cmg11_1] : !pto.ptr -> !pto.vreg<64xf32> + %place_m0_cmg11_1 = pto.vcmps %mask_seed_m0_cmg11_1, %zero_f32_m0_cmg11_1, %store_mask_m0_cmg11_1, "gt" : !pto.vreg<64xf32>, f32, !pto.mask -> !pto.mask + %out_m0_cmg11_1 = pto.vsqz %vec_m0_cmg11_1, %place_m0_cmg11_1 : !pto.vreg<64xf32>, !pto.mask -> !pto.vreg<64xf32> + pto.vsts %out_m0_cmg11_1, %ub_out_m0_cmg11_1[%offset_m0_cmg11_1], %store_mask_m0_cmg11_1 : !pto.vreg<64xf32>, !pto.ptr, !pto.mask + } + } + + pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] + pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] + pto.mte_ub_gm %ub_out_m0_cmg11_1, %arg1, %c128_i64_m0_cmg11_1 + nburst(%c32_i64_m0_cmg11_1, %c128_i64_m0_cmg11_1, %c128_i64_m0_cmg11_1) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64 + pto.barrier #pto.pipe + + } + // active merged from vsqz_kernel_2d + + %c0_m1_cmg11_1 = arith.constant 0 : index + %c1_m1_cmg11_1 = arith.constant 1 : index + %c64_m1_cmg11_1 = arith.constant 64 : index + %c1024_m1_cmg11_1 = arith.constant 1024 : index + %c0_i64_m1_cmg11_1 = arith.constant 0 : i64 + %c1_i64_m1_cmg11_1 = arith.constant 1 : i64 + %c32_i64_m1_cmg11_1 = arith.constant 32 : i64 + 
%c128_i64_m1_cmg11_1 = arith.constant 128 : i64 + %c4096_i64_m1_cmg11_1 = arith.constant 4096 : i64 + %c1024_i32_m1_cmg11_1 = arith.constant 1024 : i32 + + %ub_in_m1_cmg11_1 = pto.castptr %c0_i64_m1_cmg11_1 : i64 -> !pto.ptr + %ub_out_m1_cmg11_1 = pto.castptr %c4096_i64_m1_cmg11_1 : i64 -> !pto.ptr + + %false_m1_cmg11_1 = arith.constant false + pto.mte_gm_ub %arg2, %ub_in_m1_cmg11_1, %c0_i64_m1_cmg11_1, %c128_i64_m1_cmg11_1 + nburst(%c32_i64_m1_cmg11_1, %c128_i64_m1_cmg11_1, %c128_i64_m1_cmg11_1) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 + + pto.set_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] + pto.wait_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] + + pto.vecscope { + %__m1_cmg11_1:1 = scf.for %offset_m1_cmg11_1 = %c0_m1_cmg11_1 to %c1024_m1_cmg11_1 step %c64_m1_cmg11_1 iter_args(%remaining_m1_cmg11_1 = %c1024_i32_m1_cmg11_1) -> (i32) { + %mask_m1_cmg11_1, %next_remaining_m1_cmg11_1 = pto.plt_b32 %remaining_m1_cmg11_1 : i32 -> !pto.mask, i32 + %vec_m1_cmg11_1 = pto.vlds %ub_in_m1_cmg11_1[%offset_m1_cmg11_1] : !pto.ptr -> !pto.vreg<64xf32> + %out_m1_cmg11_1 = pto.vsqz %vec_m1_cmg11_1, %mask_m1_cmg11_1 : !pto.vreg<64xf32>, !pto.mask -> !pto.vreg<64xf32> + pto.vsts %out_m1_cmg11_1, %ub_out_m1_cmg11_1[%offset_m1_cmg11_1], %mask_m1_cmg11_1 : !pto.vreg<64xf32>, !pto.ptr, !pto.mask + scf.yield %next_remaining_m1_cmg11_1 : i32 + } + } + + pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] + pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] + pto.mte_ub_gm %ub_out_m1_cmg11_1, %arg3, %c128_i64_m1_cmg11_1 + nburst(%c32_i64_m1_cmg11_1, %c128_i64_m1_cmg11_1, %c128_i64_m1_cmg11_1) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64 + pto.barrier #pto.pipe + } + return } } diff --git a/test/vpto/cases/micro-op/rearrangement/vintlv-vdintlv/launch.cpp b/test/vpto/cases/micro-op/rearrangement/vintlv-vdintlv/launch.cpp index 27baf3164..91f0a1326 100644 --- a/test/vpto/cases/micro-op/rearrangement/vintlv-vdintlv/launch.cpp +++ b/test/vpto/cases/micro-op/rearrangement/vintlv-vdintlv/launch.cpp @@ 
-5,30 +5,9 @@ // THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, // INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. // See LICENSE in the root of the software repository for the full text of the License. - -// ----------------------------------------------------------------------------- -// case: micro-op/rearrangement/vintlv-vdintlv -// family: rearrangement -// target_ops: pto.vdintlv, pto.vintlv -// scenarios: paired-roundtrip, lane-order -// NOTE: bulk-generated coverage skeleton. Parser/verifier/lowering failure is -// still a valid test conclusion in the current coverage-first phase. -// ----------------------------------------------------------------------------- -// --------------------------------------------------------------------------- -// PTOAS compatibility layer -// -// The upstream pto-isa headers reference some FP8/FP4 types and the -// __VEC_SCOPE__ marker that are not available on every AICore arch/toolchain -// combination (e.g. __NPU_ARCH__==2201). -// -// For our PTOAS-generated kernels we don't rely on these types today, but the -// headers still mention them in templates/static_asserts. Provide minimal -// fallbacks to keep compilation working on dav-c220. -// --------------------------------------------------------------------------- #ifndef __VEC_SCOPE__ #define __VEC_SCOPE__ #endif - #if defined(__CCE_AICORE__) && defined(__NPU_ARCH__) && (__NPU_ARCH__ == 2201) typedef struct { unsigned char v; } hifloat8_t; typedef struct { unsigned char v; } float8_e4m3_t; @@ -38,34 +17,26 @@ typedef struct { unsigned char v; } float4_e1m2x2_t; typedef struct { unsigned char v; } float4_e2m1x2_t; #endif #include - -// AICore printf support is gated behind `--cce-enable-print` on some -// toolchains. When enabled, include the CCE print header so `cce::printf` -// resolves in device compilation. 
#if defined(__CCE_AICORE__) && defined(PTOAS_ENABLE_CCE_PRINT) #include #endif - -// Some PTO-ISA types are only available in the __CCE_AICORE__ compilation -// path, but `bisheng -xcce` still performs a host-side parse pass. -// Provide minimal fallbacks only when the corresponding header wasn't -// pulled in by the selected arch implementation. #if !defined(__CCE_AICORE__) && !defined(TMRGSORT_HPP) -struct MrgSortExecutedNumList { - uint16_t mrgSortList0; - uint16_t mrgSortList1; - uint16_t mrgSortList2; - uint16_t mrgSortList3; -}; +struct MrgSortExecutedNumList { uint16_t mrgSortList0,mrgSortList1,mrgSortList2,mrgSortList3; }; #endif #ifndef __CPU_SIM #include "acl/acl.h" #endif -extern "C" __global__ [aicore] void vintlv_vdintlv_kernel_2d(__gm__ float *v1, - __gm__ float *v2); +extern "C" __global__ [aicore] void vintlv_vdintlv_deep_merged_kernel( + __gm__ float * arg0, + __gm__ float * arg1, + __gm__ float * arg2, + __gm__ float * arg3); -void LaunchVintlv_vdintlv_kernel_2d(float *v1, float *v2, void *stream) { - vintlv_vdintlv_kernel_2d<<<1, nullptr, stream>>>((__gm__ float *)v1, - (__gm__ float *)v2); +void LaunchVintlvVdintlvDeepMerged(float * p0, float * p1, void *stream) { + vintlv_vdintlv_deep_merged_kernel<<<1, nullptr, stream>>>( + (__gm__ float *)p0, + (__gm__ float *)p0, + (__gm__ float *)p0, + (__gm__ float *)p1); } diff --git a/test/vpto/cases/micro-op/rearrangement/vintlv-vdintlv/main.cpp b/test/vpto/cases/micro-op/rearrangement/vintlv-vdintlv/main.cpp index 0a66ddb10..faa41efed 100644 --- a/test/vpto/cases/micro-op/rearrangement/vintlv-vdintlv/main.cpp +++ b/test/vpto/cases/micro-op/rearrangement/vintlv-vdintlv/main.cpp @@ -55,8 +55,8 @@ struct MrgSortExecutedNumList { } \ } while (0) -void LaunchVintlv_vdintlv_kernel_2d(float *v1, float *v2, void *stream); +void LaunchVintlvVdintlvDeepMerged(float * p0, float * p1, void *stream); int main() { size_t elemCount_v1 = 1024; size_t fileSize_v1 = elemCount_v1 * sizeof(float); @@ -91,7 +91,7 @@ int 
main() { ReadFile("./v2.bin", fileSize_v2, v2Host, fileSize_v2); ACL_CHECK(aclrtMemcpy(v1Device, fileSize_v1, v1Host, fileSize_v1, ACL_MEMCPY_HOST_TO_DEVICE)); ACL_CHECK(aclrtMemcpy(v2Device, fileSize_v2, v2Host, fileSize_v2, ACL_MEMCPY_HOST_TO_DEVICE)); - LaunchVintlv_vdintlv_kernel_2d(v1Device, v2Device, stream); + LaunchVintlvVdintlvDeepMerged(v1Device, v2Device, stream); ACL_CHECK(aclrtSynchronizeStream(stream)); ACL_CHECK(aclrtMemcpy(v2Host, fileSize_v2, v2Device, fileSize_v2, ACL_MEMCPY_DEVICE_TO_HOST)); diff --git a/test/vpto/cases/micro-op/rearrangement/vpack-higher/kernel.pto b/test/vpto/cases/micro-op/rearrangement/vpack-higher/kernel.pto index cb86876bf..206b55672 100644 --- a/test/vpto/cases/micro-op/rearrangement/vpack-higher/kernel.pto +++ b/test/vpto/cases/micro-op/rearrangement/vpack-higher/kernel.pto @@ -46,6 +46,51 @@ module attributes {pto.target_arch = "a5", pto.kernel_kind = #pto.kernel_kind, !pto.ptr, i64, i64, i64, i64 pto.barrier #pto.pipe + %__case_merge_guard = arith.constant false + + // inactive cross-case merge from micro-op/rearrangement/vpack-lower + scf.if %__case_merge_guard { + + %c0_cmg12_1 = arith.constant 0 : index + %c2_cmg12_1 = arith.constant 2 : index + %c64_cmg12_1 = arith.constant 64 : index + %c1024_cmg12_1 = arith.constant 1024 : index + %c0_i64_cmg12_1 = arith.constant 0 : i64 + %c1_i64_cmg12_1 = arith.constant 1 : i64 + %c1_i16_cmg12_1 = arith.constant 1 : i16 + %c32_i64_cmg12_1 = arith.constant 32 : i64 + %c128_i64_cmg12_1 = arith.constant 128 : i64 + %c4096_i64_cmg12_1 = arith.constant 4096 : i64 + + %ub_in_cmg12_1 = pto.castptr %c0_i64_cmg12_1 : i64 -> !pto.ptr + %ub_out_cmg12_1 = pto.castptr %c4096_i64_cmg12_1 : i64 -> !pto.ptr + + %false_cmg12_1 = arith.constant false + pto.mte_gm_ub %arg0, %ub_in_cmg12_1, %c0_i64_cmg12_1, %c128_i64_cmg12_1 + nburst(%c32_i64_cmg12_1, %c128_i64_cmg12_1, %c128_i64_cmg12_1) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 + + pto.set_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] + 
pto.wait_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] + + pto.vecscope { + %store_mask_cmg12_1 = pto.pset_b16 "PAT_ALL" : !pto.mask + scf.for %src_offset_cmg12_1 = %c0_cmg12_1 to %c1024_cmg12_1 step %c64_cmg12_1 { + %dst_offset_cmg12_1 = arith.muli %src_offset_cmg12_1, %c2_cmg12_1 : index + %vec_cmg12_1 = pto.vlds %ub_in_cmg12_1[%src_offset_cmg12_1] : !pto.ptr -> !pto.vreg<64xi32> + %packed_cmg12_1 = pto.vpack %vec_cmg12_1, "LOWER" : !pto.vreg<64xi32> -> !pto.vreg<128xui16> + %observed_cmg12_1 = pto.vadds %packed_cmg12_1, %c1_i16_cmg12_1, %store_mask_cmg12_1 : !pto.vreg<128xui16>, i16, !pto.mask -> !pto.vreg<128xui16> + pto.vsts %observed_cmg12_1, %ub_out_cmg12_1[%dst_offset_cmg12_1], %store_mask_cmg12_1 : !pto.vreg<128xui16>, !pto.ptr, !pto.mask + } + } + + pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] + pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] + pto.mte_ub_gm %ub_out_cmg12_1, %arg1, %c128_i64_cmg12_1 + nburst(%c32_i64_cmg12_1, %c128_i64_cmg12_1, %c128_i64_cmg12_1) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64 + pto.barrier #pto.pipe + } return } } diff --git a/test/vpto/cases/micro-op/rearrangement/vpack-lower/compare.py b/test/vpto/cases/micro-op/rearrangement/vpack-lower/compare.py deleted file mode 100644 index 0caf7195c..000000000 --- a/test/vpto/cases/micro-op/rearrangement/vpack-lower/compare.py +++ /dev/null @@ -1,42 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) 2026 Huawei Technologies Co., Ltd. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. 
-# See LICENSE in the root of the software repository for the full text of the License. - -# case: micro-op/rearrangement/vpack-lower -# family: rearrangement -# target_ops: pto.vpack -# scenarios: narrowing, lower-half-placement, zero-fill-upper-half -# coding=utf-8 -import os -import sys - -import numpy as np - - -def compare_bin(golden_path, output_path, dtype): - if not os.path.exists(golden_path) or not os.path.exists(output_path): - return False - golden = np.fromfile(golden_path, dtype=dtype) - output = np.fromfile(output_path, dtype=dtype) - return golden.shape == output.shape and np.array_equal(golden, output) - - -def main(): - strict = os.getenv("COMPARE_STRICT", "1") != "0" - ok = compare_bin("golden_v2.bin", "v2.bin", np.uint16) - if not ok: - if strict: - print("[ERROR] compare failed") - sys.exit(2) - print("[WARN] compare failed (non-gating)") - return - print("[INFO] compare passed") - - -if __name__ == "__main__": - main() diff --git a/test/vpto/cases/micro-op/rearrangement/vpack-lower/golden.py b/test/vpto/cases/micro-op/rearrangement/vpack-lower/golden.py deleted file mode 100644 index 37ca69a25..000000000 --- a/test/vpto/cases/micro-op/rearrangement/vpack-lower/golden.py +++ /dev/null @@ -1,63 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) 2026 Huawei Technologies Co., Ltd. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. 
- -# case: micro-op/rearrangement/vpack-lower -# family: rearrangement -# target_ops: pto.vpack -# scenarios: narrowing, lower-half-placement, zero-fill-upper-half, post-pack-consumer -# coding=utf-8 - -import argparse -from pathlib import Path - -import numpy as np - - -ROWS = 32 -COLS = 32 -ELEMS = ROWS * COLS -CHUNK = 64 -OUTPUT_ELEMS = ELEMS * 2 -SEED = 19 -BIAS = np.uint16(1) - - -def generate(output_dir: Path, seed: int) -> None: - rng = np.random.default_rng(seed) - v1 = rng.integers(-(1 << 20), 1 << 20, size=ELEMS, dtype=np.int32) - v2 = np.zeros(OUTPUT_ELEMS, dtype=np.uint16) - golden_v2 = np.zeros(OUTPUT_ELEMS, dtype=np.uint16) - - narrowed = v1.astype(np.uint16, copy=False) - for chunk_base in range(0, ELEMS, CHUNK): - chunk = narrowed[chunk_base : chunk_base + CHUNK] - out_base = (chunk_base // CHUNK) * (CHUNK * 2) - golden_v2[out_base : out_base + CHUNK] = ( - chunk.astype(np.uint32) + int(BIAS) - ).astype(np.uint16) - golden_v2[out_base + CHUNK : out_base + 2 * CHUNK] = BIAS - - output_dir.mkdir(parents=True, exist_ok=True) - v1.tofile(output_dir / "v1.bin") - v2.tofile(output_dir / "v2.bin") - golden_v2.tofile(output_dir / "golden_v2.bin") - - -def main() -> None: - parser = argparse.ArgumentParser( - description="Generate numpy-based inputs/golden for VPTO micro-op vpack-lower validation." 
- ) - parser.add_argument("--output-dir", type=Path, default=Path(".")) - parser.add_argument("--seed", type=int, default=SEED) - args = parser.parse_args() - generate(args.output_dir, args.seed) - - -if __name__ == "__main__": - main() diff --git a/test/vpto/cases/micro-op/rearrangement/vpack-lower/kernel.pto b/test/vpto/cases/micro-op/rearrangement/vpack-lower/kernel.pto deleted file mode 100644 index 10294f0f7..000000000 --- a/test/vpto/cases/micro-op/rearrangement/vpack-lower/kernel.pto +++ /dev/null @@ -1,51 +0,0 @@ -// ----------------------------------------------------------------------------- -// case: micro-op/rearrangement/vpack-lower -// family: rearrangement -// target_ops: pto.vpack -// scenarios: narrowing, lower-half-placement, zero-fill-upper-half, post-pack-consumer -// ----------------------------------------------------------------------------- - -module attributes {pto.target_arch = "a5", pto.kernel_kind = #pto.kernel_kind} { - func.func @vpack_lower_kernel_2d(%arg0: !pto.ptr, %arg1: !pto.ptr) attributes {pto.kernel} { - %c0 = arith.constant 0 : index - %c2 = arith.constant 2 : index - %c64 = arith.constant 64 : index - %c1024 = arith.constant 1024 : index - %c0_i64 = arith.constant 0 : i64 - %c1_i64 = arith.constant 1 : i64 - %c1_i16 = arith.constant 1 : i16 - %c32_i64 = arith.constant 32 : i64 - %c128_i64 = arith.constant 128 : i64 - %c4096_i64 = arith.constant 4096 : i64 - - %ub_in = pto.castptr %c0_i64 : i64 -> !pto.ptr - %ub_out = pto.castptr %c4096_i64 : i64 -> !pto.ptr - - %false = arith.constant false - pto.mte_gm_ub %arg0, %ub_in, %c0_i64, %c128_i64 - nburst(%c32_i64, %c128_i64, %c128_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 - - pto.set_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] - pto.wait_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] - - pto.vecscope { - %store_mask = pto.pset_b16 "PAT_ALL" : !pto.mask - scf.for %src_offset = %c0 to %c1024 step %c64 { - %dst_offset = arith.muli %src_offset, %c2 : index - %vec = pto.vlds 
%ub_in[%src_offset] : !pto.ptr -> !pto.vreg<64xi32> - %packed = pto.vpack %vec, "LOWER" : !pto.vreg<64xi32> -> !pto.vreg<128xui16> - %observed = pto.vadds %packed, %c1_i16, %store_mask : !pto.vreg<128xui16>, i16, !pto.mask -> !pto.vreg<128xui16> - pto.vsts %observed, %ub_out[%dst_offset], %store_mask : !pto.vreg<128xui16>, !pto.ptr, !pto.mask - } - } - - pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.mte_ub_gm %ub_out, %arg1, %c128_i64 - nburst(%c32_i64, %c128_i64, %c128_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64 - pto.barrier #pto.pipe - return - } -} diff --git a/test/vpto/cases/micro-op/rearrangement/vpack-lower/launch.cpp b/test/vpto/cases/micro-op/rearrangement/vpack-lower/launch.cpp deleted file mode 100644 index 3bf8b0da1..000000000 --- a/test/vpto/cases/micro-op/rearrangement/vpack-lower/launch.cpp +++ /dev/null @@ -1,51 +0,0 @@ -// Copyright (c) 2026 Huawei Technologies Co., Ltd. -// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). -// Please refer to the License for details. You may not use this file except in compliance with the License. -// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -// See LICENSE in the root of the software repository for the full text of the License. 
- -// ----------------------------------------------------------------------------- -// case: micro-op/rearrangement/vpack-lower -// family: rearrangement -// target_ops: pto.vpack -// scenarios: narrowing, lower-half-placement, zero-fill-upper-half -// ----------------------------------------------------------------------------- -#ifndef __VEC_SCOPE__ -#define __VEC_SCOPE__ -#endif - -#if defined(__CCE_AICORE__) && defined(__NPU_ARCH__) && (__NPU_ARCH__ == 2201) -typedef struct { unsigned char v; } hifloat8_t; -typedef struct { unsigned char v; } float8_e4m3_t; -typedef struct { unsigned char v; } float8_e5m2_t; -typedef struct { unsigned char v; } float8_e8m0_t; -typedef struct { unsigned char v; } float4_e1m2x2_t; -typedef struct { unsigned char v; } float4_e2m1x2_t; -#endif -#include - -#if defined(__CCE_AICORE__) && defined(PTOAS_ENABLE_CCE_PRINT) -#include -#endif - -#if !defined(__CCE_AICORE__) && !defined(TMRGSORT_HPP) -struct MrgSortExecutedNumList { - uint16_t mrgSortList0; - uint16_t mrgSortList1; - uint16_t mrgSortList2; - uint16_t mrgSortList3; -}; -#endif -#ifndef __CPU_SIM -#include "acl/acl.h" -#endif - -extern "C" __global__ [aicore] void vpack_lower_kernel_2d(__gm__ int *v1, - __gm__ uint16_t *v2); - -void LaunchVpack_lower_kernel_2d(int32_t *v1, uint16_t *v2, void *stream) { - vpack_lower_kernel_2d<<<1, nullptr, stream>>>((__gm__ int *)v1, - (__gm__ uint16_t *)v2); -} diff --git a/test/vpto/cases/micro-op/rearrangement/vpack-lower/main.cpp b/test/vpto/cases/micro-op/rearrangement/vpack-lower/main.cpp deleted file mode 100644 index 5cc58448c..000000000 --- a/test/vpto/cases/micro-op/rearrangement/vpack-lower/main.cpp +++ /dev/null @@ -1,128 +0,0 @@ -// Copyright (c) 2026 Huawei Technologies Co., Ltd. -// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). -// Please refer to the License for details. 
You may not use this file except in compliance with the License. -// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -// See LICENSE in the root of the software repository for the full text of the License. - -// ----------------------------------------------------------------------------- -// case: micro-op/rearrangement/vpack-lower -// family: rearrangement -// target_ops: pto.vpack -// scenarios: narrowing, lower-half-placement, zero-fill-upper-half -// ----------------------------------------------------------------------------- -/** -Copyright (c) 2025 Huawei Technologies Co., Ltd. -This program is free software, you can redistribute it and/or modify it under the terms and conditions of -CANN Open Software License Agreement Version 2.0 (the "License"). -Please refer to the License for details. You may not use this file except in compliance with the License. -THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -See LICENSE in the root of the software repository for the full text of the License. 
-*/ - -#include "test_common.h" -#include "acl/acl.h" -#include -#include -#include - -using namespace PtoTestCommon; - -#ifndef TMRGSORT_HPP -struct MrgSortExecutedNumList { - uint16_t mrgSortList0; - uint16_t mrgSortList1; - uint16_t mrgSortList2; - uint16_t mrgSortList3; -}; -#endif - -#define ACL_CHECK(expr) \ - do { \ - const aclError _ret = (expr); \ - if (_ret != ACL_SUCCESS) { \ - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", #expr, (int)_ret, __FILE__, __LINE__); \ - const char *_recent = aclGetRecentErrMsg(); \ - if (_recent != nullptr && _recent[0] != '\0') { \ - std::fprintf(stderr, "[ERROR] RecentErrMsg: %s\n", _recent); \ - } \ - rc = 1; \ - goto cleanup; \ - } \ - } while (0) - -void LaunchVpack_lower_kernel_2d(int32_t *v1, uint16_t *v2, void *stream); - -int main() { - size_t elemCount_v1 = 1024; - size_t fileSize_v1 = elemCount_v1 * sizeof(int32_t); - size_t elemCount_v2 = 2048; - size_t fileSize_v2 = elemCount_v2 * sizeof(uint16_t); - int32_t *v1Host = nullptr; - int32_t *v1Device = nullptr; - uint16_t *v2Host = nullptr; - uint16_t *v2Device = nullptr; - - int rc = 0; - bool aclInited = false; - bool deviceSet = false; - int deviceId = 0; - aclrtStream stream = nullptr; - - ACL_CHECK(aclInit(nullptr)); - aclInited = true; - if (const char *envDevice = std::getenv("ACL_DEVICE_ID")) { - deviceId = std::atoi(envDevice); - } - ACL_CHECK(aclrtSetDevice(deviceId)); - deviceSet = true; - ACL_CHECK(aclrtCreateStream(&stream)); - - ACL_CHECK(aclrtMallocHost((void **)(&v1Host), fileSize_v1)); - ACL_CHECK(aclrtMallocHost((void **)(&v2Host), fileSize_v2)); - ACL_CHECK(aclrtMalloc((void **)&v1Device, fileSize_v1, ACL_MEM_MALLOC_HUGE_FIRST)); - ACL_CHECK(aclrtMalloc((void **)&v2Device, fileSize_v2, ACL_MEM_MALLOC_HUGE_FIRST)); - - ReadFile("./v1.bin", fileSize_v1, v1Host, fileSize_v1); - ReadFile("./v2.bin", fileSize_v2, v2Host, fileSize_v2); - ACL_CHECK(aclrtMemcpy(v1Device, fileSize_v1, v1Host, fileSize_v1, ACL_MEMCPY_HOST_TO_DEVICE)); - 
ACL_CHECK(aclrtMemcpy(v2Device, fileSize_v2, v2Host, fileSize_v2, ACL_MEMCPY_HOST_TO_DEVICE)); - LaunchVpack_lower_kernel_2d(v1Device, v2Device, stream); - - ACL_CHECK(aclrtSynchronizeStream(stream)); - ACL_CHECK(aclrtMemcpy(v2Host, fileSize_v2, v2Device, fileSize_v2, ACL_MEMCPY_DEVICE_TO_HOST)); - - WriteFile("./v2.bin", v2Host, fileSize_v2); - -cleanup: - aclrtFree(v1Device); - aclrtFree(v2Device); - aclrtFreeHost(v1Host); - aclrtFreeHost(v2Host); - if (stream != nullptr) { - const aclError _ret = aclrtDestroyStream(stream); - if (_ret != ACL_SUCCESS) { - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", - "aclrtDestroyStream(stream)", (int)_ret, __FILE__, __LINE__); - } - stream = nullptr; - } - if (deviceSet) { - const aclError _ret = aclrtResetDevice(deviceId); - if (_ret != ACL_SUCCESS) { - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", - "aclrtResetDevice(deviceId)", (int)_ret, __FILE__, __LINE__); - } - } - if (aclInited) { - const aclError _ret = aclFinalize(); - if (_ret != ACL_SUCCESS) { - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", - "aclFinalize()", (int)_ret, __FILE__, __LINE__); - } - } - - return rc; -} diff --git a/test/vpto/cases/micro-op/rearrangement/vsqz-nontrivial-mask/compare.py b/test/vpto/cases/micro-op/rearrangement/vsqz-nontrivial-mask/compare.py deleted file mode 100755 index 4cca2c574..000000000 --- a/test/vpto/cases/micro-op/rearrangement/vsqz-nontrivial-mask/compare.py +++ /dev/null @@ -1,208 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) 2026 Huawei Technologies Co., Ltd. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. 
-# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. - -# case: micro-op/rearrangement/vsqz-nontrivial-mask -# family: rearrangement -# target_ops: pto.vsqz -# scenarios: predicate-driven-rearrangement, stable-order, nontrivial-mask -# coding=utf-8 - -import os -import sys -import numpy as np - - -REPEAT_BYTES = 256 - - -def _ceil_div(x, y): - return (x + y - 1) // y - - -def _packed_pred_storage_bytes(logical_elems, src_elem_bytes): - logical_elems = int(logical_elems) - src_elem_bytes = int(src_elem_bytes) - if logical_elems <= 0: - raise ValueError(f"logical_elems must be > 0, got {logical_elems}") - if src_elem_bytes not in (1, 2, 4): - raise ValueError(f"unsupported packed predicate source size: {src_elem_bytes}") - - repeat_elems = REPEAT_BYTES // src_elem_bytes - if src_elem_bytes == 4: - repeat_times = _ceil_div(logical_elems, repeat_elems) + 1 - loop_count = repeat_times // 2 - return loop_count * 16 - - repeat_times = _ceil_div(logical_elems, repeat_elems) - return repeat_times * (repeat_elems // 8) - - -def compare_bin(golden_path, output_path, dtype, eps): - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - dtype_np = np.dtype(dtype) - golden = np.fromfile(golden_path, dtype=dtype_np) - output = np.fromfile(output_path, dtype=dtype_np) - if golden.shape != output.shape: - print(f"[ERROR] Shape mismatch: {golden_path} {golden.shape} vs {output_path} {output.shape}") - return False - if not np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True): - if golden.size: - if np.issubdtype(dtype_np, np.floating): - g = golden.astype(np.float64, copy=False) - 
o = output.astype(np.float64, copy=False) - elif np.issubdtype(dtype_np, np.integer) or np.issubdtype(dtype_np, np.unsignedinteger): - g = golden.astype(np.int64, copy=False) - o = output.astype(np.int64, copy=False) - else: - g = golden.astype(np.float64, copy=False) - o = output.astype(np.float64, copy=False) - abs_diff = np.abs(g - o) - idx = int(np.argmax(abs_diff)) - diff = float(abs_diff[idx]) - print( - f"[ERROR] Mismatch: {golden_path} vs {output_path}, max diff={diff} at idx={idx} " - f"(golden={g[idx]}, out={o[idx]}, dtype={dtype_np})" - ) - else: - print(f"[ERROR] Mismatch: {golden_path} vs {output_path}, empty buffers, dtype={dtype_np}") - return False - return True - - -def compare_bin_prefix(golden_path, output_path, dtype, eps, count): - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - try: - count = int(count) - except Exception: - print(f"[ERROR] Invalid prefix count: {count}") - return False - if count <= 0: - print(f"[ERROR] Invalid prefix count: {count}") - return False - - dtype_np = np.dtype(dtype) - golden = np.fromfile(golden_path, dtype=dtype_np, count=count) - output = np.fromfile(output_path, dtype=dtype_np, count=count) - - if golden.size != count or output.size != count: - print( - f"[ERROR] Prefix read too small: need={count} elems, " - f"golden={golden.size}, out={output.size}" - ) - return False - - if not np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True): - if golden.size: - if np.issubdtype(dtype_np, np.floating): - g = golden.astype(np.float64, copy=False) - o = output.astype(np.float64, copy=False) - elif np.issubdtype(dtype_np, np.integer) or np.issubdtype(dtype_np, np.unsignedinteger): - g = golden.astype(np.int64, copy=False) - o = output.astype(np.int64, copy=False) - else: - g = golden.astype(np.float64, copy=False) - o = output.astype(np.float64, 
copy=False) - abs_diff = np.abs(g - o) - idx = int(np.argmax(abs_diff)) - diff = float(abs_diff[idx]) - print( - f"[ERROR] Mismatch (prefix): {golden_path} vs {output_path}, max diff={diff} at idx={idx} " - f"(golden={g[idx]}, out={o[idx]}, dtype={dtype_np}, count={count})" - ) - else: - print(f"[ERROR] Mismatch (prefix): {golden_path} vs {output_path}, empty buffers, dtype={dtype_np}") - return False - return True - - -def compare_packed_pred_mask(golden_path, output_path, logical_elems, src_elem_bytes): - """ - Compare outputs of pto.tcmp / pto.tcmps. - - PTO-ISA stores packed predicate results as a linear PK byte stream via - `psts`, with the exact written prefix length determined by the typed - TCMP/TCMPS repeat schedule. Compare only that semantic prefix. - """ - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - try: - logical_elems = int(logical_elems) - src_elem_bytes = int(src_elem_bytes) - except Exception: - print( - "[ERROR] Invalid packed mask compare arguments: " - f"logical_elems={logical_elems} src_elem_bytes={src_elem_bytes}" - ) - return False - if logical_elems <= 0 or src_elem_bytes <= 0: - print( - "[ERROR] Invalid packed mask compare arguments: " - f"logical_elems={logical_elems} src_elem_bytes={src_elem_bytes}" - ) - return False - - golden = np.fromfile(golden_path, dtype=np.uint8) - output = np.fromfile(output_path, dtype=np.uint8) - try: - prefix_bytes = _packed_pred_storage_bytes(logical_elems, src_elem_bytes) - except ValueError as exc: - print(f"[ERROR] {exc}") - return False - - if golden.size < prefix_bytes or output.size < prefix_bytes: - print( - f"[ERROR] Packed mask buffer too small: need={prefix_bytes} bytes, " - f"golden={golden.size}, out={output.size}" - ) - return False - - golden_sel = golden[:prefix_bytes] - output_sel = output[:prefix_bytes] - - if not 
np.array_equal(golden_sel, output_sel): - diff = np.nonzero(golden_sel != output_sel)[0] - idx = int(diff[0]) if diff.size else 0 - print( - f"[ERROR] Mismatch (packed mask): {golden_path} vs {output_path}, first diff at idx={idx} " - f"(golden={int(golden_sel[idx])}, out={int(output_sel[idx])}, " - f"logical_elems={logical_elems}, src_elem_bytes={src_elem_bytes}, prefix_bytes={prefix_bytes})" - ) - return False - return True - - -def main(): - strict = os.getenv("COMPARE_STRICT", "1") != "0" - ok = True - ok = compare_bin("golden_v2.bin", "v2.bin", np.float32, 0.0001) and ok - if not ok: - if strict: - print("[ERROR] compare failed") - sys.exit(2) - print("[WARN] compare failed (non-gating)") - return - print("[INFO] compare passed") - - -if __name__ == "__main__": - main() diff --git a/test/vpto/cases/micro-op/rearrangement/vsqz-nontrivial-mask/golden.py b/test/vpto/cases/micro-op/rearrangement/vsqz-nontrivial-mask/golden.py deleted file mode 100755 index e7d456a40..000000000 --- a/test/vpto/cases/micro-op/rearrangement/vsqz-nontrivial-mask/golden.py +++ /dev/null @@ -1,59 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) 2026 Huawei Technologies Co., Ltd. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. 
- -# case: micro-op/rearrangement/vsqz-nontrivial-mask -# family: rearrangement -# target_ops: pto.vsqz -# scenarios: predicate-driven-rearrangement, stable-order, nontrivial-mask -# coding=utf-8 - -import argparse -from pathlib import Path - -import numpy as np - - -ROWS = 32 -COLS = 32 -LANES = 64 -BLOCKS = ROWS * COLS // LANES -ACTIVE_POSITIONS = [1, 4, 5, 9, 12, 16, 21, 24, 29, 33, 36, 40, 45, 49, 54, 60] -SEED = 19 - - -def generate(output_dir: Path, seed: int) -> None: - rng = np.random.default_rng(seed) - values = rng.uniform(-8.0, 8.0, size=(BLOCKS, LANES)).astype(np.float32) - mask_seed = np.full((BLOCKS, LANES), -1.0, dtype=np.float32) - golden = np.zeros((BLOCKS, LANES), dtype=np.float32) - - for block in range(BLOCKS): - for pos in ACTIVE_POSITIONS: - mask_seed[block, pos] = 1.0 - kept = values[block, ACTIVE_POSITIONS] - golden[block, :kept.size] = kept - - output_dir.mkdir(parents=True, exist_ok=True) - values.reshape(-1).tofile(output_dir / "v1.bin") - mask_seed.reshape(-1).tofile(output_dir / "v2.bin") - golden.reshape(-1).tofile(output_dir / "golden_v2.bin") - - -def main() -> None: - parser = argparse.ArgumentParser( - description="Generate nontrivial-mask inputs/golden for VPTO micro-op vsqz validation." 
- ) - parser.add_argument("--output-dir", type=Path, default=Path(".")) - parser.add_argument("--seed", type=int, default=SEED) - args = parser.parse_args() - generate(args.output_dir, args.seed) - - -if __name__ == "__main__": - main() diff --git a/test/vpto/cases/micro-op/rearrangement/vsqz-nontrivial-mask/kernel.pto b/test/vpto/cases/micro-op/rearrangement/vsqz-nontrivial-mask/kernel.pto deleted file mode 100644 index b2e00d2df..000000000 --- a/test/vpto/cases/micro-op/rearrangement/vsqz-nontrivial-mask/kernel.pto +++ /dev/null @@ -1,65 +0,0 @@ -// ----------------------------------------------------------------------------- -// case: micro-op/rearrangement/vsqz-nontrivial-mask -// family: rearrangement -// target_ops: pto.vsqz -// scenarios: predicate-driven-rearrangement, stable-order, nontrivial-mask -// ----------------------------------------------------------------------------- -// Validate nontrivial predicate-driven compaction: -// - arg0 provides input values. -// - arg1 provides a mask seed (positive => keep lane; non-positive => drop lane) -// and receives the compacted output. -// For each 64-lane chunk: -// 1. Build placement mask via vcmps(mask_seed > 0). -// 2. Run vsqz using that placement mask. -// 3. Store full compacted vector (kept lanes first, tail zeroed) back to UB. 
- -module attributes {pto.target_arch = "a5", pto.kernel_kind = #pto.kernel_kind} { - func.func @vsqz_nontrivial_mask_kernel_2d(%arg0: !pto.ptr, %arg1: !pto.ptr) attributes {pto.kernel} { - %c0 = arith.constant 0 : index - %c1 = arith.constant 1 : index - %c64 = arith.constant 64 : index - %c1024 = arith.constant 1024 : index - %c0_i64 = arith.constant 0 : i64 - %c1_i64 = arith.constant 1 : i64 - %c64_i32 = arith.constant 64 : i32 - %c32_i64 = arith.constant 32 : i64 - %c128_i64 = arith.constant 128 : i64 - %c4096_i64 = arith.constant 4096 : i64 - %c8192_i64 = arith.constant 8192 : i64 - %zero_f32 = arith.constant 0.0 : f32 - - %ub_in = pto.castptr %c0_i64 : i64 -> !pto.ptr - %ub_mask_seed = pto.castptr %c4096_i64 : i64 -> !pto.ptr - %ub_out = pto.castptr %c8192_i64 : i64 -> !pto.ptr - - %false = arith.constant false - pto.mte_gm_ub %arg0, %ub_in, %c0_i64, %c128_i64 - nburst(%c32_i64, %c128_i64, %c128_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 - pto.mte_gm_ub %arg1, %ub_mask_seed, %c0_i64, %c128_i64 - nburst(%c32_i64, %c128_i64, %c128_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 - - pto.set_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] - pto.wait_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] - - pto.vecscope { - scf.for %offset = %c0 to %c1024 step %c64 { - %store_mask, %unused = pto.plt_b32 %c64_i32 : i32 -> !pto.mask, i32 - %vec = pto.vlds %ub_in[%offset] : !pto.ptr -> !pto.vreg<64xf32> - %mask_seed = pto.vlds %ub_mask_seed[%offset] : !pto.ptr -> !pto.vreg<64xf32> - %place = pto.vcmps %mask_seed, %zero_f32, %store_mask, "gt" : !pto.vreg<64xf32>, f32, !pto.mask -> !pto.mask - %out = pto.vsqz %vec, %place : !pto.vreg<64xf32>, !pto.mask -> !pto.vreg<64xf32> - pto.vsts %out, %ub_out[%offset], %store_mask : !pto.vreg<64xf32>, !pto.ptr, !pto.mask - } - } - - pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.mte_ub_gm %ub_out, %arg1, %c128_i64 - nburst(%c32_i64, %c128_i64, %c128_i64) - : !pto.ptr, 
!pto.ptr, i64, i64, i64, i64 - pto.barrier #pto.pipe - return - } -} diff --git a/test/vpto/cases/micro-op/rearrangement/vsqz-nontrivial-mask/launch.cpp b/test/vpto/cases/micro-op/rearrangement/vsqz-nontrivial-mask/launch.cpp deleted file mode 100644 index be43cc98f..000000000 --- a/test/vpto/cases/micro-op/rearrangement/vsqz-nontrivial-mask/launch.cpp +++ /dev/null @@ -1,70 +0,0 @@ -// Copyright (c) 2026 Huawei Technologies Co., Ltd. -// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). -// Please refer to the License for details. You may not use this file except in compliance with the License. -// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -// See LICENSE in the root of the software repository for the full text of the License. - -// ----------------------------------------------------------------------------- -// case: micro-op/rearrangement/vsqz-nontrivial-mask -// family: rearrangement -// target_ops: pto.vsqz -// scenarios: predicate-driven-rearrangement, stable-order -// NOTE: bulk-generated coverage skeleton. Parser/verifier/lowering failure is -// still a valid test conclusion in the current coverage-first phase. -// ----------------------------------------------------------------------------- -// --------------------------------------------------------------------------- -// PTOAS compatibility layer -// -// The upstream pto-isa headers reference some FP8/FP4 types and the -// __VEC_SCOPE__ marker that are not available on every AICore arch/toolchain -// combination (e.g. __NPU_ARCH__==2201). -// -// For our PTOAS-generated kernels we don't rely on these types today, but the -// headers still mention them in templates/static_asserts. 
Provide minimal -// fallbacks to keep compilation working on dav-c220. -// --------------------------------------------------------------------------- -#ifndef __VEC_SCOPE__ -#define __VEC_SCOPE__ -#endif - -#if defined(__CCE_AICORE__) && defined(__NPU_ARCH__) && (__NPU_ARCH__ == 2201) -typedef struct { unsigned char v; } hifloat8_t; -typedef struct { unsigned char v; } float8_e4m3_t; -typedef struct { unsigned char v; } float8_e5m2_t; -typedef struct { unsigned char v; } float8_e8m0_t; -typedef struct { unsigned char v; } float4_e1m2x2_t; -typedef struct { unsigned char v; } float4_e2m1x2_t; -#endif -#include - -// AICore printf support is gated behind `--cce-enable-print` on some -// toolchains. When enabled, include the CCE print header so `cce::printf` -// resolves in device compilation. -#if defined(__CCE_AICORE__) && defined(PTOAS_ENABLE_CCE_PRINT) -#include -#endif - -// Some PTO-ISA types are only available in the __CCE_AICORE__ compilation -// path, but `bisheng -xcce` still performs a host-side parse pass. -// Provide minimal fallbacks only when the corresponding header wasn't -// pulled in by the selected arch implementation. 
-#if !defined(__CCE_AICORE__) && !defined(TMRGSORT_HPP) -struct MrgSortExecutedNumList { - uint16_t mrgSortList0; - uint16_t mrgSortList1; - uint16_t mrgSortList2; - uint16_t mrgSortList3; -}; -#endif -#ifndef __CPU_SIM -#include "acl/acl.h" -#endif - -extern "C" __global__ [aicore] void vsqz_nontrivial_mask_kernel_2d(__gm__ float *v1, - __gm__ float *v2); - -void LaunchVsqzNontrivialMask_kernel_2d(float *v1, float *v2, void *stream) { - vsqz_nontrivial_mask_kernel_2d<<<1, nullptr, stream>>>((__gm__ float *)v1, (__gm__ float *)v2); -} diff --git a/test/vpto/cases/micro-op/rearrangement/vsqz-nontrivial-mask/main.cpp b/test/vpto/cases/micro-op/rearrangement/vsqz-nontrivial-mask/main.cpp deleted file mode 100644 index 8d467ea02..000000000 --- a/test/vpto/cases/micro-op/rearrangement/vsqz-nontrivial-mask/main.cpp +++ /dev/null @@ -1,130 +0,0 @@ -// Copyright (c) 2026 Huawei Technologies Co., Ltd. -// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). -// Please refer to the License for details. You may not use this file except in compliance with the License. -// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -// See LICENSE in the root of the software repository for the full text of the License. - -// ----------------------------------------------------------------------------- -// case: micro-op/rearrangement/vsqz-nontrivial-mask -// family: rearrangement -// target_ops: pto.vsqz -// scenarios: predicate-driven-rearrangement, stable-order -// NOTE: bulk-generated coverage skeleton. Parser/verifier/lowering failure is -// still a valid test conclusion in the current coverage-first phase. 
-// ----------------------------------------------------------------------------- -/** -Copyright (c) 2025 Huawei Technologies Co., Ltd. -This program is free software, you can redistribute it and/or modify it under the terms and conditions of -CANN Open Software License Agreement Version 2.0 (the "License"). -Please refer to the License for details. You may not use this file except in compliance with the License. -THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -See LICENSE in the root of the software repository for the full text of the License. -*/ - -#include "test_common.h" -#include "acl/acl.h" -#include -#include -#include - -using namespace PtoTestCommon; - -#ifndef TMRGSORT_HPP -struct MrgSortExecutedNumList { - uint16_t mrgSortList0; - uint16_t mrgSortList1; - uint16_t mrgSortList2; - uint16_t mrgSortList3; -}; -#endif - -#define ACL_CHECK(expr) \ - do { \ - const aclError _ret = (expr); \ - if (_ret != ACL_SUCCESS) { \ - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", #expr, (int)_ret, __FILE__, __LINE__); \ - const char *_recent = aclGetRecentErrMsg(); \ - if (_recent != nullptr && _recent[0] != '\0') { \ - std::fprintf(stderr, "[ERROR] RecentErrMsg: %s\n", _recent); \ - } \ - rc = 1; \ - goto cleanup; \ - } \ - } while (0) - -void LaunchVsqzNontrivialMask_kernel_2d(float *v1, float *v2, void *stream); - -int main() { - size_t elemCount_v1 = 1024; - size_t fileSize_v1 = elemCount_v1 * sizeof(float); - size_t elemCount_v2 = 1024; - size_t fileSize_v2 = elemCount_v2 * sizeof(float); - float *v1Host = nullptr; - float *v1Device = nullptr; - float *v2Host = nullptr; - float *v2Device = nullptr; - - int rc = 0; - bool aclInited = false; - bool deviceSet = false; - int deviceId = 0; - aclrtStream stream = nullptr; - - ACL_CHECK(aclInit(nullptr)); - aclInited = true; - if (const char *envDevice = 
std::getenv("ACL_DEVICE_ID")) { - deviceId = std::atoi(envDevice); - } - ACL_CHECK(aclrtSetDevice(deviceId)); - deviceSet = true; - ACL_CHECK(aclrtCreateStream(&stream)); - - ACL_CHECK(aclrtMallocHost((void **)(&v1Host), fileSize_v1)); - ACL_CHECK(aclrtMallocHost((void **)(&v2Host), fileSize_v2)); - ACL_CHECK(aclrtMalloc((void **)&v1Device, fileSize_v1, ACL_MEM_MALLOC_HUGE_FIRST)); - ACL_CHECK(aclrtMalloc((void **)&v2Device, fileSize_v2, ACL_MEM_MALLOC_HUGE_FIRST)); - - ReadFile("./v1.bin", fileSize_v1, v1Host, fileSize_v1); - ReadFile("./v2.bin", fileSize_v2, v2Host, fileSize_v2); - ACL_CHECK(aclrtMemcpy(v1Device, fileSize_v1, v1Host, fileSize_v1, ACL_MEMCPY_HOST_TO_DEVICE)); - ACL_CHECK(aclrtMemcpy(v2Device, fileSize_v2, v2Host, fileSize_v2, ACL_MEMCPY_HOST_TO_DEVICE)); - LaunchVsqzNontrivialMask_kernel_2d(v1Device, v2Device, stream); - - ACL_CHECK(aclrtSynchronizeStream(stream)); - ACL_CHECK(aclrtMemcpy(v2Host, fileSize_v2, v2Device, fileSize_v2, ACL_MEMCPY_DEVICE_TO_HOST)); - - WriteFile("./v2.bin", v2Host, fileSize_v2); - -cleanup: - aclrtFree(v1Device); - aclrtFree(v2Device); - aclrtFreeHost(v1Host); - aclrtFreeHost(v2Host); - if (stream != nullptr) { - const aclError _ret = aclrtDestroyStream(stream); - if (_ret != ACL_SUCCESS) { - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", - "aclrtDestroyStream(stream)", (int)_ret, __FILE__, __LINE__); - } - stream = nullptr; - } - if (deviceSet) { - const aclError _ret = aclrtResetDevice(deviceId); - if (_ret != ACL_SUCCESS) { - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", - "aclrtResetDevice(deviceId)", (int)_ret, __FILE__, __LINE__); - } - } - if (aclInited) { - const aclError _ret = aclFinalize(); - if (_ret != ACL_SUCCESS) { - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", - "aclFinalize()", (int)_ret, __FILE__, __LINE__); - } - } - - return rc; -} diff --git a/test/vpto/cases/micro-op/rearrangement/vsqz/compare.py b/test/vpto/cases/micro-op/rearrangement/vsqz/compare.py deleted file 
mode 100755 index f10e14e5f..000000000 --- a/test/vpto/cases/micro-op/rearrangement/vsqz/compare.py +++ /dev/null @@ -1,209 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) 2026 Huawei Technologies Co., Ltd. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. - -# case: micro-op/rearrangement/vsqz -# family: rearrangement -# target_ops: pto.vsqz -# scenarios: predicate-driven-rearrangement, stable-order -# NOTE: bulk-generated coverage skeleton. -# coding=utf-8 - -import os -import sys -import numpy as np - - -REPEAT_BYTES = 256 - - -def _ceil_div(x, y): - return (x + y - 1) // y - - -def _packed_pred_storage_bytes(logical_elems, src_elem_bytes): - logical_elems = int(logical_elems) - src_elem_bytes = int(src_elem_bytes) - if logical_elems <= 0: - raise ValueError(f"logical_elems must be > 0, got {logical_elems}") - if src_elem_bytes not in (1, 2, 4): - raise ValueError(f"unsupported packed predicate source size: {src_elem_bytes}") - - repeat_elems = REPEAT_BYTES // src_elem_bytes - if src_elem_bytes == 4: - repeat_times = _ceil_div(logical_elems, repeat_elems) + 1 - loop_count = repeat_times // 2 - return loop_count * 16 - - repeat_times = _ceil_div(logical_elems, repeat_elems) - return repeat_times * (repeat_elems // 8) - - -def compare_bin(golden_path, output_path, dtype, eps): - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] 
Golden missing: {golden_path}") - return False - dtype_np = np.dtype(dtype) - golden = np.fromfile(golden_path, dtype=dtype_np) - output = np.fromfile(output_path, dtype=dtype_np) - if golden.shape != output.shape: - print(f"[ERROR] Shape mismatch: {golden_path} {golden.shape} vs {output_path} {output.shape}") - return False - if not np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True): - if golden.size: - if np.issubdtype(dtype_np, np.floating): - g = golden.astype(np.float64, copy=False) - o = output.astype(np.float64, copy=False) - elif np.issubdtype(dtype_np, np.integer) or np.issubdtype(dtype_np, np.unsignedinteger): - g = golden.astype(np.int64, copy=False) - o = output.astype(np.int64, copy=False) - else: - g = golden.astype(np.float64, copy=False) - o = output.astype(np.float64, copy=False) - abs_diff = np.abs(g - o) - idx = int(np.argmax(abs_diff)) - diff = float(abs_diff[idx]) - print( - f"[ERROR] Mismatch: {golden_path} vs {output_path}, max diff={diff} at idx={idx} " - f"(golden={g[idx]}, out={o[idx]}, dtype={dtype_np})" - ) - else: - print(f"[ERROR] Mismatch: {golden_path} vs {output_path}, empty buffers, dtype={dtype_np}") - return False - return True - - -def compare_bin_prefix(golden_path, output_path, dtype, eps, count): - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - try: - count = int(count) - except Exception: - print(f"[ERROR] Invalid prefix count: {count}") - return False - if count <= 0: - print(f"[ERROR] Invalid prefix count: {count}") - return False - - dtype_np = np.dtype(dtype) - golden = np.fromfile(golden_path, dtype=dtype_np, count=count) - output = np.fromfile(output_path, dtype=dtype_np, count=count) - - if golden.size != count or output.size != count: - print( - f"[ERROR] Prefix read too small: need={count} elems, " - f"golden={golden.size}, out={output.size}" - ) 
- return False - - if not np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True): - if golden.size: - if np.issubdtype(dtype_np, np.floating): - g = golden.astype(np.float64, copy=False) - o = output.astype(np.float64, copy=False) - elif np.issubdtype(dtype_np, np.integer) or np.issubdtype(dtype_np, np.unsignedinteger): - g = golden.astype(np.int64, copy=False) - o = output.astype(np.int64, copy=False) - else: - g = golden.astype(np.float64, copy=False) - o = output.astype(np.float64, copy=False) - abs_diff = np.abs(g - o) - idx = int(np.argmax(abs_diff)) - diff = float(abs_diff[idx]) - print( - f"[ERROR] Mismatch (prefix): {golden_path} vs {output_path}, max diff={diff} at idx={idx} " - f"(golden={g[idx]}, out={o[idx]}, dtype={dtype_np}, count={count})" - ) - else: - print(f"[ERROR] Mismatch (prefix): {golden_path} vs {output_path}, empty buffers, dtype={dtype_np}") - return False - return True - - -def compare_packed_pred_mask(golden_path, output_path, logical_elems, src_elem_bytes): - """ - Compare outputs of pto.tcmp / pto.tcmps. - - PTO-ISA stores packed predicate results as a linear PK byte stream via - `psts`, with the exact written prefix length determined by the typed - TCMP/TCMPS repeat schedule. Compare only that semantic prefix. 
- """ - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - try: - logical_elems = int(logical_elems) - src_elem_bytes = int(src_elem_bytes) - except Exception: - print( - "[ERROR] Invalid packed mask compare arguments: " - f"logical_elems={logical_elems} src_elem_bytes={src_elem_bytes}" - ) - return False - if logical_elems <= 0 or src_elem_bytes <= 0: - print( - "[ERROR] Invalid packed mask compare arguments: " - f"logical_elems={logical_elems} src_elem_bytes={src_elem_bytes}" - ) - return False - - golden = np.fromfile(golden_path, dtype=np.uint8) - output = np.fromfile(output_path, dtype=np.uint8) - try: - prefix_bytes = _packed_pred_storage_bytes(logical_elems, src_elem_bytes) - except ValueError as exc: - print(f"[ERROR] {exc}") - return False - - if golden.size < prefix_bytes or output.size < prefix_bytes: - print( - f"[ERROR] Packed mask buffer too small: need={prefix_bytes} bytes, " - f"golden={golden.size}, out={output.size}" - ) - return False - - golden_sel = golden[:prefix_bytes] - output_sel = output[:prefix_bytes] - - if not np.array_equal(golden_sel, output_sel): - diff = np.nonzero(golden_sel != output_sel)[0] - idx = int(diff[0]) if diff.size else 0 - print( - f"[ERROR] Mismatch (packed mask): {golden_path} vs {output_path}, first diff at idx={idx} " - f"(golden={int(golden_sel[idx])}, out={int(output_sel[idx])}, " - f"logical_elems={logical_elems}, src_elem_bytes={src_elem_bytes}, prefix_bytes={prefix_bytes})" - ) - return False - return True - - -def main(): - strict = os.getenv("COMPARE_STRICT", "1") != "0" - ok = True - ok = compare_bin("golden_v2.bin", "v2.bin", np.float32, 0.0001) and ok - if not ok: - if strict: - print("[ERROR] compare failed") - sys.exit(2) - print("[WARN] compare failed (non-gating)") - return - print("[INFO] compare passed") - - -if __name__ == "__main__": - main() 
diff --git a/test/vpto/cases/micro-op/rearrangement/vsqz/golden.py b/test/vpto/cases/micro-op/rearrangement/vsqz/golden.py deleted file mode 100755 index 5722d4362..000000000 --- a/test/vpto/cases/micro-op/rearrangement/vsqz/golden.py +++ /dev/null @@ -1,51 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) 2026 Huawei Technologies Co., Ltd. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. - -# case: micro-op/rearrangement/vsqz -# family: rearrangement -# target_ops: pto.vsqz -# scenarios: predicate-driven-rearrangement, stable-order -# NOTE: full-mask compaction should preserve original lane order. -# coding=utf-8 - -import argparse -from pathlib import Path - -import numpy as np - - -ROWS = 32 -COLS = 32 -SEED = 19 - - -def generate(output_dir: Path, seed: int) -> None: - rng = np.random.default_rng(seed) - v1 = rng.uniform(-8.0, 8.0, size=(ROWS, COLS)).astype(np.float32) - v2 = np.zeros((ROWS, COLS), dtype=np.float32) - golden_v2 = v1.copy() - - output_dir.mkdir(parents=True, exist_ok=True) - v1.reshape(-1).tofile(output_dir / "v1.bin") - v2.reshape(-1).tofile(output_dir / "v2.bin") - golden_v2.reshape(-1).tofile(output_dir / "golden_v2.bin") - - -def main() -> None: - parser = argparse.ArgumentParser( - description="Generate numpy-based inputs/golden for VPTO micro-op vsqz validation." 
- ) - parser.add_argument("--output-dir", type=Path, default=Path(".")) - parser.add_argument("--seed", type=int, default=SEED) - args = parser.parse_args() - generate(args.output_dir, args.seed) - - -if __name__ == "__main__": - main() diff --git a/test/vpto/cases/micro-op/rearrangement/vsqz/kernel.pto b/test/vpto/cases/micro-op/rearrangement/vsqz/kernel.pto deleted file mode 100644 index 614d4ec87..000000000 --- a/test/vpto/cases/micro-op/rearrangement/vsqz/kernel.pto +++ /dev/null @@ -1,70 +0,0 @@ -// ----------------------------------------------------------------------------- -// case: micro-op/rearrangement/vsqz -// family: rearrangement -// target_ops: pto.vsqz -// scenarios: predicate-driven-rearrangement, stable-order -// NOTE: bulk-generated coverage skeleton. Parser/verifier/lowering failure is -// still a valid test conclusion in the current coverage-first phase. -// ----------------------------------------------------------------------------- -// ============================================================================= -// abs_kernel_2d: Element-wise absolute value on a 32x32 f32 tile -// ============================================================================= -// This kernel computes abs(input) for a 32x32 float32 matrix (1024 elements). -// -// Memory Layout: -// - Input: arg0 -> GM (Global Memory) -// - Output: arg1 -> GM (Global Memory) -// - UB (Unified Buffer) at offset 0: input tile (4096 bytes = 32*32*4) -// - UB at offset 4096: output tile (4096 bytes = 32*32*4) -// -// Pipeline: -// 1. DMA: GM -> UB (MTE2 pipe) - copy input tile to UB -// 2. Sync: wait for MTE2 -> V pipe handoff -// 3. Compute: vabs on 64-element vectors (V pipe) - 16 iterations for 1024 elements -// 4. Sync: wait for V -> MTE3 pipe handoff -// 5. 
DMA: UB -> GM (MTE3 pipe) - copy result tile back to GM -// ============================================================================= - -module attributes {pto.target_arch = "a5", pto.kernel_kind = #pto.kernel_kind} { - func.func @vsqz_kernel_2d(%arg0: !pto.ptr, %arg1: !pto.ptr) attributes {pto.kernel} { - %c0 = arith.constant 0 : index - %c1 = arith.constant 1 : index - %c64 = arith.constant 64 : index - %c1024 = arith.constant 1024 : index - %c0_i64 = arith.constant 0 : i64 - %c1_i64 = arith.constant 1 : i64 - %c32_i64 = arith.constant 32 : i64 - %c128_i64 = arith.constant 128 : i64 - %c4096_i64 = arith.constant 4096 : i64 - %c1024_i32 = arith.constant 1024 : i32 - - %ub_in = pto.castptr %c0_i64 : i64 -> !pto.ptr - %ub_out = pto.castptr %c4096_i64 : i64 -> !pto.ptr - - %false = arith.constant false - pto.mte_gm_ub %arg0, %ub_in, %c0_i64, %c128_i64 - nburst(%c32_i64, %c128_i64, %c128_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 - - pto.set_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] - pto.wait_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] - - pto.vecscope { - %_:1 = scf.for %offset = %c0 to %c1024 step %c64 iter_args(%remaining = %c1024_i32) -> (i32) { - %mask, %next_remaining = pto.plt_b32 %remaining : i32 -> !pto.mask, i32 - %vec = pto.vlds %ub_in[%offset] : !pto.ptr -> !pto.vreg<64xf32> - %out = pto.vsqz %vec, %mask : !pto.vreg<64xf32>, !pto.mask -> !pto.vreg<64xf32> - pto.vsts %out, %ub_out[%offset], %mask : !pto.vreg<64xf32>, !pto.ptr, !pto.mask - scf.yield %next_remaining : i32 - } - } - - pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.mte_ub_gm %ub_out, %arg1, %c128_i64 - nburst(%c32_i64, %c128_i64, %c128_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64 - pto.barrier #pto.pipe - return - } -} diff --git a/test/vpto/cases/micro-op/rearrangement/vsqz/launch.cpp b/test/vpto/cases/micro-op/rearrangement/vsqz/launch.cpp deleted file mode 100644 index 511bb9f16..000000000 --- 
a/test/vpto/cases/micro-op/rearrangement/vsqz/launch.cpp +++ /dev/null @@ -1,70 +0,0 @@ -// Copyright (c) 2026 Huawei Technologies Co., Ltd. -// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). -// Please refer to the License for details. You may not use this file except in compliance with the License. -// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -// See LICENSE in the root of the software repository for the full text of the License. - -// ----------------------------------------------------------------------------- -// case: micro-op/rearrangement/vsqz -// family: rearrangement -// target_ops: pto.vsqz -// scenarios: predicate-driven-rearrangement, stable-order -// NOTE: bulk-generated coverage skeleton. Parser/verifier/lowering failure is -// still a valid test conclusion in the current coverage-first phase. -// ----------------------------------------------------------------------------- -// --------------------------------------------------------------------------- -// PTOAS compatibility layer -// -// The upstream pto-isa headers reference some FP8/FP4 types and the -// __VEC_SCOPE__ marker that are not available on every AICore arch/toolchain -// combination (e.g. __NPU_ARCH__==2201). -// -// For our PTOAS-generated kernels we don't rely on these types today, but the -// headers still mention them in templates/static_asserts. Provide minimal -// fallbacks to keep compilation working on dav-c220. 
-// --------------------------------------------------------------------------- -#ifndef __VEC_SCOPE__ -#define __VEC_SCOPE__ -#endif - -#if defined(__CCE_AICORE__) && defined(__NPU_ARCH__) && (__NPU_ARCH__ == 2201) -typedef struct { unsigned char v; } hifloat8_t; -typedef struct { unsigned char v; } float8_e4m3_t; -typedef struct { unsigned char v; } float8_e5m2_t; -typedef struct { unsigned char v; } float8_e8m0_t; -typedef struct { unsigned char v; } float4_e1m2x2_t; -typedef struct { unsigned char v; } float4_e2m1x2_t; -#endif -#include - -// AICore printf support is gated behind `--cce-enable-print` on some -// toolchains. When enabled, include the CCE print header so `cce::printf` -// resolves in device compilation. -#if defined(__CCE_AICORE__) && defined(PTOAS_ENABLE_CCE_PRINT) -#include -#endif - -// Some PTO-ISA types are only available in the __CCE_AICORE__ compilation -// path, but `bisheng -xcce` still performs a host-side parse pass. -// Provide minimal fallbacks only when the corresponding header wasn't -// pulled in by the selected arch implementation. -#if !defined(__CCE_AICORE__) && !defined(TMRGSORT_HPP) -struct MrgSortExecutedNumList { - uint16_t mrgSortList0; - uint16_t mrgSortList1; - uint16_t mrgSortList2; - uint16_t mrgSortList3; -}; -#endif -#ifndef __CPU_SIM -#include "acl/acl.h" -#endif - -extern "C" __global__ [aicore] void vsqz_kernel_2d(__gm__ float *v1, - __gm__ float *v2); - -void LaunchVsqz_kernel_2d(float *v1, float *v2, void *stream) { - vsqz_kernel_2d<<<1, nullptr, stream>>>((__gm__ float *)v1, (__gm__ float *)v2); -} diff --git a/test/vpto/cases/micro-op/rearrangement/vsqz/main.cpp b/test/vpto/cases/micro-op/rearrangement/vsqz/main.cpp deleted file mode 100644 index ddb3c318b..000000000 --- a/test/vpto/cases/micro-op/rearrangement/vsqz/main.cpp +++ /dev/null @@ -1,130 +0,0 @@ -// Copyright (c) 2026 Huawei Technologies Co., Ltd. 
-// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). -// Please refer to the License for details. You may not use this file except in compliance with the License. -// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -// See LICENSE in the root of the software repository for the full text of the License. - -// ----------------------------------------------------------------------------- -// case: micro-op/rearrangement/vsqz -// family: rearrangement -// target_ops: pto.vsqz -// scenarios: predicate-driven-rearrangement, stable-order -// NOTE: bulk-generated coverage skeleton. Parser/verifier/lowering failure is -// still a valid test conclusion in the current coverage-first phase. -// ----------------------------------------------------------------------------- -/** -Copyright (c) 2025 Huawei Technologies Co., Ltd. -This program is free software, you can redistribute it and/or modify it under the terms and conditions of -CANN Open Software License Agreement Version 2.0 (the "License"). -Please refer to the License for details. You may not use this file except in compliance with the License. -THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -See LICENSE in the root of the software repository for the full text of the License. 
-*/ - -#include "test_common.h" -#include "acl/acl.h" -#include -#include -#include - -using namespace PtoTestCommon; - -#ifndef TMRGSORT_HPP -struct MrgSortExecutedNumList { - uint16_t mrgSortList0; - uint16_t mrgSortList1; - uint16_t mrgSortList2; - uint16_t mrgSortList3; -}; -#endif - -#define ACL_CHECK(expr) \ - do { \ - const aclError _ret = (expr); \ - if (_ret != ACL_SUCCESS) { \ - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", #expr, (int)_ret, __FILE__, __LINE__); \ - const char *_recent = aclGetRecentErrMsg(); \ - if (_recent != nullptr && _recent[0] != '\0') { \ - std::fprintf(stderr, "[ERROR] RecentErrMsg: %s\n", _recent); \ - } \ - rc = 1; \ - goto cleanup; \ - } \ - } while (0) - -void LaunchVsqz_kernel_2d(float *v1, float *v2, void *stream); - -int main() { - size_t elemCount_v1 = 1024; - size_t fileSize_v1 = elemCount_v1 * sizeof(float); - size_t elemCount_v2 = 1024; - size_t fileSize_v2 = elemCount_v2 * sizeof(float); - float *v1Host = nullptr; - float *v1Device = nullptr; - float *v2Host = nullptr; - float *v2Device = nullptr; - - int rc = 0; - bool aclInited = false; - bool deviceSet = false; - int deviceId = 0; - aclrtStream stream = nullptr; - - ACL_CHECK(aclInit(nullptr)); - aclInited = true; - if (const char *envDevice = std::getenv("ACL_DEVICE_ID")) { - deviceId = std::atoi(envDevice); - } - ACL_CHECK(aclrtSetDevice(deviceId)); - deviceSet = true; - ACL_CHECK(aclrtCreateStream(&stream)); - - ACL_CHECK(aclrtMallocHost((void **)(&v1Host), fileSize_v1)); - ACL_CHECK(aclrtMallocHost((void **)(&v2Host), fileSize_v2)); - ACL_CHECK(aclrtMalloc((void **)&v1Device, fileSize_v1, ACL_MEM_MALLOC_HUGE_FIRST)); - ACL_CHECK(aclrtMalloc((void **)&v2Device, fileSize_v2, ACL_MEM_MALLOC_HUGE_FIRST)); - - ReadFile("./v1.bin", fileSize_v1, v1Host, fileSize_v1); - ReadFile("./v2.bin", fileSize_v2, v2Host, fileSize_v2); - ACL_CHECK(aclrtMemcpy(v1Device, fileSize_v1, v1Host, fileSize_v1, ACL_MEMCPY_HOST_TO_DEVICE)); - ACL_CHECK(aclrtMemcpy(v2Device, 
fileSize_v2, v2Host, fileSize_v2, ACL_MEMCPY_HOST_TO_DEVICE)); - LaunchVsqz_kernel_2d(v1Device, v2Device, stream); - - ACL_CHECK(aclrtSynchronizeStream(stream)); - ACL_CHECK(aclrtMemcpy(v2Host, fileSize_v2, v2Device, fileSize_v2, ACL_MEMCPY_DEVICE_TO_HOST)); - - WriteFile("./v2.bin", v2Host, fileSize_v2); - -cleanup: - aclrtFree(v1Device); - aclrtFree(v2Device); - aclrtFreeHost(v1Host); - aclrtFreeHost(v2Host); - if (stream != nullptr) { - const aclError _ret = aclrtDestroyStream(stream); - if (_ret != ACL_SUCCESS) { - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", - "aclrtDestroyStream(stream)", (int)_ret, __FILE__, __LINE__); - } - stream = nullptr; - } - if (deviceSet) { - const aclError _ret = aclrtResetDevice(deviceId); - if (_ret != ACL_SUCCESS) { - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", - "aclrtResetDevice(deviceId)", (int)_ret, __FILE__, __LINE__); - } - } - if (aclInited) { - const aclError _ret = aclFinalize(); - if (_ret != ACL_SUCCESS) { - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", - "aclFinalize()", (int)_ret, __FILE__, __LINE__); - } - } - - return rc; -} diff --git a/test/vpto/cases/micro-op/rearrangement/vsunpack/kernel.pto b/test/vpto/cases/micro-op/rearrangement/vsunpack/kernel.pto index bd98ba521..dbe382a45 100644 --- a/test/vpto/cases/micro-op/rearrangement/vsunpack/kernel.pto +++ b/test/vpto/cases/micro-op/rearrangement/vsunpack/kernel.pto @@ -67,6 +67,52 @@ module attributes {pto.target_arch = "a5", pto.kernel_kind = #pto.kernel_kind, !pto.ptr, i64, i64, i64, i64 pto.barrier #pto.pipe + %__case_merge_guard = arith.constant false + + // inactive cross-case merge from micro-op/rearrangement/vzunpack + scf.if %__case_merge_guard { + + %c0_cmg13_1 = arith.constant 0 : index + %c64_cmg13_1 = arith.constant 64 : index + %c2_cmg13_1 = arith.constant 2 : index + %c1024_cmg13_1 = arith.constant 1024 : index + %c0_i64_cmg13_1 = arith.constant 0 : i64 + %c1_i64_cmg13_1 = arith.constant 1 : i64 + %c32_i64_cmg13_1 = 
arith.constant 32 : i64 + %c128_i64_cmg13_1 = arith.constant 128 : i64 + %c4096_i64_cmg13_1 = arith.constant 4096 : i64 + %part_cmg13_1 = arith.constant 0 : index + + %gm_in_cmg13_1 = pto.castptr %arg0 : !pto.ptr -> !pto.ptr + %gm_out_cmg13_1 = pto.castptr %arg1 : !pto.ptr -> !pto.ptr + %ub_in_cmg13_1 = pto.castptr %c0_i64_cmg13_1 : i64 -> !pto.ptr + %ub_out_cmg13_1 = pto.castptr %c4096_i64_cmg13_1 : i64 -> !pto.ptr + + %false_cmg13_1 = arith.constant false + pto.mte_gm_ub %gm_in_cmg13_1, %ub_in_cmg13_1, %c0_i64_cmg13_1, %c128_i64_cmg13_1 + nburst(%c32_i64_cmg13_1, %c128_i64_cmg13_1, %c128_i64_cmg13_1) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 + + pto.set_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] + pto.wait_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] + + pto.vecscope { + %store_mask_cmg13_1 = pto.pset_b32 "PAT_ALL" : !pto.mask + scf.for %offset_cmg13_1 = %c0_cmg13_1 to %c1024_cmg13_1 step %c64_cmg13_1 { + %src_offset_cmg13_1 = arith.muli %offset_cmg13_1, %c2_cmg13_1 : index + %vec_cmg13_1 = pto.vlds %ub_in_cmg13_1[%src_offset_cmg13_1] : !pto.ptr -> !pto.vreg<128xui16> + %out_cmg13_1 = pto.vzunpack %vec_cmg13_1, %part_cmg13_1 : !pto.vreg<128xui16> -> !pto.vreg<64xui32> + pto.vsts %out_cmg13_1, %ub_out_cmg13_1[%offset_cmg13_1], %store_mask_cmg13_1 : !pto.vreg<64xui32>, !pto.ptr, !pto.mask + } + } + + pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] + pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] + pto.mte_ub_gm %ub_out_cmg13_1, %gm_out_cmg13_1, %c128_i64_cmg13_1 + nburst(%c32_i64_cmg13_1, %c128_i64_cmg13_1, %c128_i64_cmg13_1) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64 + pto.barrier #pto.pipe + } return } } diff --git a/test/vpto/cases/micro-op/rearrangement/vusqz-nontrivial-mask/compare.py b/test/vpto/cases/micro-op/rearrangement/vusqz-nontrivial-mask/compare.py deleted file mode 100644 index c5d68b8e4..000000000 --- a/test/vpto/cases/micro-op/rearrangement/vusqz-nontrivial-mask/compare.py +++ /dev/null @@ -1,36 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) 
2026 Huawei Technologies Co., Ltd. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. - -# case: micro-op/rearrangement/vusqz-nontrivial-mask -# family: rearrangement -# target_ops: pto.vusqz -# scenarios: predicate-driven-rearrangement, prefix-count - -import sys -import numpy as np - - -def main() -> None: - golden = np.fromfile("golden_v3.bin", dtype=np.int32) - output = np.fromfile("v3.bin", dtype=np.int32) - if golden.shape != output.shape: - print(f"[ERROR] Shape mismatch: {golden.shape} vs {output.shape}") - sys.exit(2) - if not np.array_equal(golden, output): - diff = np.nonzero(golden != output)[0] - idx = int(diff[0]) if diff.size else 0 - print( - f"[ERROR] Mismatch at idx={idx}: golden={int(golden[idx])} out={int(output[idx])}" - ) - sys.exit(2) - print("[INFO] compare passed") - - -if __name__ == "__main__": - main() diff --git a/test/vpto/cases/micro-op/rearrangement/vusqz-nontrivial-mask/golden.py b/test/vpto/cases/micro-op/rearrangement/vusqz-nontrivial-mask/golden.py deleted file mode 100644 index 81fc36cbc..000000000 --- a/test/vpto/cases/micro-op/rearrangement/vusqz-nontrivial-mask/golden.py +++ /dev/null @@ -1,69 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) 2026 Huawei Technologies Co., Ltd. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. 
You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. - -# case: micro-op/rearrangement/vusqz-nontrivial-mask -# family: rearrangement -# target_ops: pto.vusqz -# scenarios: predicate-driven-rearrangement, prefix-count - -import argparse -from pathlib import Path - -import numpy as np - - -ROWS = 32 -COLS = 32 -LANES = 64 -BLOCKS = ROWS * COLS // LANES -ACTIVE_POSITIONS = [1, 4, 5, 9, 12, 16, 21, 24, 29, 33, 36, 40, 45, 49, 54, 60] -SEED = 19 - - -def build_case() -> tuple[np.ndarray, np.ndarray, np.ndarray]: - src = np.zeros((BLOCKS, LANES), dtype=np.int32) - mask_seed = np.full((BLOCKS, LANES), -1.0, dtype=np.float32) - out = np.zeros((BLOCKS, LANES), dtype=np.int32) - - for block in range(BLOCKS): - src[block] = np.arange(block * 1000 + 7, block * 1000 + 7 + LANES, dtype=np.int32) - for pos in ACTIVE_POSITIONS: - mask_seed[block, pos] = 1.0 - active_count = 0 - out[block, 0] = 0 - for lane in range(1, LANES): - if mask_seed[block, lane - 1] > 0.0: - active_count += 1 - out[block, lane] = active_count - - return src.reshape(ROWS, COLS), mask_seed.reshape(ROWS, COLS), out.reshape(ROWS, COLS) - - -def generate(output_dir: Path) -> None: - src, mask_seed, out = build_case() - output_dir.mkdir(parents=True, exist_ok=True) - src.reshape(-1).tofile(output_dir / "v1.bin") - mask_seed.reshape(-1).tofile(output_dir / "v2.bin") - out.reshape(-1).tofile(output_dir / "golden_v3.bin") - np.zeros_like(out.reshape(-1)).tofile(output_dir / "v3.bin") - - -def main() -> None: - parser = argparse.ArgumentParser( - description="Generate vusqz nontrivial prefix-count inputs/golden." 
- ) - parser.add_argument("--output-dir", type=Path, default=Path(".")) - parser.add_argument("--seed", type=int, default=SEED) - args = parser.parse_args() - del args.seed - generate(args.output_dir) - - -if __name__ == "__main__": - main() diff --git a/test/vpto/cases/micro-op/rearrangement/vusqz-nontrivial-mask/kernel.pto b/test/vpto/cases/micro-op/rearrangement/vusqz-nontrivial-mask/kernel.pto deleted file mode 100644 index 440685449..000000000 --- a/test/vpto/cases/micro-op/rearrangement/vusqz-nontrivial-mask/kernel.pto +++ /dev/null @@ -1,59 +0,0 @@ -// ----------------------------------------------------------------------------- -// case: micro-op/rearrangement/vusqz-nontrivial-mask -// family: rearrangement -// target_ops: pto.vusqz -// scenarios: predicate-driven-rearrangement, prefix-count -// ----------------------------------------------------------------------------- -module attributes {pto.target_arch = "a5", pto.kernel_kind = #pto.kernel_kind} { - func.func @vusqz_nontrivial_mask_kernel_2d(%arg0: !pto.ptr, - %arg1: !pto.ptr, - %arg2: !pto.ptr) attributes {pto.kernel} { - %c0 = arith.constant 0 : index - %c1 = arith.constant 1 : index - %c64 = arith.constant 64 : index - %c1024 = arith.constant 1024 : index - - %c0_i64 = arith.constant 0 : i64 - %c1_i64 = arith.constant 1 : i64 - %c32_i64 = arith.constant 32 : i64 - %c64_i32 = arith.constant 64 : i32 - %c128_i64 = arith.constant 128 : i64 - %c4096_i64 = arith.constant 4096 : i64 - %c8192_i64 = arith.constant 8192 : i64 - %zero_f32 = arith.constant 0.0 : f32 - - %ub_src = pto.castptr %c0_i64 : i64 -> !pto.ptr - %ub_mask_seed = pto.castptr %c4096_i64 : i64 -> !pto.ptr - %ub_out = pto.castptr %c8192_i64 : i64 -> !pto.ptr - - %false = arith.constant false - pto.mte_gm_ub %arg0, %ub_src, %c0_i64, %c128_i64 - nburst(%c32_i64, %c128_i64, %c128_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 - pto.mte_gm_ub %arg1, %ub_mask_seed, %c0_i64, %c128_i64 - nburst(%c32_i64, %c128_i64, %c128_i64) - : !pto.ptr, 
!pto.ptr, i64, i64, i64, i64, i64 - - pto.set_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] - pto.wait_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] - - pto.vecscope { - scf.for %offset = %c0 to %c1024 step %c64 { - %store_mask, %unused = pto.plt_b32 %c64_i32 : i32 -> !pto.mask, i32 - %src = pto.vlds %ub_src[%offset] : !pto.ptr -> !pto.vreg<64xi32> - %mask_seed = pto.vlds %ub_mask_seed[%offset] : !pto.ptr -> !pto.vreg<64xf32> - %place = pto.vcmps %mask_seed, %zero_f32, %store_mask, "gt" : !pto.vreg<64xf32>, f32, !pto.mask -> !pto.mask - %out = pto.vusqz %src, %place : !pto.vreg<64xi32>, !pto.mask -> !pto.vreg<64xi32> - pto.vsts %out, %ub_out[%offset], %store_mask : !pto.vreg<64xi32>, !pto.ptr, !pto.mask - } - } - - pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.mte_ub_gm %ub_out, %arg2, %c128_i64 - nburst(%c32_i64, %c128_i64, %c128_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64 - pto.barrier #pto.pipe - return - } -} diff --git a/test/vpto/cases/micro-op/rearrangement/vusqz-nontrivial-mask/launch.cpp b/test/vpto/cases/micro-op/rearrangement/vusqz-nontrivial-mask/launch.cpp deleted file mode 100644 index e9edd85e3..000000000 --- a/test/vpto/cases/micro-op/rearrangement/vusqz-nontrivial-mask/launch.cpp +++ /dev/null @@ -1,55 +0,0 @@ -// Copyright (c) 2026 Huawei Technologies Co., Ltd. -// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). -// Please refer to the License for details. You may not use this file except in compliance with the License. -// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -// See LICENSE in the root of the software repository for the full text of the License. 
- -// ----------------------------------------------------------------------------- -// case: micro-op/rearrangement/vusqz-nontrivial-mask -// family: rearrangement -// target_ops: pto.vusqz -// scenarios: predicate-driven-rearrangement, prefix-count -// ----------------------------------------------------------------------------- -#ifndef __VEC_SCOPE__ -#define __VEC_SCOPE__ -#endif - -#if defined(__CCE_AICORE__) && defined(__NPU_ARCH__) && (__NPU_ARCH__ == 2201) -typedef struct { unsigned char v; } hifloat8_t; -typedef struct { unsigned char v; } float8_e4m3_t; -typedef struct { unsigned char v; } float8_e5m2_t; -typedef struct { unsigned char v; } float8_e8m0_t; -typedef struct { unsigned char v; } float4_e1m2x2_t; -typedef struct { unsigned char v; } float4_e2m1x2_t; -#endif -#include - -#if defined(__CCE_AICORE__) && defined(PTOAS_ENABLE_CCE_PRINT) -#include -#endif - -#if !defined(__CCE_AICORE__) && !defined(TMRGSORT_HPP) -struct MrgSortExecutedNumList { - uint16_t mrgSortList0; - uint16_t mrgSortList1; - uint16_t mrgSortList2; - uint16_t mrgSortList3; -}; -#endif -#ifndef __CPU_SIM -#include "acl/acl.h" -#endif - -extern "C" __global__ [aicore] void vusqz_nontrivial_mask_kernel_2d(__gm__ int32_t *v1, - __gm__ float *v2, - __gm__ int32_t *v3); - -void LaunchVusqz_nontrivial_mask_kernel_2d(int32_t *v1, - float *v2, - int32_t *v3, - void *stream) { - vusqz_nontrivial_mask_kernel_2d<<<1, nullptr, stream>>>( - (__gm__ int32_t *)v1, (__gm__ float *)v2, (__gm__ int32_t *)v3); -} diff --git a/test/vpto/cases/micro-op/rearrangement/vusqz-nontrivial-mask/main.cpp b/test/vpto/cases/micro-op/rearrangement/vusqz-nontrivial-mask/main.cpp deleted file mode 100644 index 50190e7f3..000000000 --- a/test/vpto/cases/micro-op/rearrangement/vusqz-nontrivial-mask/main.cpp +++ /dev/null @@ -1,103 +0,0 @@ -// Copyright (c) 2026 Huawei Technologies Co., Ltd. 
-// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). -// Please refer to the License for details. You may not use this file except in compliance with the License. -// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -// See LICENSE in the root of the software repository for the full text of the License. - -// ----------------------------------------------------------------------------- -// case: micro-op/rearrangement/vusqz-nontrivial-mask -// family: rearrangement -// target_ops: pto.vusqz -// scenarios: predicate-driven-rearrangement, prefix-count -// ----------------------------------------------------------------------------- -#include "test_common.h" -#include "acl/acl.h" -#include -#include -#include -#include - -using namespace PtoTestCommon; - -#define ACL_CHECK(expr) \ - do { \ - const aclError _ret = (expr); \ - if (_ret != ACL_SUCCESS) { \ - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", #expr, (int)_ret, \ - __FILE__, __LINE__); \ - const char *_recent = aclGetRecentErrMsg(); \ - if (_recent != nullptr && _recent[0] != '\0') \ - std::fprintf(stderr, "[ERROR] RecentErrMsg: %s\n", _recent); \ - rc = 1; \ - goto cleanup; \ - } \ - } while (0) - -void LaunchVusqz_nontrivial_mask_kernel_2d(int32_t *v1, - float *v2, - int32_t *v3, - void *stream); - -int main() { - constexpr size_t elemCount = 1024; - size_t fileSizeV1 = elemCount * sizeof(int32_t); - size_t fileSizeV2 = elemCount * sizeof(float); - size_t fileSizeV3 = elemCount * sizeof(int32_t); - int32_t *v1Host = nullptr; - int32_t *v1Device = nullptr; - float *v2Host = nullptr; - float *v2Device = nullptr; - int32_t *v3Host = nullptr; - int32_t *v3Device = nullptr; - - int rc = 0; - bool aclInited = false; - 
bool deviceSet = false; - int deviceId = 0; - aclrtStream stream = nullptr; - - ACL_CHECK(aclInit(nullptr)); - aclInited = true; - if (const char *envDevice = std::getenv("ACL_DEVICE_ID")) - deviceId = std::atoi(envDevice); - ACL_CHECK(aclrtSetDevice(deviceId)); - deviceSet = true; - ACL_CHECK(aclrtCreateStream(&stream)); - - ACL_CHECK(aclrtMallocHost((void **)(&v1Host), fileSizeV1)); - ACL_CHECK(aclrtMallocHost((void **)(&v2Host), fileSizeV2)); - ACL_CHECK(aclrtMallocHost((void **)(&v3Host), fileSizeV3)); - ACL_CHECK(aclrtMalloc((void **)&v1Device, fileSizeV1, ACL_MEM_MALLOC_HUGE_FIRST)); - ACL_CHECK(aclrtMalloc((void **)&v2Device, fileSizeV2, ACL_MEM_MALLOC_HUGE_FIRST)); - ACL_CHECK(aclrtMalloc((void **)&v3Device, fileSizeV3, ACL_MEM_MALLOC_HUGE_FIRST)); - - ReadFile("./v1.bin", fileSizeV1, v1Host, fileSizeV1); - ReadFile("./v2.bin", fileSizeV2, v2Host, fileSizeV2); - std::fill_n(v3Host, elemCount, 0); - ACL_CHECK(aclrtMemcpy(v1Device, fileSizeV1, v1Host, fileSizeV1, ACL_MEMCPY_HOST_TO_DEVICE)); - ACL_CHECK(aclrtMemcpy(v2Device, fileSizeV2, v2Host, fileSizeV2, ACL_MEMCPY_HOST_TO_DEVICE)); - ACL_CHECK(aclrtMemcpy(v3Device, fileSizeV3, v3Host, fileSizeV3, ACL_MEMCPY_HOST_TO_DEVICE)); - - LaunchVusqz_nontrivial_mask_kernel_2d(v1Device, v2Device, v3Device, stream); - - ACL_CHECK(aclrtSynchronizeStream(stream)); - ACL_CHECK(aclrtMemcpy(v3Host, fileSizeV3, v3Device, fileSizeV3, ACL_MEMCPY_DEVICE_TO_HOST)); - WriteFile("./v3.bin", v3Host, fileSizeV3); - -cleanup: - aclrtFree(v1Device); - aclrtFree(v2Device); - aclrtFree(v3Device); - aclrtFreeHost(v1Host); - aclrtFreeHost(v2Host); - aclrtFreeHost(v3Host); - if (stream != nullptr) - (void)aclrtDestroyStream(stream); - if (deviceSet) - (void)aclrtResetDevice(deviceId); - if (aclInited) - (void)aclFinalize(); - return rc; -} diff --git a/test/vpto/cases/micro-op/rearrangement/vusqz/kernel.pto b/test/vpto/cases/micro-op/rearrangement/vusqz/kernel.pto index 7fa0e91db..8e9210b8c 100644 --- 
a/test/vpto/cases/micro-op/rearrangement/vusqz/kernel.pto +++ b/test/vpto/cases/micro-op/rearrangement/vusqz/kernel.pto @@ -1,59 +1,106 @@ -// ----------------------------------------------------------------------------- -// case: micro-op/rearrangement/vusqz -// family: rearrangement -// target_ops: pto.vusqz -// scenarios: predicate-driven-rearrangement, prefix-count -// ----------------------------------------------------------------------------- module attributes {pto.target_arch = "a5", pto.kernel_kind = #pto.kernel_kind} { - func.func @vusqz_kernel_2d(%arg0: !pto.ptr, - %arg1: !pto.ptr, - %arg2: !pto.ptr) attributes {pto.kernel} { - %c0 = arith.constant 0 : index - %c1 = arith.constant 1 : index - %c64 = arith.constant 64 : index - %c1024 = arith.constant 1024 : index - - %c0_i64 = arith.constant 0 : i64 - %c1_i64 = arith.constant 1 : i64 - %c32_i64 = arith.constant 32 : i64 - %c64_i32 = arith.constant 64 : i32 - %c128_i64 = arith.constant 128 : i64 - %c4096_i64 = arith.constant 4096 : i64 - %c8192_i64 = arith.constant 8192 : i64 - %zero_f32 = arith.constant 0.0 : f32 - - %ub_src = pto.castptr %c0_i64 : i64 -> !pto.ptr - %ub_mask_seed = pto.castptr %c4096_i64 : i64 -> !pto.ptr - %ub_out = pto.castptr %c8192_i64 : i64 -> !pto.ptr - - %false = arith.constant false - pto.mte_gm_ub %arg0, %ub_src, %c0_i64, %c128_i64 - nburst(%c32_i64, %c128_i64, %c128_i64) + func.func @vusqz_deep_merged_kernel(%arg0: !pto.ptr, %arg1: !pto.ptr, %arg2: !pto.ptr, %arg3: !pto.ptr, %arg4: !pto.ptr, %arg5: !pto.ptr) attributes {pto.kernel} { + %__deep_merge_guard = arith.constant false + // inactive merged from vusqz_nontrivial_mask_kernel_2d + scf.if %__deep_merge_guard { + + %c0_m0 = arith.constant 0 : index + %c1_m0 = arith.constant 1 : index + %c64_m0 = arith.constant 64 : index + %c1024_m0 = arith.constant 1024 : index + + %c0_i64_m0 = arith.constant 0 : i64 + %c1_i64_m0 = arith.constant 1 : i64 + %c32_i64_m0 = arith.constant 32 : i64 + %c64_i32_m0 = arith.constant 64 : i32 + 
%c128_i64_m0 = arith.constant 128 : i64 + %c4096_i64_m0 = arith.constant 4096 : i64 + %c8192_i64_m0 = arith.constant 8192 : i64 + %zero_f32_m0 = arith.constant 0.0 : f32 + + %ub_src_m0 = pto.castptr %c0_i64_m0 : i64 -> !pto.ptr + %ub_mask_seed_m0 = pto.castptr %c4096_i64_m0 : i64 -> !pto.ptr + %ub_out_m0 = pto.castptr %c8192_i64_m0 : i64 -> !pto.ptr + + %false_m0 = arith.constant false + pto.mte_gm_ub %arg0, %ub_src_m0, %c0_i64_m0, %c128_i64_m0 + nburst(%c32_i64_m0, %c128_i64_m0, %c128_i64_m0) : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 - pto.mte_gm_ub %arg1, %ub_mask_seed, %c0_i64, %c128_i64 - nburst(%c32_i64, %c128_i64, %c128_i64) + pto.mte_gm_ub %arg1, %ub_mask_seed_m0, %c0_i64_m0, %c128_i64_m0 + nburst(%c32_i64_m0, %c128_i64_m0, %c128_i64_m0) : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 pto.set_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] pto.wait_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] pto.vecscope { - scf.for %offset = %c0 to %c1024 step %c64 { - %store_mask, %unused = pto.plt_b32 %c64_i32 : i32 -> !pto.mask, i32 - %src = pto.vlds %ub_src[%offset] : !pto.ptr -> !pto.vreg<64xi32> - %mask_seed = pto.vlds %ub_mask_seed[%offset] : !pto.ptr -> !pto.vreg<64xf32> - %place = pto.vcmps %mask_seed, %zero_f32, %store_mask, "gt" : !pto.vreg<64xf32>, f32, !pto.mask -> !pto.mask - %out = pto.vusqz %src, %place : !pto.vreg<64xi32>, !pto.mask -> !pto.vreg<64xi32> - pto.vsts %out, %ub_out[%offset], %store_mask : !pto.vreg<64xi32>, !pto.ptr, !pto.mask + scf.for %offset_m0 = %c0_m0 to %c1024_m0 step %c64_m0 { + %store_mask_m0, %unused_m0 = pto.plt_b32 %c64_i32_m0 : i32 -> !pto.mask, i32 + %src_m0 = pto.vlds %ub_src_m0[%offset_m0] : !pto.ptr -> !pto.vreg<64xi32> + %mask_seed_m0 = pto.vlds %ub_mask_seed_m0[%offset_m0] : !pto.ptr -> !pto.vreg<64xf32> + %place_m0 = pto.vcmps %mask_seed_m0, %zero_f32_m0, %store_mask_m0, "gt" : !pto.vreg<64xf32>, f32, !pto.mask -> !pto.mask + %out_m0 = pto.vusqz %src_m0, %place_m0 : !pto.vreg<64xi32>, !pto.mask -> !pto.vreg<64xi32> + pto.vsts 
%out_m0, %ub_out_m0[%offset_m0], %store_mask_m0 : !pto.vreg<64xi32>, !pto.ptr, !pto.mask } } pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.mte_ub_gm %ub_out, %arg2, %c128_i64 - nburst(%c32_i64, %c128_i64, %c128_i64) + pto.mte_ub_gm %ub_out_m0, %arg2, %c128_i64_m0 + nburst(%c32_i64_m0, %c128_i64_m0, %c128_i64_m0) : !pto.ptr, !pto.ptr, i64, i64, i64, i64 pto.barrier #pto.pipe + + } + // active merged from vusqz_kernel_2d + + %c0_m1 = arith.constant 0 : index + %c1_m1 = arith.constant 1 : index + %c64_m1 = arith.constant 64 : index + %c1024_m1 = arith.constant 1024 : index + + %c0_i64_m1 = arith.constant 0 : i64 + %c1_i64_m1 = arith.constant 1 : i64 + %c32_i64_m1 = arith.constant 32 : i64 + %c64_i32_m1 = arith.constant 64 : i32 + %c128_i64_m1 = arith.constant 128 : i64 + %c4096_i64_m1 = arith.constant 4096 : i64 + %c8192_i64_m1 = arith.constant 8192 : i64 + %zero_f32_m1 = arith.constant 0.0 : f32 + + %ub_src_m1 = pto.castptr %c0_i64_m1 : i64 -> !pto.ptr + %ub_mask_seed_m1 = pto.castptr %c4096_i64_m1 : i64 -> !pto.ptr + %ub_out_m1 = pto.castptr %c8192_i64_m1 : i64 -> !pto.ptr + + %false_m1 = arith.constant false + pto.mte_gm_ub %arg3, %ub_src_m1, %c0_i64_m1, %c128_i64_m1 + nburst(%c32_i64_m1, %c128_i64_m1, %c128_i64_m1) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 + pto.mte_gm_ub %arg4, %ub_mask_seed_m1, %c0_i64_m1, %c128_i64_m1 + nburst(%c32_i64_m1, %c128_i64_m1, %c128_i64_m1) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 + + pto.set_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] + pto.wait_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] + + pto.vecscope { + scf.for %offset_m1 = %c0_m1 to %c1024_m1 step %c64_m1 { + %store_mask_m1, %unused_m1 = pto.plt_b32 %c64_i32_m1 : i32 -> !pto.mask, i32 + %src_m1 = pto.vlds %ub_src_m1[%offset_m1] : !pto.ptr -> !pto.vreg<64xi32> + %mask_seed_m1 = pto.vlds %ub_mask_seed_m1[%offset_m1] : !pto.ptr -> !pto.vreg<64xf32> + %place_m1 = pto.vcmps %mask_seed_m1, %zero_f32_m1, 
%store_mask_m1, "gt" : !pto.vreg<64xf32>, f32, !pto.mask -> !pto.mask + %out_m1 = pto.vusqz %src_m1, %place_m1 : !pto.vreg<64xi32>, !pto.mask -> !pto.vreg<64xi32> + pto.vsts %out_m1, %ub_out_m1[%offset_m1], %store_mask_m1 : !pto.vreg<64xi32>, !pto.ptr, !pto.mask + } + } + + pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] + pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] + pto.mte_ub_gm %ub_out_m1, %arg5, %c128_i64_m1 + nburst(%c32_i64_m1, %c128_i64_m1, %c128_i64_m1) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64 + pto.barrier #pto.pipe + return } } diff --git a/test/vpto/cases/micro-op/rearrangement/vusqz/launch.cpp b/test/vpto/cases/micro-op/rearrangement/vusqz/launch.cpp index 3684fe9b2..345afab74 100644 --- a/test/vpto/cases/micro-op/rearrangement/vusqz/launch.cpp +++ b/test/vpto/cases/micro-op/rearrangement/vusqz/launch.cpp @@ -5,17 +5,9 @@ // THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, // INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. // See LICENSE in the root of the software repository for the full text of the License. 
- -// ----------------------------------------------------------------------------- -// case: micro-op/rearrangement/vusqz -// family: rearrangement -// target_ops: pto.vusqz -// scenarios: predicate-driven-rearrangement, prefix-count -// ----------------------------------------------------------------------------- #ifndef __VEC_SCOPE__ #define __VEC_SCOPE__ #endif - #if defined(__CCE_AICORE__) && defined(__NPU_ARCH__) && (__NPU_ARCH__ == 2201) typedef struct { unsigned char v; } hifloat8_t; typedef struct { unsigned char v; } float8_e4m3_t; @@ -25,28 +17,30 @@ typedef struct { unsigned char v; } float4_e1m2x2_t; typedef struct { unsigned char v; } float4_e2m1x2_t; #endif #include - #if defined(__CCE_AICORE__) && defined(PTOAS_ENABLE_CCE_PRINT) #include #endif - #if !defined(__CCE_AICORE__) && !defined(TMRGSORT_HPP) -struct MrgSortExecutedNumList { - uint16_t mrgSortList0; - uint16_t mrgSortList1; - uint16_t mrgSortList2; - uint16_t mrgSortList3; -}; +struct MrgSortExecutedNumList { uint16_t mrgSortList0,mrgSortList1,mrgSortList2,mrgSortList3; }; #endif #ifndef __CPU_SIM #include "acl/acl.h" #endif -extern "C" __global__ [aicore] void vusqz_kernel_2d(__gm__ int32_t *v1, - __gm__ float *v2, - __gm__ int32_t *v3); +extern "C" __global__ [aicore] void vusqz_deep_merged_kernel( + __gm__ int32_t * arg0, + __gm__ float * arg1, + __gm__ int32_t * arg2, + __gm__ int32_t * arg3, + __gm__ float * arg4, + __gm__ int32_t * arg5); -void LaunchVusqz_kernel_2d(int32_t *v1, float *v2, int32_t *v3, void *stream) { - vusqz_kernel_2d<<<1, nullptr, stream>>>( - (__gm__ int32_t *)v1, (__gm__ float *)v2, (__gm__ int32_t *)v3); +void LaunchVusqzDeepMerged(int32_t * p0, float * p1, int32_t * p2, void *stream) { + vusqz_deep_merged_kernel<<<1, nullptr, stream>>>( + (__gm__ int32_t *)p0, + (__gm__ float *)p0, + (__gm__ int32_t *)p0, + (__gm__ int32_t *)p0, + (__gm__ float *)p1, + (__gm__ int32_t *)p2); } diff --git a/test/vpto/cases/micro-op/rearrangement/vusqz/main.cpp 
b/test/vpto/cases/micro-op/rearrangement/vusqz/main.cpp index 9da958163..4008edc74 100644 --- a/test/vpto/cases/micro-op/rearrangement/vusqz/main.cpp +++ b/test/vpto/cases/micro-op/rearrangement/vusqz/main.cpp @@ -35,8 +35,8 @@ using namespace PtoTestCommon; } \ } while (0) -void LaunchVusqz_kernel_2d(int32_t *v1, float *v2, int32_t *v3, void *stream); +void LaunchVusqzDeepMerged(int32_t * p0, float * p1, int32_t * p2, void *stream); int main() { constexpr size_t elemCount = 1024; size_t fileSizeV1 = elemCount * sizeof(int32_t); @@ -77,7 +77,7 @@ int main() { ACL_CHECK(aclrtMemcpy(v2Device, fileSizeV2, v2Host, fileSizeV2, ACL_MEMCPY_HOST_TO_DEVICE)); ACL_CHECK(aclrtMemcpy(v3Device, fileSizeV3, v3Host, fileSizeV3, ACL_MEMCPY_HOST_TO_DEVICE)); - LaunchVusqz_kernel_2d(v1Device, v2Device, v3Device, stream); + LaunchVusqzDeepMerged(v1Device, v2Device, v3Device, stream); ACL_CHECK(aclrtSynchronizeStream(stream)); ACL_CHECK(aclrtMemcpy(v3Host, fileSizeV3, v3Device, fileSizeV3, ACL_MEMCPY_DEVICE_TO_HOST)); diff --git a/test/vpto/cases/micro-op/rearrangement/vzunpack/compare.py b/test/vpto/cases/micro-op/rearrangement/vzunpack/compare.py deleted file mode 100755 index 0dc97cc35..000000000 --- a/test/vpto/cases/micro-op/rearrangement/vzunpack/compare.py +++ /dev/null @@ -1,209 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) 2026 Huawei Technologies Co., Ltd. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. 
- -# case: micro-op/rearrangement/vzunpack -# family: rearrangement -# target_ops: pto.vzunpack -# scenarios: pack-unpack, zero-extend -# NOTE: bulk-generated coverage skeleton. -# coding=utf-8 - -import os -import sys -import numpy as np - - -REPEAT_BYTES = 256 - - -def _ceil_div(x, y): - return (x + y - 1) // y - - -def _packed_pred_storage_bytes(logical_elems, src_elem_bytes): - logical_elems = int(logical_elems) - src_elem_bytes = int(src_elem_bytes) - if logical_elems <= 0: - raise ValueError(f"logical_elems must be > 0, got {logical_elems}") - if src_elem_bytes not in (1, 2, 4): - raise ValueError(f"unsupported packed predicate source size: {src_elem_bytes}") - - repeat_elems = REPEAT_BYTES // src_elem_bytes - if src_elem_bytes == 4: - repeat_times = _ceil_div(logical_elems, repeat_elems) + 1 - loop_count = repeat_times // 2 - return loop_count * 16 - - repeat_times = _ceil_div(logical_elems, repeat_elems) - return repeat_times * (repeat_elems // 8) - - -def compare_bin(golden_path, output_path, dtype, eps): - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - dtype_np = np.dtype(dtype) - golden = np.fromfile(golden_path, dtype=dtype_np) - output = np.fromfile(output_path, dtype=dtype_np) - if golden.shape != output.shape: - print(f"[ERROR] Shape mismatch: {golden_path} {golden.shape} vs {output_path} {output.shape}") - return False - if not np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True): - if golden.size: - if np.issubdtype(dtype_np, np.floating): - g = golden.astype(np.float64, copy=False) - o = output.astype(np.float64, copy=False) - elif np.issubdtype(dtype_np, np.integer) or np.issubdtype(dtype_np, np.unsignedinteger): - g = golden.astype(np.int64, copy=False) - o = output.astype(np.int64, copy=False) - else: - g = golden.astype(np.float64, copy=False) - o = output.astype(np.float64, 
copy=False) - abs_diff = np.abs(g - o) - idx = int(np.argmax(abs_diff)) - diff = float(abs_diff[idx]) - print( - f"[ERROR] Mismatch: {golden_path} vs {output_path}, max diff={diff} at idx={idx} " - f"(golden={g[idx]}, out={o[idx]}, dtype={dtype_np})" - ) - else: - print(f"[ERROR] Mismatch: {golden_path} vs {output_path}, empty buffers, dtype={dtype_np}") - return False - return True - - -def compare_bin_prefix(golden_path, output_path, dtype, eps, count): - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - try: - count = int(count) - except Exception: - print(f"[ERROR] Invalid prefix count: {count}") - return False - if count <= 0: - print(f"[ERROR] Invalid prefix count: {count}") - return False - - dtype_np = np.dtype(dtype) - golden = np.fromfile(golden_path, dtype=dtype_np, count=count) - output = np.fromfile(output_path, dtype=dtype_np, count=count) - - if golden.size != count or output.size != count: - print( - f"[ERROR] Prefix read too small: need={count} elems, " - f"golden={golden.size}, out={output.size}" - ) - return False - - if not np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True): - if golden.size: - if np.issubdtype(dtype_np, np.floating): - g = golden.astype(np.float64, copy=False) - o = output.astype(np.float64, copy=False) - elif np.issubdtype(dtype_np, np.integer) or np.issubdtype(dtype_np, np.unsignedinteger): - g = golden.astype(np.int64, copy=False) - o = output.astype(np.int64, copy=False) - else: - g = golden.astype(np.float64, copy=False) - o = output.astype(np.float64, copy=False) - abs_diff = np.abs(g - o) - idx = int(np.argmax(abs_diff)) - diff = float(abs_diff[idx]) - print( - f"[ERROR] Mismatch (prefix): {golden_path} vs {output_path}, max diff={diff} at idx={idx} " - f"(golden={g[idx]}, out={o[idx]}, dtype={dtype_np}, count={count})" - ) - else: - print(f"[ERROR] 
Mismatch (prefix): {golden_path} vs {output_path}, empty buffers, dtype={dtype_np}") - return False - return True - - -def compare_packed_pred_mask(golden_path, output_path, logical_elems, src_elem_bytes): - """ - Compare outputs of pto.tcmp / pto.tcmps. - - PTO-ISA stores packed predicate results as a linear PK byte stream via - `psts`, with the exact written prefix length determined by the typed - TCMP/TCMPS repeat schedule. Compare only that semantic prefix. - """ - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - try: - logical_elems = int(logical_elems) - src_elem_bytes = int(src_elem_bytes) - except Exception: - print( - "[ERROR] Invalid packed mask compare arguments: " - f"logical_elems={logical_elems} src_elem_bytes={src_elem_bytes}" - ) - return False - if logical_elems <= 0 or src_elem_bytes <= 0: - print( - "[ERROR] Invalid packed mask compare arguments: " - f"logical_elems={logical_elems} src_elem_bytes={src_elem_bytes}" - ) - return False - - golden = np.fromfile(golden_path, dtype=np.uint8) - output = np.fromfile(output_path, dtype=np.uint8) - try: - prefix_bytes = _packed_pred_storage_bytes(logical_elems, src_elem_bytes) - except ValueError as exc: - print(f"[ERROR] {exc}") - return False - - if golden.size < prefix_bytes or output.size < prefix_bytes: - print( - f"[ERROR] Packed mask buffer too small: need={prefix_bytes} bytes, " - f"golden={golden.size}, out={output.size}" - ) - return False - - golden_sel = golden[:prefix_bytes] - output_sel = output[:prefix_bytes] - - if not np.array_equal(golden_sel, output_sel): - diff = np.nonzero(golden_sel != output_sel)[0] - idx = int(diff[0]) if diff.size else 0 - print( - f"[ERROR] Mismatch (packed mask): {golden_path} vs {output_path}, first diff at idx={idx} " - f"(golden={int(golden_sel[idx])}, out={int(output_sel[idx])}, " - 
f"logical_elems={logical_elems}, src_elem_bytes={src_elem_bytes}, prefix_bytes={prefix_bytes})" - ) - return False - return True - - -def main(): - strict = os.getenv("COMPARE_STRICT", "1") != "0" - ok = True - ok = compare_bin("golden_v2.bin", "v2.bin", np.uint32, 0.0) and ok - if not ok: - if strict: - print("[ERROR] compare failed") - sys.exit(2) - print("[WARN] compare failed (non-gating)") - return - print("[INFO] compare passed") - - -if __name__ == "__main__": - main() diff --git a/test/vpto/cases/micro-op/rearrangement/vzunpack/golden.py b/test/vpto/cases/micro-op/rearrangement/vzunpack/golden.py deleted file mode 100755 index e6014e397..000000000 --- a/test/vpto/cases/micro-op/rearrangement/vzunpack/golden.py +++ /dev/null @@ -1,54 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) 2026 Huawei Technologies Co., Ltd. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. - -# case: micro-op/rearrangement/vzunpack -# family: rearrangement -# target_ops: pto.vzunpack -# scenarios: pack-unpack, zero-extend -# NOTE: zero-extending unpack of the lower half of each 128-lane ui16 chunk. 
-# coding=utf-8 - -import argparse -from pathlib import Path - -import numpy as np - - -INPUT_ELEMS = 2048 -OUTPUT_ELEMS = 1024 -SEED = 19 - - -def generate(output_dir: Path, seed: int) -> None: - rng = np.random.default_rng(seed) - v1 = rng.integers(0, np.iinfo(np.uint16).max + 1, size=INPUT_ELEMS, dtype=np.uint16) - v2 = np.zeros(OUTPUT_ELEMS, dtype=np.uint32) - golden_v2 = np.zeros(OUTPUT_ELEMS, dtype=np.uint32) - for src_base in range(0, INPUT_ELEMS, 128): - dst_base = (src_base // 128) * 64 - golden_v2[dst_base : dst_base + 64] = v1[src_base : src_base + 64].astype(np.uint32) - - output_dir.mkdir(parents=True, exist_ok=True) - v1.tofile(output_dir / "v1.bin") - v2.tofile(output_dir / "v2.bin") - golden_v2.tofile(output_dir / "golden_v2.bin") - - -def main() -> None: - parser = argparse.ArgumentParser( - description="Generate numpy-based inputs/golden for VPTO micro-op vzunpack validation." - ) - parser.add_argument("--output-dir", type=Path, default=Path(".")) - parser.add_argument("--seed", type=int, default=SEED) - args = parser.parse_args() - generate(args.output_dir, args.seed) - - -if __name__ == "__main__": - main() diff --git a/test/vpto/cases/micro-op/rearrangement/vzunpack/kernel.pto b/test/vpto/cases/micro-op/rearrangement/vzunpack/kernel.pto deleted file mode 100644 index 47870c2ec..000000000 --- a/test/vpto/cases/micro-op/rearrangement/vzunpack/kernel.pto +++ /dev/null @@ -1,72 +0,0 @@ -// ----------------------------------------------------------------------------- -// case: micro-op/rearrangement/vzunpack -// family: rearrangement -// target_ops: pto.vzunpack -// scenarios: pack-unpack, zero-extend -// NOTE: bulk-generated coverage skeleton. Parser/verifier/lowering failure is -// still a valid test conclusion in the current coverage-first phase. 
-// ----------------------------------------------------------------------------- -// ============================================================================= -// abs_kernel_2d: Element-wise absolute value on a 32x32 f32 tile -// ============================================================================= -// This kernel computes abs(input) for a 32x32 float32 matrix (1024 elements). -// -// Memory Layout: -// - Input: arg0 -> GM (Global Memory) -// - Output: arg1 -> GM (Global Memory) -// - UB (Unified Buffer) at offset 0: input tile (4096 bytes = 32*32*4) -// - UB at offset 4096: output tile (4096 bytes = 32*32*4) -// -// Pipeline: -// 1. DMA: GM -> UB (MTE2 pipe) - copy input tile to UB -// 2. Sync: wait for MTE2 -> V pipe handoff -// 3. Compute: vabs on 64-element vectors (V pipe) - 16 iterations for 1024 elements -// 4. Sync: wait for V -> MTE3 pipe handoff -// 5. DMA: UB -> GM (MTE3 pipe) - copy result tile back to GM -// ============================================================================= - -module attributes {pto.target_arch = "a5", pto.kernel_kind = #pto.kernel_kind} { - func.func @vzunpack_kernel_2d(%arg0: !pto.ptr, %arg1: !pto.ptr) attributes {pto.kernel} { - %c0 = arith.constant 0 : index - %c64 = arith.constant 64 : index - %c2 = arith.constant 2 : index - %c1024 = arith.constant 1024 : index - %c0_i64 = arith.constant 0 : i64 - %c1_i64 = arith.constant 1 : i64 - %c32_i64 = arith.constant 32 : i64 - %c128_i64 = arith.constant 128 : i64 - %c4096_i64 = arith.constant 4096 : i64 - %part = arith.constant 0 : index - - %gm_in = pto.castptr %arg0 : !pto.ptr -> !pto.ptr - %gm_out = pto.castptr %arg1 : !pto.ptr -> !pto.ptr - %ub_in = pto.castptr %c0_i64 : i64 -> !pto.ptr - %ub_out = pto.castptr %c4096_i64 : i64 -> !pto.ptr - - %false = arith.constant false - pto.mte_gm_ub %gm_in, %ub_in, %c0_i64, %c128_i64 - nburst(%c32_i64, %c128_i64, %c128_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 - - pto.set_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] 
- pto.wait_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] - - pto.vecscope { - %store_mask = pto.pset_b32 "PAT_ALL" : !pto.mask - scf.for %offset = %c0 to %c1024 step %c64 { - %src_offset = arith.muli %offset, %c2 : index - %vec = pto.vlds %ub_in[%src_offset] : !pto.ptr -> !pto.vreg<128xui16> - %out = pto.vzunpack %vec, %part : !pto.vreg<128xui16> -> !pto.vreg<64xui32> - pto.vsts %out, %ub_out[%offset], %store_mask : !pto.vreg<64xui32>, !pto.ptr, !pto.mask - } - } - - pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.mte_ub_gm %ub_out, %gm_out, %c128_i64 - nburst(%c32_i64, %c128_i64, %c128_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64 - pto.barrier #pto.pipe - return - } -} diff --git a/test/vpto/cases/micro-op/rearrangement/vzunpack/launch.cpp b/test/vpto/cases/micro-op/rearrangement/vzunpack/launch.cpp deleted file mode 100644 index 7fa2a6c4b..000000000 --- a/test/vpto/cases/micro-op/rearrangement/vzunpack/launch.cpp +++ /dev/null @@ -1,70 +0,0 @@ -// Copyright (c) 2026 Huawei Technologies Co., Ltd. -// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). -// Please refer to the License for details. You may not use this file except in compliance with the License. -// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -// See LICENSE in the root of the software repository for the full text of the License. - -// ----------------------------------------------------------------------------- -// case: micro-op/rearrangement/vzunpack -// family: rearrangement -// target_ops: pto.vzunpack -// scenarios: pack-unpack, zero-extend -// NOTE: bulk-generated coverage skeleton. 
Parser/verifier/lowering failure is -// still a valid test conclusion in the current coverage-first phase. -// ----------------------------------------------------------------------------- -// --------------------------------------------------------------------------- -// PTOAS compatibility layer -// -// The upstream pto-isa headers reference some FP8/FP4 types and the -// __VEC_SCOPE__ marker that are not available on every AICore arch/toolchain -// combination (e.g. __NPU_ARCH__==2201). -// -// For our PTOAS-generated kernels we don't rely on these types today, but the -// headers still mention them in templates/static_asserts. Provide minimal -// fallbacks to keep compilation working on dav-c220. -// --------------------------------------------------------------------------- -#ifndef __VEC_SCOPE__ -#define __VEC_SCOPE__ -#endif - -#if defined(__CCE_AICORE__) && defined(__NPU_ARCH__) && (__NPU_ARCH__ == 2201) -typedef struct { unsigned char v; } hifloat8_t; -typedef struct { unsigned char v; } float8_e4m3_t; -typedef struct { unsigned char v; } float8_e5m2_t; -typedef struct { unsigned char v; } float8_e8m0_t; -typedef struct { unsigned char v; } float4_e1m2x2_t; -typedef struct { unsigned char v; } float4_e2m1x2_t; -#endif -#include - -// AICore printf support is gated behind `--cce-enable-print` on some -// toolchains. When enabled, include the CCE print header so `cce::printf` -// resolves in device compilation. -#if defined(__CCE_AICORE__) && defined(PTOAS_ENABLE_CCE_PRINT) -#include -#endif - -// Some PTO-ISA types are only available in the __CCE_AICORE__ compilation -// path, but `bisheng -xcce` still performs a host-side parse pass. -// Provide minimal fallbacks only when the corresponding header wasn't -// pulled in by the selected arch implementation. 
-#if !defined(__CCE_AICORE__) && !defined(TMRGSORT_HPP) -struct MrgSortExecutedNumList { - uint16_t mrgSortList0; - uint16_t mrgSortList1; - uint16_t mrgSortList2; - uint16_t mrgSortList3; -}; -#endif -#ifndef __CPU_SIM -#include "acl/acl.h" -#endif - -extern "C" __global__ [aicore] void vzunpack_kernel_2d(__gm__ float *v1, - __gm__ float *v2); - -void LaunchVzunpack_kernel_2d(float *v1, float *v2, void *stream) { - vzunpack_kernel_2d<<<1, nullptr, stream>>>((__gm__ float *)v1, (__gm__ float *)v2); -} diff --git a/test/vpto/cases/micro-op/rearrangement/vzunpack/main.cpp b/test/vpto/cases/micro-op/rearrangement/vzunpack/main.cpp deleted file mode 100644 index e3693855f..000000000 --- a/test/vpto/cases/micro-op/rearrangement/vzunpack/main.cpp +++ /dev/null @@ -1,132 +0,0 @@ -// Copyright (c) 2026 Huawei Technologies Co., Ltd. -// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). -// Please refer to the License for details. You may not use this file except in compliance with the License. -// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -// See LICENSE in the root of the software repository for the full text of the License. - -// ----------------------------------------------------------------------------- -// case: micro-op/rearrangement/vzunpack -// family: rearrangement -// target_ops: pto.vzunpack -// scenarios: pack-unpack, zero-extend -// NOTE: bulk-generated coverage skeleton. Parser/verifier/lowering failure is -// still a valid test conclusion in the current coverage-first phase. -// ----------------------------------------------------------------------------- -/** -Copyright (c) 2025 Huawei Technologies Co., Ltd. 
-This program is free software, you can redistribute it and/or modify it under the terms and conditions of -CANN Open Software License Agreement Version 2.0 (the "License"). -Please refer to the License for details. You may not use this file except in compliance with the License. -THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -See LICENSE in the root of the software repository for the full text of the License. -*/ - -#include "test_common.h" -#include "acl/acl.h" -#include -#include -#include - -using namespace PtoTestCommon; - -#ifndef TMRGSORT_HPP -struct MrgSortExecutedNumList { - uint16_t mrgSortList0; - uint16_t mrgSortList1; - uint16_t mrgSortList2; - uint16_t mrgSortList3; -}; -#endif - -#define ACL_CHECK(expr) \ - do { \ - const aclError _ret = (expr); \ - if (_ret != ACL_SUCCESS) { \ - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", #expr, (int)_ret, __FILE__, __LINE__); \ - const char *_recent = aclGetRecentErrMsg(); \ - if (_recent != nullptr && _recent[0] != '\0') { \ - std::fprintf(stderr, "[ERROR] RecentErrMsg: %s\n", _recent); \ - } \ - rc = 1; \ - goto cleanup; \ - } \ - } while (0) - -void LaunchVzunpack_kernel_2d(float *v1, float *v2, void *stream); - -int main() { - size_t elemCount_v1 = 2048; - size_t fileSize_v1 = elemCount_v1 * sizeof(uint16_t); - size_t elemCount_v2 = 1024; - size_t fileSize_v2 = elemCount_v2 * sizeof(uint32_t); - uint16_t *v1Host = nullptr; - uint16_t *v1Device = nullptr; - uint32_t *v2Host = nullptr; - uint32_t *v2Device = nullptr; - - int rc = 0; - bool aclInited = false; - bool deviceSet = false; - int deviceId = 0; - aclrtStream stream = nullptr; - - ACL_CHECK(aclInit(nullptr)); - aclInited = true; - if (const char *envDevice = std::getenv("ACL_DEVICE_ID")) { - deviceId = std::atoi(envDevice); - } - ACL_CHECK(aclrtSetDevice(deviceId)); - deviceSet = 
true; - ACL_CHECK(aclrtCreateStream(&stream)); - - ACL_CHECK(aclrtMallocHost((void **)(&v1Host), fileSize_v1)); - ACL_CHECK(aclrtMallocHost((void **)(&v2Host), fileSize_v2)); - ACL_CHECK(aclrtMalloc((void **)&v1Device, fileSize_v1, ACL_MEM_MALLOC_HUGE_FIRST)); - ACL_CHECK(aclrtMalloc((void **)&v2Device, fileSize_v2, ACL_MEM_MALLOC_HUGE_FIRST)); - - ReadFile("./v1.bin", fileSize_v1, v1Host, fileSize_v1); - ReadFile("./v2.bin", fileSize_v2, v2Host, fileSize_v2); - ACL_CHECK(aclrtMemcpy(v1Device, fileSize_v1, v1Host, fileSize_v1, ACL_MEMCPY_HOST_TO_DEVICE)); - ACL_CHECK(aclrtMemcpy(v2Device, fileSize_v2, v2Host, fileSize_v2, ACL_MEMCPY_HOST_TO_DEVICE)); - LaunchVzunpack_kernel_2d(reinterpret_cast(v1Device), - reinterpret_cast(v2Device), - stream); - - ACL_CHECK(aclrtSynchronizeStream(stream)); - ACL_CHECK(aclrtMemcpy(v2Host, fileSize_v2, v2Device, fileSize_v2, ACL_MEMCPY_DEVICE_TO_HOST)); - - WriteFile("./v2.bin", v2Host, fileSize_v2); - -cleanup: - aclrtFree(v1Device); - aclrtFree(v2Device); - aclrtFreeHost(v1Host); - aclrtFreeHost(v2Host); - if (stream != nullptr) { - const aclError _ret = aclrtDestroyStream(stream); - if (_ret != ACL_SUCCESS) { - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", - "aclrtDestroyStream(stream)", (int)_ret, __FILE__, __LINE__); - } - stream = nullptr; - } - if (deviceSet) { - const aclError _ret = aclrtResetDevice(deviceId); - if (_ret != ACL_SUCCESS) { - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", - "aclrtResetDevice(deviceId)", (int)_ret, __FILE__, __LINE__); - } - } - if (aclInited) { - const aclError _ret = aclFinalize(); - if (_ret != ACL_SUCCESS) { - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", - "aclFinalize()", (int)_ret, __FILE__, __LINE__); - } - } - - return rc; -} diff --git a/test/vpto/cases/micro-op/reduction/vcadd-tail/compare.py b/test/vpto/cases/micro-op/reduction/vcadd-tail/compare.py deleted file mode 100644 index c13d79273..000000000 --- 
a/test/vpto/cases/micro-op/reduction/vcadd-tail/compare.py +++ /dev/null @@ -1,39 +0,0 @@ -#!/usr/bin/python3 -# Copyright (c) 2026 Huawei Technologies Co., Ltd. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. - - -import os -import sys -import numpy as np - - -def compare_bin_prefix(golden_path, output_path, dtype, eps, count): - if not os.path.exists(golden_path) or not os.path.exists(output_path): - return False - golden = np.fromfile(golden_path, dtype=dtype, count=count) - output = np.fromfile(output_path, dtype=dtype, count=count) - return golden.shape == output.shape and np.allclose( - golden, output, atol=eps, rtol=eps, equal_nan=True - ) - - -def main(): - strict = os.getenv("COMPARE_STRICT", "1") != "0" - ok = compare_bin_prefix("golden_v2.bin", "v2.bin", np.float32, 1e-4, 1000) - if not ok: - if strict: - print("[ERROR] compare failed") - sys.exit(2) - print("[WARN] compare failed (non-gating)") - return - print("[INFO] compare passed") - - -if __name__ == "__main__": - main() diff --git a/test/vpto/cases/micro-op/reduction/vcadd-tail/golden.py b/test/vpto/cases/micro-op/reduction/vcadd-tail/golden.py deleted file mode 100644 index 9ea041d65..000000000 --- a/test/vpto/cases/micro-op/reduction/vcadd-tail/golden.py +++ /dev/null @@ -1,49 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) 2026 Huawei Technologies Co., Ltd. 
-# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. - -# coding=utf-8 - -import argparse -from pathlib import Path - -import numpy as np - - -ROWS = 32 -COLS = 32 -SEED = 19 -LANES = 64 -LOGICAL_ELEMS = 1000 -def generate(output_dir: Path, seed: int) -> None: - rng = np.random.default_rng(seed) - v1 = rng.uniform(-8.0, 8.0, size=(ROWS, COLS)).astype(np.float32) - v2 = np.zeros((ROWS, COLS), dtype=np.float32) - golden_v2 = np.zeros((ROWS, COLS), dtype=np.float32) - flat_in = v1.reshape(-1) - flat_out = golden_v2.reshape(-1) - for offset in range(0, LOGICAL_ELEMS, LANES): - chunk = flat_in[offset:min(offset + LANES, LOGICAL_ELEMS)] - flat_out[offset] = np.sum(chunk, dtype=np.float32) - - output_dir.mkdir(parents=True, exist_ok=True) - v1.reshape(-1).tofile(output_dir / "v1.bin") - v2.reshape(-1).tofile(output_dir / "v2.bin") - golden_v2.reshape(-1).tofile(output_dir / "golden_v2.bin") - - -def main() -> None: - parser = argparse.ArgumentParser() - parser.add_argument("--output-dir", type=Path, default=Path(".")) - parser.add_argument("--seed", type=int, default=SEED) - args = parser.parse_args() - generate(args.output_dir, args.seed) - - -if __name__ == "__main__": - main() diff --git a/test/vpto/cases/micro-op/reduction/vcadd-tail/kernel.pto b/test/vpto/cases/micro-op/reduction/vcadd-tail/kernel.pto deleted file mode 100644 index ee7ad51fe..000000000 --- a/test/vpto/cases/micro-op/reduction/vcadd-tail/kernel.pto +++ /dev/null @@ -1,42 +0,0 
@@ -module attributes {pto.target_arch = "a5", pto.kernel_kind = #pto.kernel_kind} { - func.func @vabs_tail_kernel_2d(%arg0: !pto.ptr, %arg1: !pto.ptr) attributes {pto.kernel} { - %c0 = arith.constant 0 : index - %c64 = arith.constant 64 : index - %c1024 = arith.constant 1024 : index - %c0_i64 = arith.constant 0 : i64 - %c1_i64 = arith.constant 1 : i64 - %c32_i64 = arith.constant 32 : i64 - %c128_i64 = arith.constant 128 : i64 - %c4096_i64 = arith.constant 4096 : i64 - %c1000_i32 = arith.constant 1000 : i32 - - %ub_in = pto.castptr %c0_i64 : i64 -> !pto.ptr - %ub_out = pto.castptr %c4096_i64 : i64 -> !pto.ptr - - %false = arith.constant false - pto.mte_gm_ub %arg0, %ub_in, %c0_i64, %c128_i64 - nburst(%c32_i64, %c128_i64, %c128_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 - - pto.set_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] - pto.wait_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] - - pto.vecscope { - %_:1 = scf.for %offset = %c0 to %c1024 step %c64 iter_args(%remaining = %c1000_i32) -> (i32) { - %mask, %next_remaining = pto.plt_b32 %remaining : i32 -> !pto.mask, i32 - %vec = pto.vlds %ub_in[%offset] : !pto.ptr -> !pto.vreg<64xf32> - %out = pto.vcadd %vec, %mask : !pto.vreg<64xf32>, !pto.mask -> !pto.vreg<64xf32> - pto.vsts %out, %ub_out[%offset], %mask : !pto.vreg<64xf32>, !pto.ptr, !pto.mask - scf.yield %next_remaining : i32 - } - } - - pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.mte_ub_gm %ub_out, %arg1, %c128_i64 - nburst(%c32_i64, %c128_i64, %c128_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64 - pto.barrier #pto.pipe - return - } -} diff --git a/test/vpto/cases/micro-op/reduction/vcadd-tail/launch.cpp b/test/vpto/cases/micro-op/reduction/vcadd-tail/launch.cpp deleted file mode 100644 index 494bc5bf3..000000000 --- a/test/vpto/cases/micro-op/reduction/vcadd-tail/launch.cpp +++ /dev/null @@ -1,44 +0,0 @@ -// Copyright (c) 2026 Huawei Technologies Co., Ltd. 
-// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). -// Please refer to the License for details. You may not use this file except in compliance with the License. -// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -// See LICENSE in the root of the software repository for the full text of the License. - -#ifndef __VEC_SCOPE__ -#define __VEC_SCOPE__ -#endif - -#if defined(__CCE_AICORE__) && defined(__NPU_ARCH__) && (__NPU_ARCH__ == 2201) -typedef struct { unsigned char v; } hifloat8_t; -typedef struct { unsigned char v; } float8_e4m3_t; -typedef struct { unsigned char v; } float8_e5m2_t; -typedef struct { unsigned char v; } float8_e8m0_t; -typedef struct { unsigned char v; } float4_e1m2x2_t; -typedef struct { unsigned char v; } float4_e2m1x2_t; -#endif -#include - -#if defined(__CCE_AICORE__) && defined(PTOAS_ENABLE_CCE_PRINT) -#include -#endif - -#if !defined(__CCE_AICORE__) && !defined(TMRGSORT_HPP) -struct MrgSortExecutedNumList { - uint16_t mrgSortList0; - uint16_t mrgSortList1; - uint16_t mrgSortList2; - uint16_t mrgSortList3; -}; -#endif -#ifndef __CPU_SIM -#include "acl/acl.h" -#endif - -extern "C" __global__ [aicore] void vabs_tail_kernel_2d(__gm__ float *v1, - __gm__ float *v2); - -void LaunchVabs_tail_kernel_2d(float *v1, float *v2, void *stream) { - vabs_tail_kernel_2d<<<1, nullptr, stream>>>((__gm__ float *)v1, (__gm__ float *)v2); -} diff --git a/test/vpto/cases/micro-op/reduction/vcadd-tail/main.cpp b/test/vpto/cases/micro-op/reduction/vcadd-tail/main.cpp deleted file mode 100644 index cf25e5dff..000000000 --- a/test/vpto/cases/micro-op/reduction/vcadd-tail/main.cpp +++ /dev/null @@ -1,87 +0,0 @@ -// Copyright (c) 2026 Huawei Technologies Co., Ltd. 
-// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). -// Please refer to the License for details. You may not use this file except in compliance with the License. -// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -// See LICENSE in the root of the software repository for the full text of the License. - -/** -Copyright (c) 2025 Huawei Technologies Co., Ltd. -*/ - -#include "test_common.h" -#include "acl/acl.h" -#include -#include - -using namespace PtoTestCommon; - -#define ACL_CHECK(expr) \ - do { \ - const aclError _ret = (expr); \ - if (_ret != ACL_SUCCESS) { \ - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", #expr, \ - (int)_ret, __FILE__, __LINE__); \ - const char *_recent = aclGetRecentErrMsg(); \ - if (_recent != nullptr && _recent[0] != '\0') \ - std::fprintf(stderr, "[ERROR] RecentErrMsg: %s\n", _recent); \ - rc = 1; \ - goto cleanup; \ - } \ - } while (0) - -void LaunchVabs_tail_kernel_2d(float *v1, float *v2, void *stream); - -int main() { - size_t elemCount_v1 = 1024; - size_t fileSize_v1 = elemCount_v1 * sizeof(float); - size_t elemCount_v2 = 1024; - size_t fileSize_v2 = elemCount_v2 * sizeof(float); - float *v1Host = nullptr; - float *v1Device = nullptr; - float *v2Host = nullptr; - float *v2Device = nullptr; - - int rc = 0; - bool aclInited = false; - bool deviceSet = false; - int deviceId = 0; - aclrtStream stream = nullptr; - - ACL_CHECK(aclInit(nullptr)); - aclInited = true; - if (const char *envDevice = std::getenv("ACL_DEVICE_ID")) - deviceId = std::atoi(envDevice); - ACL_CHECK(aclrtSetDevice(deviceId)); - deviceSet = true; - ACL_CHECK(aclrtCreateStream(&stream)); - - ACL_CHECK(aclrtMallocHost((void **)(&v1Host), fileSize_v1)); - 
ACL_CHECK(aclrtMallocHost((void **)(&v2Host), fileSize_v2)); - ACL_CHECK(aclrtMalloc((void **)&v1Device, fileSize_v1, ACL_MEM_MALLOC_HUGE_FIRST)); - ACL_CHECK(aclrtMalloc((void **)&v2Device, fileSize_v2, ACL_MEM_MALLOC_HUGE_FIRST)); - - ReadFile("./v1.bin", fileSize_v1, v1Host, fileSize_v1); - ReadFile("./v2.bin", fileSize_v2, v2Host, fileSize_v2); - ACL_CHECK(aclrtMemcpy(v1Device, fileSize_v1, v1Host, fileSize_v1, ACL_MEMCPY_HOST_TO_DEVICE)); - ACL_CHECK(aclrtMemcpy(v2Device, fileSize_v2, v2Host, fileSize_v2, ACL_MEMCPY_HOST_TO_DEVICE)); - LaunchVabs_tail_kernel_2d(v1Device, v2Device, stream); - - ACL_CHECK(aclrtSynchronizeStream(stream)); - ACL_CHECK(aclrtMemcpy(v2Host, fileSize_v2, v2Device, fileSize_v2, ACL_MEMCPY_DEVICE_TO_HOST)); - WriteFile("./v2.bin", v2Host, fileSize_v2); - -cleanup: - aclrtFree(v1Device); - aclrtFree(v2Device); - aclrtFreeHost(v1Host); - aclrtFreeHost(v2Host); - if (stream != nullptr) - aclrtDestroyStream(stream); - if (deviceSet) - aclrtResetDevice(deviceId); - if (aclInited) - aclFinalize(); - return rc; -} diff --git a/test/vpto/cases/micro-op/reduction/vcadd/kernel.pto b/test/vpto/cases/micro-op/reduction/vcadd/kernel.pto index a660a8be0..659b5e16f 100644 --- a/test/vpto/cases/micro-op/reduction/vcadd/kernel.pto +++ b/test/vpto/cases/micro-op/reduction/vcadd/kernel.pto @@ -1,43 +1,446 @@ module attributes {pto.target_arch = "a5", pto.kernel_kind = #pto.kernel_kind} { - func.func @vabs_kernel_2d(%arg0: !pto.ptr, %arg1: !pto.ptr) attributes {pto.kernel} { - %c0 = arith.constant 0 : index - %c1 = arith.constant 1 : index - %c64 = arith.constant 64 : index - %c1024 = arith.constant 1024 : index - %c0_i64 = arith.constant 0 : i64 - %c1_i64 = arith.constant 1 : i64 - %c32_i64 = arith.constant 32 : i64 - %c128_i64 = arith.constant 128 : i64 - %c4096_i64 = arith.constant 4096 : i64 - %c1024_i32 = arith.constant 1024 : i32 - - %ub_in = pto.castptr %c0_i64 : i64 -> !pto.ptr - %ub_out = pto.castptr %c4096_i64 : i64 -> !pto.ptr - - %false = 
arith.constant false - pto.mte_gm_ub %arg0, %ub_in, %c0_i64, %c128_i64 - nburst(%c32_i64, %c128_i64, %c128_i64) + func.func @vcadd_deep_merged_kernel(%arg0: !pto.ptr, %arg1: !pto.ptr, %arg2: !pto.ptr, %arg3: !pto.ptr) attributes {pto.kernel} { + %__deep_merge_guard = arith.constant false + // inactive merged from vabs_tail_kernel_2d + scf.if %__deep_merge_guard { + + %c0_m0 = arith.constant 0 : index + %c64_m0 = arith.constant 64 : index + %c1024_m0 = arith.constant 1024 : index + %c0_i64_m0 = arith.constant 0 : i64 + %c1_i64_m0 = arith.constant 1 : i64 + %c32_i64_m0 = arith.constant 32 : i64 + %c128_i64_m0 = arith.constant 128 : i64 + %c4096_i64_m0 = arith.constant 4096 : i64 + %c1000_i32_m0 = arith.constant 1000 : i32 + + %ub_in_m0 = pto.castptr %c0_i64_m0 : i64 -> !pto.ptr + %ub_out_m0 = pto.castptr %c4096_i64_m0 : i64 -> !pto.ptr + + %false_m0 = arith.constant false + pto.mte_gm_ub %arg0, %ub_in_m0, %c0_i64_m0, %c128_i64_m0 + nburst(%c32_i64_m0, %c128_i64_m0, %c128_i64_m0) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 + + pto.set_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] + pto.wait_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] + + pto.vecscope { + %__m0:1 = scf.for %offset_m0 = %c0_m0 to %c1024_m0 step %c64_m0 iter_args(%remaining_m0 = %c1000_i32_m0) -> (i32) { + %mask_m0, %next_remaining_m0 = pto.plt_b32 %remaining_m0 : i32 -> !pto.mask, i32 + %vec_m0 = pto.vlds %ub_in_m0[%offset_m0] : !pto.ptr -> !pto.vreg<64xf32> + %out_m0 = pto.vcadd %vec_m0, %mask_m0 : !pto.vreg<64xf32>, !pto.mask -> !pto.vreg<64xf32> + pto.vsts %out_m0, %ub_out_m0[%offset_m0], %mask_m0 : !pto.vreg<64xf32>, !pto.ptr, !pto.mask + scf.yield %next_remaining_m0 : i32 + } + } + + pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] + pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] + pto.mte_ub_gm %ub_out_m0, %arg1, %c128_i64_m0 + nburst(%c32_i64_m0, %c128_i64_m0, %c128_i64_m0) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64 + pto.barrier #pto.pipe + + } + // active merged from vabs_kernel_2d + + %c0_m1 = 
arith.constant 0 : index + %c1_m1 = arith.constant 1 : index + %c64_m1 = arith.constant 64 : index + %c1024_m1 = arith.constant 1024 : index + %c0_i64_m1 = arith.constant 0 : i64 + %c1_i64_m1 = arith.constant 1 : i64 + %c32_i64_m1 = arith.constant 32 : i64 + %c128_i64_m1 = arith.constant 128 : i64 + %c4096_i64_m1 = arith.constant 4096 : i64 + %c1024_i32_m1 = arith.constant 1024 : i32 + + %ub_in_m1 = pto.castptr %c0_i64_m1 : i64 -> !pto.ptr + %ub_out_m1 = pto.castptr %c4096_i64_m1 : i64 -> !pto.ptr + + %false_m1 = arith.constant false + pto.mte_gm_ub %arg2, %ub_in_m1, %c0_i64_m1, %c128_i64_m1 + nburst(%c32_i64_m1, %c128_i64_m1, %c128_i64_m1) : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 pto.set_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] pto.wait_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] pto.vecscope { - %_:1 = scf.for %offset = %c0 to %c1024 step %c64 iter_args(%remaining = %c1024_i32) -> (i32) { - %mask, %next_remaining = pto.plt_b32 %remaining : i32 -> !pto.mask, i32 - %vec = pto.vlds %ub_in[%offset] : !pto.ptr -> !pto.vreg<64xf32> - %out = pto.vcadd %vec, %mask : !pto.vreg<64xf32>, !pto.mask -> !pto.vreg<64xf32> - pto.vsts %out, %ub_out[%offset], %mask : !pto.vreg<64xf32>, !pto.ptr, !pto.mask - scf.yield %next_remaining : i32 + %__m1:1 = scf.for %offset_m1 = %c0_m1 to %c1024_m1 step %c64_m1 iter_args(%remaining_m1 = %c1024_i32_m1) -> (i32) { + %mask_m1, %next_remaining_m1 = pto.plt_b32 %remaining_m1 : i32 -> !pto.mask, i32 + %vec_m1 = pto.vlds %ub_in_m1[%offset_m1] : !pto.ptr -> !pto.vreg<64xf32> + %out_m1 = pto.vcadd %vec_m1, %mask_m1 : !pto.vreg<64xf32>, !pto.mask -> !pto.vreg<64xf32> + pto.vsts %out_m1, %ub_out_m1[%offset_m1], %mask_m1 : !pto.vreg<64xf32>, !pto.ptr, !pto.mask + scf.yield %next_remaining_m1 : i32 } } pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.mte_ub_gm %ub_out, %arg1, %c128_i64 - nburst(%c32_i64, %c128_i64, %c128_i64) + pto.mte_ub_gm %ub_out_m1, %arg3, %c128_i64_m1 + nburst(%c32_i64_m1, 
%c128_i64_m1, %c128_i64_m1) : !pto.ptr, !pto.ptr, i64, i64, i64, i64 pto.barrier #pto.pipe + %__case_merge_guard = arith.constant false + + // inactive cross-case merge from micro-op/reduction/vcgadd + scf.if %__case_merge_guard { + + %__deep_merge_guard_cmg14_1 = arith.constant false + // inactive merged from vcgadd_tail_kernel_2d + scf.if %__deep_merge_guard_cmg14_1 { + + %c0_m0_cmg14_1 = arith.constant 0 : index + %c1_m0_cmg14_1 = arith.constant 1 : index + %c64_m0_cmg14_1 = arith.constant 64 : index + %c1024_m0_cmg14_1 = arith.constant 1024 : index + %c0_i64_m0_cmg14_1 = arith.constant 0 : i64 + %c1_i64_m0_cmg14_1 = arith.constant 1 : i64 + %c32_i64_m0_cmg14_1 = arith.constant 32 : i64 + %c128_i64_m0_cmg14_1 = arith.constant 128 : i64 + %c4096_i64_m0_cmg14_1 = arith.constant 4096 : i64 + %c1000_i32_m0_cmg14_1 = arith.constant 1000 : i32 + + %ub_in_m0_cmg14_1 = pto.castptr %c0_i64_m0_cmg14_1 : i64 -> !pto.ptr + %ub_out_m0_cmg14_1 = pto.castptr %c4096_i64_m0_cmg14_1 : i64 -> !pto.ptr + + %false_m0_cmg14_1 = arith.constant false + pto.mte_gm_ub %arg0, %ub_in_m0_cmg14_1, %c0_i64_m0_cmg14_1, %c128_i64_m0_cmg14_1 + nburst(%c32_i64_m0_cmg14_1, %c128_i64_m0_cmg14_1, %c128_i64_m0_cmg14_1) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 + + pto.set_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] + pto.wait_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] + + pto.vecscope { + %__m0_cmg14_1:1 = scf.for %offset_m0_cmg14_1 = %c0_m0_cmg14_1 to %c1024_m0_cmg14_1 step %c64_m0_cmg14_1 iter_args(%remaining_m0_cmg14_1 = %c1000_i32_m0_cmg14_1) -> (i32) { + %mask_m0_cmg14_1, %next_remaining_m0_cmg14_1 = pto.plt_b32 %remaining_m0_cmg14_1 : i32 -> !pto.mask, i32 + %vec_m0_cmg14_1 = pto.vlds %ub_in_m0_cmg14_1[%offset_m0_cmg14_1] : !pto.ptr -> !pto.vreg<64xf32> + %out_m0_cmg14_1 = pto.vcgadd %vec_m0_cmg14_1, %mask_m0_cmg14_1 : !pto.vreg<64xf32>, !pto.mask -> !pto.vreg<64xf32> + pto.vsts %out_m0_cmg14_1, %ub_out_m0_cmg14_1[%offset_m0_cmg14_1], %mask_m0_cmg14_1 : !pto.vreg<64xf32>, !pto.ptr, !pto.mask + 
scf.yield %next_remaining_m0_cmg14_1 : i32 + } + } + + pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] + pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] + pto.mte_ub_gm %ub_out_m0_cmg14_1, %arg1, %c128_i64_m0_cmg14_1 + nburst(%c32_i64_m0_cmg14_1, %c128_i64_m0_cmg14_1, %c128_i64_m0_cmg14_1) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64 + pto.barrier #pto.pipe + + } + // active merged from vcgadd_kernel_2d + + %c0_m1_cmg14_1 = arith.constant 0 : index + %c1_m1_cmg14_1 = arith.constant 1 : index + %c64_m1_cmg14_1 = arith.constant 64 : index + %c1024_m1_cmg14_1 = arith.constant 1024 : index + %c0_i64_m1_cmg14_1 = arith.constant 0 : i64 + %c1_i64_m1_cmg14_1 = arith.constant 1 : i64 + %c32_i64_m1_cmg14_1 = arith.constant 32 : i64 + %c128_i64_m1_cmg14_1 = arith.constant 128 : i64 + %c4096_i64_m1_cmg14_1 = arith.constant 4096 : i64 + %c1024_i32_m1_cmg14_1 = arith.constant 1024 : i32 + + %ub_in_m1_cmg14_1 = pto.castptr %c0_i64_m1_cmg14_1 : i64 -> !pto.ptr + %ub_out_m1_cmg14_1 = pto.castptr %c4096_i64_m1_cmg14_1 : i64 -> !pto.ptr + + %false_m1_cmg14_1 = arith.constant false + pto.mte_gm_ub %arg2, %ub_in_m1_cmg14_1, %c0_i64_m1_cmg14_1, %c128_i64_m1_cmg14_1 + nburst(%c32_i64_m1_cmg14_1, %c128_i64_m1_cmg14_1, %c128_i64_m1_cmg14_1) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 + + pto.set_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] + pto.wait_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] + + pto.vecscope { + %__m1_cmg14_1:1 = scf.for %offset_m1_cmg14_1 = %c0_m1_cmg14_1 to %c1024_m1_cmg14_1 step %c64_m1_cmg14_1 iter_args(%remaining_m1_cmg14_1 = %c1024_i32_m1_cmg14_1) -> (i32) { + %mask_m1_cmg14_1, %next_remaining_m1_cmg14_1 = pto.plt_b32 %remaining_m1_cmg14_1 : i32 -> !pto.mask, i32 + %vec_m1_cmg14_1 = pto.vlds %ub_in_m1_cmg14_1[%offset_m1_cmg14_1] : !pto.ptr -> !pto.vreg<64xf32> + %out_m1_cmg14_1 = pto.vcgadd %vec_m1_cmg14_1, %mask_m1_cmg14_1 : !pto.vreg<64xf32>, !pto.mask -> !pto.vreg<64xf32> + pto.vsts %out_m1_cmg14_1, %ub_out_m1_cmg14_1[%offset_m1_cmg14_1], %mask_m1_cmg14_1 : 
!pto.vreg<64xf32>, !pto.ptr, !pto.mask + scf.yield %next_remaining_m1_cmg14_1 : i32 + } + } + + pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] + pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] + pto.mte_ub_gm %ub_out_m1_cmg14_1, %arg3, %c128_i64_m1_cmg14_1 + nburst(%c32_i64_m1_cmg14_1, %c128_i64_m1_cmg14_1, %c128_i64_m1_cmg14_1) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64 + pto.barrier #pto.pipe + } + + // inactive cross-case merge from micro-op/reduction/vcgmax + scf.if %__case_merge_guard { + + %__deep_merge_guard_cmg14_2 = arith.constant false + // inactive merged from vcgmax_tie_kernel_2d + scf.if %__deep_merge_guard_cmg14_2 { + + %c0_m0_cmg14_2 = arith.constant 0 : index + %c1_m0_cmg14_2 = arith.constant 1 : index + %c64_m0_cmg14_2 = arith.constant 64 : index + %c1024_m0_cmg14_2 = arith.constant 1024 : index + %c0_i64_m0_cmg14_2 = arith.constant 0 : i64 + %c1_i64_m0_cmg14_2 = arith.constant 1 : i64 + %c32_i64_m0_cmg14_2 = arith.constant 32 : i64 + %c128_i64_m0_cmg14_2 = arith.constant 128 : i64 + %c4096_i64_m0_cmg14_2 = arith.constant 4096 : i64 + %c1024_i32_m0_cmg14_2 = arith.constant 1024 : i32 + + %ub_in_m0_cmg14_2 = pto.castptr %c0_i64_m0_cmg14_2 : i64 -> !pto.ptr + %ub_out_m0_cmg14_2 = pto.castptr %c4096_i64_m0_cmg14_2 : i64 -> !pto.ptr + + %false_m0_cmg14_2 = arith.constant false + pto.mte_gm_ub %arg0, %ub_in_m0_cmg14_2, %c0_i64_m0_cmg14_2, %c128_i64_m0_cmg14_2 + nburst(%c32_i64_m0_cmg14_2, %c128_i64_m0_cmg14_2, %c128_i64_m0_cmg14_2) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 + + pto.set_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] + pto.wait_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] + + pto.vecscope { + %__m0_cmg14_2:1 = scf.for %offset_m0_cmg14_2 = %c0_m0_cmg14_2 to %c1024_m0_cmg14_2 step %c64_m0_cmg14_2 iter_args(%remaining_m0_cmg14_2 = %c1024_i32_m0_cmg14_2) -> (i32) { + %mask_m0_cmg14_2, %next_remaining_m0_cmg14_2 = pto.plt_b32 %remaining_m0_cmg14_2 : i32 -> !pto.mask, i32 + %vec_m0_cmg14_2 = pto.vlds %ub_in_m0_cmg14_2[%offset_m0_cmg14_2] : 
!pto.ptr -> !pto.vreg<64xf32> + %out_m0_cmg14_2 = pto.vcgmax %vec_m0_cmg14_2, %mask_m0_cmg14_2 : !pto.vreg<64xf32>, !pto.mask -> !pto.vreg<64xf32> + pto.vsts %out_m0_cmg14_2, %ub_out_m0_cmg14_2[%offset_m0_cmg14_2], %mask_m0_cmg14_2 : !pto.vreg<64xf32>, !pto.ptr, !pto.mask + scf.yield %next_remaining_m0_cmg14_2 : i32 + } + } + + pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] + pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] + pto.mte_ub_gm %ub_out_m0_cmg14_2, %arg1, %c128_i64_m0_cmg14_2 + nburst(%c32_i64_m0_cmg14_2, %c128_i64_m0_cmg14_2, %c128_i64_m0_cmg14_2) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64 + pto.barrier #pto.pipe + + } + // active merged from vcgmax_kernel_2d + + %c0_m1_cmg14_2 = arith.constant 0 : index + %c1_m1_cmg14_2 = arith.constant 1 : index + %c64_m1_cmg14_2 = arith.constant 64 : index + %c1024_m1_cmg14_2 = arith.constant 1024 : index + %c0_i64_m1_cmg14_2 = arith.constant 0 : i64 + %c1_i64_m1_cmg14_2 = arith.constant 1 : i64 + %c32_i64_m1_cmg14_2 = arith.constant 32 : i64 + %c128_i64_m1_cmg14_2 = arith.constant 128 : i64 + %c4096_i64_m1_cmg14_2 = arith.constant 4096 : i64 + %c1024_i32_m1_cmg14_2 = arith.constant 1024 : i32 + + %ub_in_m1_cmg14_2 = pto.castptr %c0_i64_m1_cmg14_2 : i64 -> !pto.ptr + %ub_out_m1_cmg14_2 = pto.castptr %c4096_i64_m1_cmg14_2 : i64 -> !pto.ptr + + %false_m1_cmg14_2 = arith.constant false + pto.mte_gm_ub %arg2, %ub_in_m1_cmg14_2, %c0_i64_m1_cmg14_2, %c128_i64_m1_cmg14_2 + nburst(%c32_i64_m1_cmg14_2, %c128_i64_m1_cmg14_2, %c128_i64_m1_cmg14_2) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 + + pto.set_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] + pto.wait_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] + + pto.vecscope { + %__m1_cmg14_2:1 = scf.for %offset_m1_cmg14_2 = %c0_m1_cmg14_2 to %c1024_m1_cmg14_2 step %c64_m1_cmg14_2 iter_args(%remaining_m1_cmg14_2 = %c1024_i32_m1_cmg14_2) -> (i32) { + %mask_m1_cmg14_2, %next_remaining_m1_cmg14_2 = pto.plt_b32 %remaining_m1_cmg14_2 : i32 -> !pto.mask, i32 + %vec_m1_cmg14_2 = pto.vlds 
%ub_in_m1_cmg14_2[%offset_m1_cmg14_2] : !pto.ptr -> !pto.vreg<64xf32> + %out_m1_cmg14_2 = pto.vcgmax %vec_m1_cmg14_2, %mask_m1_cmg14_2 : !pto.vreg<64xf32>, !pto.mask -> !pto.vreg<64xf32> + pto.vsts %out_m1_cmg14_2, %ub_out_m1_cmg14_2[%offset_m1_cmg14_2], %mask_m1_cmg14_2 : !pto.vreg<64xf32>, !pto.ptr, !pto.mask + scf.yield %next_remaining_m1_cmg14_2 : i32 + } + } + + pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] + pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] + pto.mte_ub_gm %ub_out_m1_cmg14_2, %arg3, %c128_i64_m1_cmg14_2 + nburst(%c32_i64_m1_cmg14_2, %c128_i64_m1_cmg14_2, %c128_i64_m1_cmg14_2) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64 + pto.barrier #pto.pipe + } + + // inactive cross-case merge from micro-op/reduction/vcgmin + scf.if %__case_merge_guard { + + %__deep_merge_guard_cmg14_3 = arith.constant false + // inactive merged from vcgmin_tie_kernel_2d + scf.if %__deep_merge_guard_cmg14_3 { + + %c0_m0_cmg14_3 = arith.constant 0 : index + %c1_m0_cmg14_3 = arith.constant 1 : index + %c64_m0_cmg14_3 = arith.constant 64 : index + %c1024_m0_cmg14_3 = arith.constant 1024 : index + %c0_i64_m0_cmg14_3 = arith.constant 0 : i64 + %c1_i64_m0_cmg14_3 = arith.constant 1 : i64 + %c32_i64_m0_cmg14_3 = arith.constant 32 : i64 + %c128_i64_m0_cmg14_3 = arith.constant 128 : i64 + %c4096_i64_m0_cmg14_3 = arith.constant 4096 : i64 + %c1024_i32_m0_cmg14_3 = arith.constant 1024 : i32 + + %ub_in_m0_cmg14_3 = pto.castptr %c0_i64_m0_cmg14_3 : i64 -> !pto.ptr + %ub_out_m0_cmg14_3 = pto.castptr %c4096_i64_m0_cmg14_3 : i64 -> !pto.ptr + + %false_m0_cmg14_3 = arith.constant false + pto.mte_gm_ub %arg0, %ub_in_m0_cmg14_3, %c0_i64_m0_cmg14_3, %c128_i64_m0_cmg14_3 + nburst(%c32_i64_m0_cmg14_3, %c128_i64_m0_cmg14_3, %c128_i64_m0_cmg14_3) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 + + pto.set_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] + pto.wait_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] + + pto.vecscope { + %__m0_cmg14_3:1 = scf.for %offset_m0_cmg14_3 = %c0_m0_cmg14_3 to 
%c1024_m0_cmg14_3 step %c64_m0_cmg14_3 iter_args(%remaining_m0_cmg14_3 = %c1024_i32_m0_cmg14_3) -> (i32) { + %mask_m0_cmg14_3, %next_remaining_m0_cmg14_3 = pto.plt_b32 %remaining_m0_cmg14_3 : i32 -> !pto.mask, i32 + %vec_m0_cmg14_3 = pto.vlds %ub_in_m0_cmg14_3[%offset_m0_cmg14_3] : !pto.ptr -> !pto.vreg<64xf32> + %out_m0_cmg14_3 = pto.vcgmin %vec_m0_cmg14_3, %mask_m0_cmg14_3 : !pto.vreg<64xf32>, !pto.mask -> !pto.vreg<64xf32> + pto.vsts %out_m0_cmg14_3, %ub_out_m0_cmg14_3[%offset_m0_cmg14_3], %mask_m0_cmg14_3 : !pto.vreg<64xf32>, !pto.ptr, !pto.mask + scf.yield %next_remaining_m0_cmg14_3 : i32 + } + } + + pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] + pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] + pto.mte_ub_gm %ub_out_m0_cmg14_3, %arg1, %c128_i64_m0_cmg14_3 + nburst(%c32_i64_m0_cmg14_3, %c128_i64_m0_cmg14_3, %c128_i64_m0_cmg14_3) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64 + pto.barrier #pto.pipe + + } + // active merged from vcgmin_kernel_2d + + %c0_m1_cmg14_3 = arith.constant 0 : index + %c1_m1_cmg14_3 = arith.constant 1 : index + %c64_m1_cmg14_3 = arith.constant 64 : index + %c1024_m1_cmg14_3 = arith.constant 1024 : index + %c0_i64_m1_cmg14_3 = arith.constant 0 : i64 + %c1_i64_m1_cmg14_3 = arith.constant 1 : i64 + %c32_i64_m1_cmg14_3 = arith.constant 32 : i64 + %c128_i64_m1_cmg14_3 = arith.constant 128 : i64 + %c4096_i64_m1_cmg14_3 = arith.constant 4096 : i64 + %c1024_i32_m1_cmg14_3 = arith.constant 1024 : i32 + + %ub_in_m1_cmg14_3 = pto.castptr %c0_i64_m1_cmg14_3 : i64 -> !pto.ptr + %ub_out_m1_cmg14_3 = pto.castptr %c4096_i64_m1_cmg14_3 : i64 -> !pto.ptr + + %false_m1_cmg14_3 = arith.constant false + pto.mte_gm_ub %arg2, %ub_in_m1_cmg14_3, %c0_i64_m1_cmg14_3, %c128_i64_m1_cmg14_3 + nburst(%c32_i64_m1_cmg14_3, %c128_i64_m1_cmg14_3, %c128_i64_m1_cmg14_3) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 + + pto.set_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] + pto.wait_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] + + pto.vecscope { + %__m1_cmg14_3:1 = scf.for 
%offset_m1_cmg14_3 = %c0_m1_cmg14_3 to %c1024_m1_cmg14_3 step %c64_m1_cmg14_3 iter_args(%remaining_m1_cmg14_3 = %c1024_i32_m1_cmg14_3) -> (i32) { + %mask_m1_cmg14_3, %next_remaining_m1_cmg14_3 = pto.plt_b32 %remaining_m1_cmg14_3 : i32 -> !pto.mask, i32 + %vec_m1_cmg14_3 = pto.vlds %ub_in_m1_cmg14_3[%offset_m1_cmg14_3] : !pto.ptr -> !pto.vreg<64xf32> + %out_m1_cmg14_3 = pto.vcgmin %vec_m1_cmg14_3, %mask_m1_cmg14_3 : !pto.vreg<64xf32>, !pto.mask -> !pto.vreg<64xf32> + pto.vsts %out_m1_cmg14_3, %ub_out_m1_cmg14_3[%offset_m1_cmg14_3], %mask_m1_cmg14_3 : !pto.vreg<64xf32>, !pto.ptr, !pto.mask + scf.yield %next_remaining_m1_cmg14_3 : i32 + } + } + + pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] + pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] + pto.mte_ub_gm %ub_out_m1_cmg14_3, %arg3, %c128_i64_m1_cmg14_3 + nburst(%c32_i64_m1_cmg14_3, %c128_i64_m1_cmg14_3, %c128_i64_m1_cmg14_3) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64 + pto.barrier #pto.pipe + } + + // inactive cross-case merge from micro-op/reduction/vcpadd + scf.if %__case_merge_guard { + + %__deep_merge_guard_cmg14_4 = arith.constant false + // inactive merged from vcpadd_tail_kernel_2d + scf.if %__deep_merge_guard_cmg14_4 { + + %c0_m0_cmg14_4 = arith.constant 0 : index + %c1_m0_cmg14_4 = arith.constant 1 : index + %c64_m0_cmg14_4 = arith.constant 64 : index + %c1024_m0_cmg14_4 = arith.constant 1024 : index + %c0_i64_m0_cmg14_4 = arith.constant 0 : i64 + %c1_i64_m0_cmg14_4 = arith.constant 1 : i64 + %c32_i64_m0_cmg14_4 = arith.constant 32 : i64 + %c128_i64_m0_cmg14_4 = arith.constant 128 : i64 + %c4096_i64_m0_cmg14_4 = arith.constant 4096 : i64 + %c1000_i32_m0_cmg14_4 = arith.constant 1000 : i32 + + %ub_in_m0_cmg14_4 = pto.castptr %c0_i64_m0_cmg14_4 : i64 -> !pto.ptr + %ub_out_m0_cmg14_4 = pto.castptr %c4096_i64_m0_cmg14_4 : i64 -> !pto.ptr + + %false_m0_cmg14_4 = arith.constant false + pto.mte_gm_ub %arg0, %ub_in_m0_cmg14_4, %c0_i64_m0_cmg14_4, %c128_i64_m0_cmg14_4 + nburst(%c32_i64_m0_cmg14_4, 
%c128_i64_m0_cmg14_4, %c128_i64_m0_cmg14_4) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 + + pto.set_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] + pto.wait_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] + + pto.vecscope { + %__m0_cmg14_4:1 = scf.for %offset_m0_cmg14_4 = %c0_m0_cmg14_4 to %c1024_m0_cmg14_4 step %c64_m0_cmg14_4 iter_args(%remaining_m0_cmg14_4 = %c1000_i32_m0_cmg14_4) -> (i32) { + %mask_m0_cmg14_4, %next_remaining_m0_cmg14_4 = pto.plt_b32 %remaining_m0_cmg14_4 : i32 -> !pto.mask, i32 + %vec_m0_cmg14_4 = pto.vlds %ub_in_m0_cmg14_4[%offset_m0_cmg14_4] : !pto.ptr -> !pto.vreg<64xf32> + %out_m0_cmg14_4 = pto.vcpadd %vec_m0_cmg14_4, %mask_m0_cmg14_4 : !pto.vreg<64xf32>, !pto.mask -> !pto.vreg<64xf32> + pto.vsts %out_m0_cmg14_4, %ub_out_m0_cmg14_4[%offset_m0_cmg14_4], %mask_m0_cmg14_4 : !pto.vreg<64xf32>, !pto.ptr, !pto.mask + scf.yield %next_remaining_m0_cmg14_4 : i32 + } + } + + pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] + pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] + pto.mte_ub_gm %ub_out_m0_cmg14_4, %arg1, %c128_i64_m0_cmg14_4 + nburst(%c32_i64_m0_cmg14_4, %c128_i64_m0_cmg14_4, %c128_i64_m0_cmg14_4) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64 + pto.barrier #pto.pipe + + } + // active merged from vcpadd_kernel_2d + + %c0_m1_cmg14_4 = arith.constant 0 : index + %c1_m1_cmg14_4 = arith.constant 1 : index + %c64_m1_cmg14_4 = arith.constant 64 : index + %c1024_m1_cmg14_4 = arith.constant 1024 : index + %c0_i64_m1_cmg14_4 = arith.constant 0 : i64 + %c1_i64_m1_cmg14_4 = arith.constant 1 : i64 + %c32_i64_m1_cmg14_4 = arith.constant 32 : i64 + %c128_i64_m1_cmg14_4 = arith.constant 128 : i64 + %c4096_i64_m1_cmg14_4 = arith.constant 4096 : i64 + %c1024_i32_m1_cmg14_4 = arith.constant 1024 : i32 + + %ub_in_m1_cmg14_4 = pto.castptr %c0_i64_m1_cmg14_4 : i64 -> !pto.ptr + %ub_out_m1_cmg14_4 = pto.castptr %c4096_i64_m1_cmg14_4 : i64 -> !pto.ptr + + %false_m1_cmg14_4 = arith.constant false + pto.mte_gm_ub %arg2, %ub_in_m1_cmg14_4, %c0_i64_m1_cmg14_4, 
%c128_i64_m1_cmg14_4 + nburst(%c32_i64_m1_cmg14_4, %c128_i64_m1_cmg14_4, %c128_i64_m1_cmg14_4) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 + + pto.set_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] + pto.wait_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] + + pto.vecscope { + %__m1_cmg14_4:1 = scf.for %offset_m1_cmg14_4 = %c0_m1_cmg14_4 to %c1024_m1_cmg14_4 step %c64_m1_cmg14_4 iter_args(%remaining_m1_cmg14_4 = %c1024_i32_m1_cmg14_4) -> (i32) { + %mask_m1_cmg14_4, %next_remaining_m1_cmg14_4 = pto.plt_b32 %remaining_m1_cmg14_4 : i32 -> !pto.mask, i32 + %vec_m1_cmg14_4 = pto.vlds %ub_in_m1_cmg14_4[%offset_m1_cmg14_4] : !pto.ptr -> !pto.vreg<64xf32> + %out_m1_cmg14_4 = pto.vcpadd %vec_m1_cmg14_4, %mask_m1_cmg14_4 : !pto.vreg<64xf32>, !pto.mask -> !pto.vreg<64xf32> + pto.vsts %out_m1_cmg14_4, %ub_out_m1_cmg14_4[%offset_m1_cmg14_4], %mask_m1_cmg14_4 : !pto.vreg<64xf32>, !pto.ptr, !pto.mask + scf.yield %next_remaining_m1_cmg14_4 : i32 + } + } + + pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] + pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] + pto.mte_ub_gm %ub_out_m1_cmg14_4, %arg3, %c128_i64_m1_cmg14_4 + nburst(%c32_i64_m1_cmg14_4, %c128_i64_m1_cmg14_4, %c128_i64_m1_cmg14_4) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64 + pto.barrier #pto.pipe + } + return } } diff --git a/test/vpto/cases/micro-op/reduction/vcadd/launch.cpp b/test/vpto/cases/micro-op/reduction/vcadd/launch.cpp index 9002bcd67..8ad505eff 100644 --- a/test/vpto/cases/micro-op/reduction/vcadd/launch.cpp +++ b/test/vpto/cases/micro-op/reduction/vcadd/launch.cpp @@ -5,22 +5,9 @@ // THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, // INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. // See LICENSE in the root of the software repository for the full text of the License. 
- -// --------------------------------------------------------------------------- -// PTOAS compatibility layer -// -// The upstream pto-isa headers reference some FP8/FP4 types and the -// __VEC_SCOPE__ marker that are not available on every AICore arch/toolchain -// combination (e.g. __NPU_ARCH__==2201). -// -// For our PTOAS-generated kernels we don't rely on these types today, but the -// headers still mention them in templates/static_asserts. Provide minimal -// fallbacks to keep compilation working on dav-c220. -// --------------------------------------------------------------------------- #ifndef __VEC_SCOPE__ #define __VEC_SCOPE__ #endif - #if defined(__CCE_AICORE__) && defined(__NPU_ARCH__) && (__NPU_ARCH__ == 2201) typedef struct { unsigned char v; } hifloat8_t; typedef struct { unsigned char v; } float8_e4m3_t; @@ -30,33 +17,26 @@ typedef struct { unsigned char v; } float4_e1m2x2_t; typedef struct { unsigned char v; } float4_e2m1x2_t; #endif #include - -// AICore printf support is gated behind `--cce-enable-print` on some -// toolchains. When enabled, include the CCE print header so `cce::printf` -// resolves in device compilation. #if defined(__CCE_AICORE__) && defined(PTOAS_ENABLE_CCE_PRINT) #include #endif - -// Some PTO-ISA types are only available in the __CCE_AICORE__ compilation -// path, but `bisheng -xcce` still performs a host-side parse pass. -// Provide minimal fallbacks only when the corresponding header wasn't -// pulled in by the selected arch implementation. 
#if !defined(__CCE_AICORE__) && !defined(TMRGSORT_HPP) -struct MrgSortExecutedNumList { - uint16_t mrgSortList0; - uint16_t mrgSortList1; - uint16_t mrgSortList2; - uint16_t mrgSortList3; -}; +struct MrgSortExecutedNumList { uint16_t mrgSortList0,mrgSortList1,mrgSortList2,mrgSortList3; }; #endif #ifndef __CPU_SIM #include "acl/acl.h" #endif -extern "C" __global__ [aicore] void vabs_kernel_2d(__gm__ float *v1, - __gm__ float *v2); +extern "C" __global__ [aicore] void vcadd_deep_merged_kernel( + __gm__ float * arg0, + __gm__ float * arg1, + __gm__ float * arg2, + __gm__ float * arg3); -void LaunchVabs_kernel_2d(float *v1, float *v2, void *stream) { - vabs_kernel_2d<<<1, nullptr, stream>>>((__gm__ float *)v1, (__gm__ float *)v2); +void LaunchVcaddDeepMerged(float * p0, float * p1, void *stream) { + vcadd_deep_merged_kernel<<<1, nullptr, stream>>>( + (__gm__ float *)p0, + (__gm__ float *)p0, + (__gm__ float *)p0, + (__gm__ float *)p1); } diff --git a/test/vpto/cases/micro-op/reduction/vcadd/main.cpp b/test/vpto/cases/micro-op/reduction/vcadd/main.cpp index 29454461f..b0e743584 100644 --- a/test/vpto/cases/micro-op/reduction/vcadd/main.cpp +++ b/test/vpto/cases/micro-op/reduction/vcadd/main.cpp @@ -47,8 +47,8 @@ struct MrgSortExecutedNumList { } \ } while (0) -void LaunchVabs_kernel_2d(float *v1, float *v2, void *stream); +void LaunchVcaddDeepMerged(float * p0, float * p1, void *stream); int main() { size_t elemCount_v1 = 1024; size_t fileSize_v1 = elemCount_v1 * sizeof(float); @@ -83,7 +83,7 @@ int main() { ReadFile("./v2.bin", fileSize_v2, v2Host, fileSize_v2); ACL_CHECK(aclrtMemcpy(v1Device, fileSize_v1, v1Host, fileSize_v1, ACL_MEMCPY_HOST_TO_DEVICE)); ACL_CHECK(aclrtMemcpy(v2Device, fileSize_v2, v2Host, fileSize_v2, ACL_MEMCPY_HOST_TO_DEVICE)); - LaunchVabs_kernel_2d(v1Device, v2Device, stream); + LaunchVcaddDeepMerged(v1Device, v2Device, stream); ACL_CHECK(aclrtSynchronizeStream(stream)); ACL_CHECK(aclrtMemcpy(v2Host, fileSize_v2, v2Device, fileSize_v2, 
ACL_MEMCPY_DEVICE_TO_HOST)); diff --git a/test/vpto/cases/micro-op/reduction/vcgadd-tail/compare.py b/test/vpto/cases/micro-op/reduction/vcgadd-tail/compare.py deleted file mode 100755 index fb57e856a..000000000 --- a/test/vpto/cases/micro-op/reduction/vcgadd-tail/compare.py +++ /dev/null @@ -1,211 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) 2026 Huawei Technologies Co., Ltd. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. - -# case: micro-op/reduction/vcgadd-tail -# family: reduction -# target_ops: pto.vcgadd -# scenarios: group-reduction, tail-mask, result-placement -# NOTE: bulk-generated coverage skeleton. 
-# coding=utf-8 - -import os -import sys -import numpy as np - - -REPEAT_BYTES = 256 -LOGICAL_ELEMS = 1000 - - -def _ceil_div(x, y): - return (x + y - 1) // y - - -def _packed_pred_storage_bytes(logical_elems, src_elem_bytes): - logical_elems = int(logical_elems) - src_elem_bytes = int(src_elem_bytes) - if logical_elems <= 0: - raise ValueError(f"logical_elems must be > 0, got {logical_elems}") - if src_elem_bytes not in (1, 2, 4): - raise ValueError(f"unsupported packed predicate source size: {src_elem_bytes}") - - repeat_elems = REPEAT_BYTES // src_elem_bytes - if src_elem_bytes == 4: - repeat_times = _ceil_div(logical_elems, repeat_elems) + 1 - loop_count = repeat_times // 2 - return loop_count * 16 - - repeat_times = _ceil_div(logical_elems, repeat_elems) - return repeat_times * (repeat_elems // 8) - - -def compare_bin(golden_path, output_path, dtype, eps): - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - dtype_np = np.dtype(dtype) - golden = np.fromfile(golden_path, dtype=dtype_np) - output = np.fromfile(output_path, dtype=dtype_np) - if golden.shape != output.shape: - print(f"[ERROR] Shape mismatch: {golden_path} {golden.shape} vs {output_path} {output.shape}") - return False - if not np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True): - if golden.size: - if np.issubdtype(dtype_np, np.floating): - g = golden.astype(np.float64, copy=False) - o = output.astype(np.float64, copy=False) - elif np.issubdtype(dtype_np, np.integer) or np.issubdtype(dtype_np, np.unsignedinteger): - g = golden.astype(np.int64, copy=False) - o = output.astype(np.int64, copy=False) - else: - g = golden.astype(np.float64, copy=False) - o = output.astype(np.float64, copy=False) - abs_diff = np.abs(g - o) - idx = int(np.argmax(abs_diff)) - diff = float(abs_diff[idx]) - print( - f"[ERROR] Mismatch: {golden_path} vs 
{output_path}, max diff={diff} at idx={idx} " - f"(golden={g[idx]}, out={o[idx]}, dtype={dtype_np})" - ) - else: - print(f"[ERROR] Mismatch: {golden_path} vs {output_path}, empty buffers, dtype={dtype_np}") - return False - return True - - -def compare_bin_prefix(golden_path, output_path, dtype, eps, count): - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - try: - count = int(count) - except Exception: - print(f"[ERROR] Invalid prefix count: {count}") - return False - if count <= 0: - print(f"[ERROR] Invalid prefix count: {count}") - return False - - dtype_np = np.dtype(dtype) - golden = np.fromfile(golden_path, dtype=dtype_np, count=count) - output = np.fromfile(output_path, dtype=dtype_np, count=count) - - if golden.size != count or output.size != count: - print( - f"[ERROR] Prefix read too small: need={count} elems, " - f"golden={golden.size}, out={output.size}" - ) - return False - - if not np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True): - if golden.size: - if np.issubdtype(dtype_np, np.floating): - g = golden.astype(np.float64, copy=False) - o = output.astype(np.float64, copy=False) - elif np.issubdtype(dtype_np, np.integer) or np.issubdtype(dtype_np, np.unsignedinteger): - g = golden.astype(np.int64, copy=False) - o = output.astype(np.int64, copy=False) - else: - g = golden.astype(np.float64, copy=False) - o = output.astype(np.float64, copy=False) - abs_diff = np.abs(g - o) - idx = int(np.argmax(abs_diff)) - diff = float(abs_diff[idx]) - print( - f"[ERROR] Mismatch (prefix): {golden_path} vs {output_path}, max diff={diff} at idx={idx} " - f"(golden={g[idx]}, out={o[idx]}, dtype={dtype_np}, count={count})" - ) - else: - print(f"[ERROR] Mismatch (prefix): {golden_path} vs {output_path}, empty buffers, dtype={dtype_np}") - return False - return True - - -def 
compare_packed_pred_mask(golden_path, output_path, logical_elems, src_elem_bytes): - """ - Compare outputs of pto.tcmp / pto.tcmps. - - PTO-ISA stores packed predicate results as a linear PK byte stream via - `psts`, with the exact written prefix length determined by the typed - TCMP/TCMPS repeat schedule. Compare only that semantic prefix. - """ - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - try: - logical_elems = int(logical_elems) - src_elem_bytes = int(src_elem_bytes) - except Exception: - print( - "[ERROR] Invalid packed mask compare arguments: " - f"logical_elems={logical_elems} src_elem_bytes={src_elem_bytes}" - ) - return False - if logical_elems <= 0 or src_elem_bytes <= 0: - print( - "[ERROR] Invalid packed mask compare arguments: " - f"logical_elems={logical_elems} src_elem_bytes={src_elem_bytes}" - ) - return False - - golden = np.fromfile(golden_path, dtype=np.uint8) - output = np.fromfile(output_path, dtype=np.uint8) - try: - prefix_bytes = _packed_pred_storage_bytes(logical_elems, src_elem_bytes) - except ValueError as exc: - print(f"[ERROR] {exc}") - return False - - if golden.size < prefix_bytes or output.size < prefix_bytes: - print( - f"[ERROR] Packed mask buffer too small: need={prefix_bytes} bytes, " - f"golden={golden.size}, out={output.size}" - ) - return False - - golden_sel = golden[:prefix_bytes] - output_sel = output[:prefix_bytes] - - if not np.array_equal(golden_sel, output_sel): - diff = np.nonzero(golden_sel != output_sel)[0] - idx = int(diff[0]) if diff.size else 0 - print( - f"[ERROR] Mismatch (packed mask): {golden_path} vs {output_path}, first diff at idx={idx} " - f"(golden={int(golden_sel[idx])}, out={int(output_sel[idx])}, " - f"logical_elems={logical_elems}, src_elem_bytes={src_elem_bytes}, prefix_bytes={prefix_bytes})" - ) - return False - return True - - -def main(): 
- strict = os.getenv("COMPARE_STRICT", "1") != "0" - ok = True - ok = compare_bin_prefix("golden_v2.bin", "v2.bin", np.float32, 0.0001, - LOGICAL_ELEMS) and ok - if not ok: - if strict: - print("[ERROR] compare failed") - sys.exit(2) - print("[WARN] compare failed (non-gating)") - return - print("[INFO] compare passed") - - -if __name__ == "__main__": - main() diff --git a/test/vpto/cases/micro-op/reduction/vcgadd-tail/golden.py b/test/vpto/cases/micro-op/reduction/vcgadd-tail/golden.py deleted file mode 100755 index 282927eff..000000000 --- a/test/vpto/cases/micro-op/reduction/vcgadd-tail/golden.py +++ /dev/null @@ -1,54 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) 2026 Huawei Technologies Co., Ltd. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. 
- -# coding=utf-8 - -import argparse -from pathlib import Path - -import numpy as np - - -ROWS = 32 -COLS = 32 -SEED = 19 -LANES = 64 -LOGICAL_ELEMS = 1000 -def generate(output_dir: Path, seed: int) -> None: - rng = np.random.default_rng(seed) - v1 = rng.uniform(-8.0, 8.0, size=(ROWS, COLS)).astype(np.float32) - - v2 = np.zeros((ROWS, COLS), dtype=np.float32) - golden_v2 = np.zeros((ROWS, COLS), dtype=np.float32) - flat_in = v1.reshape(-1) - flat_out = golden_v2.reshape(-1) - group_elems = 8 - for offset in range(0, LOGICAL_ELEMS, LANES): - chunk = flat_in[offset:min(offset + LANES, LOGICAL_ELEMS)] - for gi, group in enumerate(range(0, chunk.size, group_elems)): - # VCGADD writes one reduced value per 32B block continuously to low lanes. - flat_out[offset + gi] = np.sum(chunk[group:group + group_elems], dtype=np.float32) - - - output_dir.mkdir(parents=True, exist_ok=True) - v1.reshape(-1).tofile(output_dir / "v1.bin") - v2.reshape(-1).tofile(output_dir / "v2.bin") - golden_v2.reshape(-1).tofile(output_dir / "golden_v2.bin") - - -def main() -> None: - parser = argparse.ArgumentParser() - parser.add_argument("--output-dir", type=Path, default=Path(".")) - parser.add_argument("--seed", type=int, default=SEED) - args = parser.parse_args() - generate(args.output_dir, args.seed) - - -if __name__ == "__main__": - main() diff --git a/test/vpto/cases/micro-op/reduction/vcgadd-tail/kernel.pto b/test/vpto/cases/micro-op/reduction/vcgadd-tail/kernel.pto deleted file mode 100644 index 34122605e..000000000 --- a/test/vpto/cases/micro-op/reduction/vcgadd-tail/kernel.pto +++ /dev/null @@ -1,51 +0,0 @@ -// ----------------------------------------------------------------------------- -// case: micro-op/reduction/vcgadd-tail -// family: reduction -// target_ops: pto.vcgadd -// scenarios: group-reduction, tail-mask, result-placement -// NOTE: bulk-generated coverage skeleton. Parser/verifier/lowering failure is -// still a valid test conclusion in the current coverage-first phase. 
-// ----------------------------------------------------------------------------- -module attributes {pto.target_arch = "a5", pto.kernel_kind = #pto.kernel_kind} { - func.func @vcgadd_tail_kernel_2d(%arg0: !pto.ptr, %arg1: !pto.ptr) attributes {pto.kernel} { - %c0 = arith.constant 0 : index - %c1 = arith.constant 1 : index - %c64 = arith.constant 64 : index - %c1024 = arith.constant 1024 : index - %c0_i64 = arith.constant 0 : i64 - %c1_i64 = arith.constant 1 : i64 - %c32_i64 = arith.constant 32 : i64 - %c128_i64 = arith.constant 128 : i64 - %c4096_i64 = arith.constant 4096 : i64 - %c1000_i32 = arith.constant 1000 : i32 - - %ub_in = pto.castptr %c0_i64 : i64 -> !pto.ptr - %ub_out = pto.castptr %c4096_i64 : i64 -> !pto.ptr - - %false = arith.constant false - pto.mte_gm_ub %arg0, %ub_in, %c0_i64, %c128_i64 - nburst(%c32_i64, %c128_i64, %c128_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 - - pto.set_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] - pto.wait_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] - - pto.vecscope { - %_:1 = scf.for %offset = %c0 to %c1024 step %c64 iter_args(%remaining = %c1000_i32) -> (i32) { - %mask, %next_remaining = pto.plt_b32 %remaining : i32 -> !pto.mask, i32 - %vec = pto.vlds %ub_in[%offset] : !pto.ptr -> !pto.vreg<64xf32> - %out = pto.vcgadd %vec, %mask : !pto.vreg<64xf32>, !pto.mask -> !pto.vreg<64xf32> - pto.vsts %out, %ub_out[%offset], %mask : !pto.vreg<64xf32>, !pto.ptr, !pto.mask - scf.yield %next_remaining : i32 - } - } - - pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.mte_ub_gm %ub_out, %arg1, %c128_i64 - nburst(%c32_i64, %c128_i64, %c128_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64 - pto.barrier #pto.pipe - return - } -} diff --git a/test/vpto/cases/micro-op/reduction/vcgadd-tail/launch.cpp b/test/vpto/cases/micro-op/reduction/vcgadd-tail/launch.cpp deleted file mode 100644 index e35c2b363..000000000 --- a/test/vpto/cases/micro-op/reduction/vcgadd-tail/launch.cpp +++ 
/dev/null @@ -1,70 +0,0 @@ -// Copyright (c) 2026 Huawei Technologies Co., Ltd. -// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). -// Please refer to the License for details. You may not use this file except in compliance with the License. -// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -// See LICENSE in the root of the software repository for the full text of the License. - -// ----------------------------------------------------------------------------- -// case: micro-op/reduction/vcgadd-tail -// family: reduction -// target_ops: pto.vcgadd -// scenarios: group-reduction, tail-mask, result-placement -// NOTE: bulk-generated coverage skeleton. Parser/verifier/lowering failure is -// still a valid test conclusion in the current coverage-first phase. -// ----------------------------------------------------------------------------- -// --------------------------------------------------------------------------- -// PTOAS compatibility layer -// -// The upstream pto-isa headers reference some FP8/FP4 types and the -// __VEC_SCOPE__ marker that are not available on every AICore arch/toolchain -// combination (e.g. __NPU_ARCH__==2201). -// -// For our PTOAS-generated kernels we don't rely on these types today, but the -// headers still mention them in templates/static_asserts. Provide minimal -// fallbacks to keep compilation working on dav-c220. 
-// --------------------------------------------------------------------------- -#ifndef __VEC_SCOPE__ -#define __VEC_SCOPE__ -#endif - -#if defined(__CCE_AICORE__) && defined(__NPU_ARCH__) && (__NPU_ARCH__ == 2201) -typedef struct { unsigned char v; } hifloat8_t; -typedef struct { unsigned char v; } float8_e4m3_t; -typedef struct { unsigned char v; } float8_e5m2_t; -typedef struct { unsigned char v; } float8_e8m0_t; -typedef struct { unsigned char v; } float4_e1m2x2_t; -typedef struct { unsigned char v; } float4_e2m1x2_t; -#endif -#include - -// AICore printf support is gated behind `--cce-enable-print` on some -// toolchains. When enabled, include the CCE print header so `cce::printf` -// resolves in device compilation. -#if defined(__CCE_AICORE__) && defined(PTOAS_ENABLE_CCE_PRINT) -#include -#endif - -// Some PTO-ISA types are only available in the __CCE_AICORE__ compilation -// path, but `bisheng -xcce` still performs a host-side parse pass. -// Provide minimal fallbacks only when the corresponding header wasn't -// pulled in by the selected arch implementation. -#if !defined(__CCE_AICORE__) && !defined(TMRGSORT_HPP) -struct MrgSortExecutedNumList { - uint16_t mrgSortList0; - uint16_t mrgSortList1; - uint16_t mrgSortList2; - uint16_t mrgSortList3; -}; -#endif -#ifndef __CPU_SIM -#include "acl/acl.h" -#endif - -extern "C" __global__ [aicore] void vcgadd_tail_kernel_2d(__gm__ float *v1, - __gm__ float *v2); - -void LaunchVcgaddTail_kernel_2d(float *v1, float *v2, void *stream) { - vcgadd_tail_kernel_2d<<<1, nullptr, stream>>>((__gm__ float *)v1, (__gm__ float *)v2); -} diff --git a/test/vpto/cases/micro-op/reduction/vcgadd-tail/main.cpp b/test/vpto/cases/micro-op/reduction/vcgadd-tail/main.cpp deleted file mode 100644 index 29bdc23bf..000000000 --- a/test/vpto/cases/micro-op/reduction/vcgadd-tail/main.cpp +++ /dev/null @@ -1,130 +0,0 @@ -// Copyright (c) 2026 Huawei Technologies Co., Ltd. 
-// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). -// Please refer to the License for details. You may not use this file except in compliance with the License. -// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -// See LICENSE in the root of the software repository for the full text of the License. - -// ----------------------------------------------------------------------------- -// case: micro-op/reduction/vcgadd-tail -// family: reduction -// target_ops: pto.vcgadd -// scenarios: group-reduction, tail-mask, result-placement -// NOTE: bulk-generated coverage skeleton. Parser/verifier/lowering failure is -// still a valid test conclusion in the current coverage-first phase. -// ----------------------------------------------------------------------------- -/** -Copyright (c) 2025 Huawei Technologies Co., Ltd. -This program is free software, you can redistribute it and/or modify it under the terms and conditions of -CANN Open Software License Agreement Version 2.0 (the "License"). -Please refer to the License for details. You may not use this file except in compliance with the License. -THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -See LICENSE in the root of the software repository for the full text of the License. 
-*/ - -#include "test_common.h" -#include "acl/acl.h" -#include -#include -#include - -using namespace PtoTestCommon; - -#ifndef TMRGSORT_HPP -struct MrgSortExecutedNumList { - uint16_t mrgSortList0; - uint16_t mrgSortList1; - uint16_t mrgSortList2; - uint16_t mrgSortList3; -}; -#endif - -#define ACL_CHECK(expr) \ - do { \ - const aclError _ret = (expr); \ - if (_ret != ACL_SUCCESS) { \ - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", #expr, (int)_ret, __FILE__, __LINE__); \ - const char *_recent = aclGetRecentErrMsg(); \ - if (_recent != nullptr && _recent[0] != '\0') { \ - std::fprintf(stderr, "[ERROR] RecentErrMsg: %s\n", _recent); \ - } \ - rc = 1; \ - goto cleanup; \ - } \ - } while (0) - -void LaunchVcgaddTail_kernel_2d(float *v1, float *v2, void *stream); - -int main() { - size_t elemCount_v1 = 1024; - size_t fileSize_v1 = elemCount_v1 * sizeof(float); - size_t elemCount_v2 = 1024; - size_t fileSize_v2 = elemCount_v2 * sizeof(float); - float *v1Host = nullptr; - float *v1Device = nullptr; - float *v2Host = nullptr; - float *v2Device = nullptr; - - int rc = 0; - bool aclInited = false; - bool deviceSet = false; - int deviceId = 0; - aclrtStream stream = nullptr; - - ACL_CHECK(aclInit(nullptr)); - aclInited = true; - if (const char *envDevice = std::getenv("ACL_DEVICE_ID")) { - deviceId = std::atoi(envDevice); - } - ACL_CHECK(aclrtSetDevice(deviceId)); - deviceSet = true; - ACL_CHECK(aclrtCreateStream(&stream)); - - ACL_CHECK(aclrtMallocHost((void **)(&v1Host), fileSize_v1)); - ACL_CHECK(aclrtMallocHost((void **)(&v2Host), fileSize_v2)); - ACL_CHECK(aclrtMalloc((void **)&v1Device, fileSize_v1, ACL_MEM_MALLOC_HUGE_FIRST)); - ACL_CHECK(aclrtMalloc((void **)&v2Device, fileSize_v2, ACL_MEM_MALLOC_HUGE_FIRST)); - - ReadFile("./v1.bin", fileSize_v1, v1Host, fileSize_v1); - ReadFile("./v2.bin", fileSize_v2, v2Host, fileSize_v2); - ACL_CHECK(aclrtMemcpy(v1Device, fileSize_v1, v1Host, fileSize_v1, ACL_MEMCPY_HOST_TO_DEVICE)); - ACL_CHECK(aclrtMemcpy(v2Device, 
fileSize_v2, v2Host, fileSize_v2, ACL_MEMCPY_HOST_TO_DEVICE)); - LaunchVcgaddTail_kernel_2d(v1Device, v2Device, stream); - - ACL_CHECK(aclrtSynchronizeStream(stream)); - ACL_CHECK(aclrtMemcpy(v2Host, fileSize_v2, v2Device, fileSize_v2, ACL_MEMCPY_DEVICE_TO_HOST)); - - WriteFile("./v2.bin", v2Host, fileSize_v2); - -cleanup: - aclrtFree(v1Device); - aclrtFree(v2Device); - aclrtFreeHost(v1Host); - aclrtFreeHost(v2Host); - if (stream != nullptr) { - const aclError _ret = aclrtDestroyStream(stream); - if (_ret != ACL_SUCCESS) { - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", - "aclrtDestroyStream(stream)", (int)_ret, __FILE__, __LINE__); - } - stream = nullptr; - } - if (deviceSet) { - const aclError _ret = aclrtResetDevice(deviceId); - if (_ret != ACL_SUCCESS) { - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", - "aclrtResetDevice(deviceId)", (int)_ret, __FILE__, __LINE__); - } - } - if (aclInited) { - const aclError _ret = aclFinalize(); - if (_ret != ACL_SUCCESS) { - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", - "aclFinalize()", (int)_ret, __FILE__, __LINE__); - } - } - - return rc; -} diff --git a/test/vpto/cases/micro-op/reduction/vcgadd/compare.py b/test/vpto/cases/micro-op/reduction/vcgadd/compare.py deleted file mode 100755 index 2c8e5f087..000000000 --- a/test/vpto/cases/micro-op/reduction/vcgadd/compare.py +++ /dev/null @@ -1,209 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) 2026 Huawei Technologies Co., Ltd. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. 
-# See LICENSE in the root of the software repository for the full text of the License. - -# case: micro-op/reduction/vcgadd -# family: reduction -# target_ops: pto.vcgadd -# scenarios: group-reduction, result-placement -# NOTE: bulk-generated coverage skeleton. -# coding=utf-8 - -import os -import sys -import numpy as np - - -REPEAT_BYTES = 256 - - -def _ceil_div(x, y): - return (x + y - 1) // y - - -def _packed_pred_storage_bytes(logical_elems, src_elem_bytes): - logical_elems = int(logical_elems) - src_elem_bytes = int(src_elem_bytes) - if logical_elems <= 0: - raise ValueError(f"logical_elems must be > 0, got {logical_elems}") - if src_elem_bytes not in (1, 2, 4): - raise ValueError(f"unsupported packed predicate source size: {src_elem_bytes}") - - repeat_elems = REPEAT_BYTES // src_elem_bytes - if src_elem_bytes == 4: - repeat_times = _ceil_div(logical_elems, repeat_elems) + 1 - loop_count = repeat_times // 2 - return loop_count * 16 - - repeat_times = _ceil_div(logical_elems, repeat_elems) - return repeat_times * (repeat_elems // 8) - - -def compare_bin(golden_path, output_path, dtype, eps): - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - dtype_np = np.dtype(dtype) - golden = np.fromfile(golden_path, dtype=dtype_np) - output = np.fromfile(output_path, dtype=dtype_np) - if golden.shape != output.shape: - print(f"[ERROR] Shape mismatch: {golden_path} {golden.shape} vs {output_path} {output.shape}") - return False - if not np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True): - if golden.size: - if np.issubdtype(dtype_np, np.floating): - g = golden.astype(np.float64, copy=False) - o = output.astype(np.float64, copy=False) - elif np.issubdtype(dtype_np, np.integer) or np.issubdtype(dtype_np, np.unsignedinteger): - g = golden.astype(np.int64, copy=False) - o = output.astype(np.int64, copy=False) 
- else: - g = golden.astype(np.float64, copy=False) - o = output.astype(np.float64, copy=False) - abs_diff = np.abs(g - o) - idx = int(np.argmax(abs_diff)) - diff = float(abs_diff[idx]) - print( - f"[ERROR] Mismatch: {golden_path} vs {output_path}, max diff={diff} at idx={idx} " - f"(golden={g[idx]}, out={o[idx]}, dtype={dtype_np})" - ) - else: - print(f"[ERROR] Mismatch: {golden_path} vs {output_path}, empty buffers, dtype={dtype_np}") - return False - return True - - -def compare_bin_prefix(golden_path, output_path, dtype, eps, count): - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - try: - count = int(count) - except Exception: - print(f"[ERROR] Invalid prefix count: {count}") - return False - if count <= 0: - print(f"[ERROR] Invalid prefix count: {count}") - return False - - dtype_np = np.dtype(dtype) - golden = np.fromfile(golden_path, dtype=dtype_np, count=count) - output = np.fromfile(output_path, dtype=dtype_np, count=count) - - if golden.size != count or output.size != count: - print( - f"[ERROR] Prefix read too small: need={count} elems, " - f"golden={golden.size}, out={output.size}" - ) - return False - - if not np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True): - if golden.size: - if np.issubdtype(dtype_np, np.floating): - g = golden.astype(np.float64, copy=False) - o = output.astype(np.float64, copy=False) - elif np.issubdtype(dtype_np, np.integer) or np.issubdtype(dtype_np, np.unsignedinteger): - g = golden.astype(np.int64, copy=False) - o = output.astype(np.int64, copy=False) - else: - g = golden.astype(np.float64, copy=False) - o = output.astype(np.float64, copy=False) - abs_diff = np.abs(g - o) - idx = int(np.argmax(abs_diff)) - diff = float(abs_diff[idx]) - print( - f"[ERROR] Mismatch (prefix): {golden_path} vs {output_path}, max diff={diff} at idx={idx} " - f"(golden={g[idx]}, 
out={o[idx]}, dtype={dtype_np}, count={count})" - ) - else: - print(f"[ERROR] Mismatch (prefix): {golden_path} vs {output_path}, empty buffers, dtype={dtype_np}") - return False - return True - - -def compare_packed_pred_mask(golden_path, output_path, logical_elems, src_elem_bytes): - """ - Compare outputs of pto.tcmp / pto.tcmps. - - PTO-ISA stores packed predicate results as a linear PK byte stream via - `psts`, with the exact written prefix length determined by the typed - TCMP/TCMPS repeat schedule. Compare only that semantic prefix. - """ - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - try: - logical_elems = int(logical_elems) - src_elem_bytes = int(src_elem_bytes) - except Exception: - print( - "[ERROR] Invalid packed mask compare arguments: " - f"logical_elems={logical_elems} src_elem_bytes={src_elem_bytes}" - ) - return False - if logical_elems <= 0 or src_elem_bytes <= 0: - print( - "[ERROR] Invalid packed mask compare arguments: " - f"logical_elems={logical_elems} src_elem_bytes={src_elem_bytes}" - ) - return False - - golden = np.fromfile(golden_path, dtype=np.uint8) - output = np.fromfile(output_path, dtype=np.uint8) - try: - prefix_bytes = _packed_pred_storage_bytes(logical_elems, src_elem_bytes) - except ValueError as exc: - print(f"[ERROR] {exc}") - return False - - if golden.size < prefix_bytes or output.size < prefix_bytes: - print( - f"[ERROR] Packed mask buffer too small: need={prefix_bytes} bytes, " - f"golden={golden.size}, out={output.size}" - ) - return False - - golden_sel = golden[:prefix_bytes] - output_sel = output[:prefix_bytes] - - if not np.array_equal(golden_sel, output_sel): - diff = np.nonzero(golden_sel != output_sel)[0] - idx = int(diff[0]) if diff.size else 0 - print( - f"[ERROR] Mismatch (packed mask): {golden_path} vs {output_path}, first diff at idx={idx} " - 
f"(golden={int(golden_sel[idx])}, out={int(output_sel[idx])}, " - f"logical_elems={logical_elems}, src_elem_bytes={src_elem_bytes}, prefix_bytes={prefix_bytes})" - ) - return False - return True - - -def main(): - strict = os.getenv("COMPARE_STRICT", "1") != "0" - ok = True - ok = compare_bin("golden_v2.bin", "v2.bin", np.float32, 0.0001) and ok - if not ok: - if strict: - print("[ERROR] compare failed") - sys.exit(2) - print("[WARN] compare failed (non-gating)") - return - print("[INFO] compare passed") - - -if __name__ == "__main__": - main() diff --git a/test/vpto/cases/micro-op/reduction/vcgadd/golden.py b/test/vpto/cases/micro-op/reduction/vcgadd/golden.py deleted file mode 100755 index efa021477..000000000 --- a/test/vpto/cases/micro-op/reduction/vcgadd/golden.py +++ /dev/null @@ -1,53 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) 2026 Huawei Technologies Co., Ltd. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. 
- -# coding=utf-8 - -import argparse -from pathlib import Path - -import numpy as np - - -ROWS = 32 -COLS = 32 -SEED = 19 -LANES = 64 -def generate(output_dir: Path, seed: int) -> None: - rng = np.random.default_rng(seed) - v1 = rng.uniform(-8.0, 8.0, size=(ROWS, COLS)).astype(np.float32) - - v2 = np.zeros((ROWS, COLS), dtype=np.float32) - golden_v2 = np.zeros((ROWS, COLS), dtype=np.float32) - flat_in = v1.reshape(-1) - flat_out = golden_v2.reshape(-1) - group_elems = 8 - for offset in range(0, flat_in.size, LANES): - chunk = flat_in[offset:offset + LANES] - for gi, group in enumerate(range(0, chunk.size, group_elems)): - # VCGADD writes one reduced value per 32B block continuously to low lanes. - flat_out[offset + gi] = np.sum(chunk[group:group + group_elems], dtype=np.float32) - - - output_dir.mkdir(parents=True, exist_ok=True) - v1.reshape(-1).tofile(output_dir / "v1.bin") - v2.reshape(-1).tofile(output_dir / "v2.bin") - golden_v2.reshape(-1).tofile(output_dir / "golden_v2.bin") - - -def main() -> None: - parser = argparse.ArgumentParser() - parser.add_argument("--output-dir", type=Path, default=Path(".")) - parser.add_argument("--seed", type=int, default=SEED) - args = parser.parse_args() - generate(args.output_dir, args.seed) - - -if __name__ == "__main__": - main() diff --git a/test/vpto/cases/micro-op/reduction/vcgadd/kernel.pto b/test/vpto/cases/micro-op/reduction/vcgadd/kernel.pto deleted file mode 100644 index 72c89af37..000000000 --- a/test/vpto/cases/micro-op/reduction/vcgadd/kernel.pto +++ /dev/null @@ -1,51 +0,0 @@ -// ----------------------------------------------------------------------------- -// case: micro-op/reduction/vcgadd -// family: reduction -// target_ops: pto.vcgadd -// scenarios: group-reduction, result-placement -// NOTE: bulk-generated coverage skeleton. Parser/verifier/lowering failure is -// still a valid test conclusion in the current coverage-first phase. 
-// ----------------------------------------------------------------------------- -module attributes {pto.target_arch = "a5", pto.kernel_kind = #pto.kernel_kind} { - func.func @vcgadd_kernel_2d(%arg0: !pto.ptr, %arg1: !pto.ptr) attributes {pto.kernel} { - %c0 = arith.constant 0 : index - %c1 = arith.constant 1 : index - %c64 = arith.constant 64 : index - %c1024 = arith.constant 1024 : index - %c0_i64 = arith.constant 0 : i64 - %c1_i64 = arith.constant 1 : i64 - %c32_i64 = arith.constant 32 : i64 - %c128_i64 = arith.constant 128 : i64 - %c4096_i64 = arith.constant 4096 : i64 - %c1024_i32 = arith.constant 1024 : i32 - - %ub_in = pto.castptr %c0_i64 : i64 -> !pto.ptr - %ub_out = pto.castptr %c4096_i64 : i64 -> !pto.ptr - - %false = arith.constant false - pto.mte_gm_ub %arg0, %ub_in, %c0_i64, %c128_i64 - nburst(%c32_i64, %c128_i64, %c128_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 - - pto.set_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] - pto.wait_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] - - pto.vecscope { - %_:1 = scf.for %offset = %c0 to %c1024 step %c64 iter_args(%remaining = %c1024_i32) -> (i32) { - %mask, %next_remaining = pto.plt_b32 %remaining : i32 -> !pto.mask, i32 - %vec = pto.vlds %ub_in[%offset] : !pto.ptr -> !pto.vreg<64xf32> - %out = pto.vcgadd %vec, %mask : !pto.vreg<64xf32>, !pto.mask -> !pto.vreg<64xf32> - pto.vsts %out, %ub_out[%offset], %mask : !pto.vreg<64xf32>, !pto.ptr, !pto.mask - scf.yield %next_remaining : i32 - } - } - - pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.mte_ub_gm %ub_out, %arg1, %c128_i64 - nburst(%c32_i64, %c128_i64, %c128_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64 - pto.barrier #pto.pipe - return - } -} diff --git a/test/vpto/cases/micro-op/reduction/vcgadd/launch.cpp b/test/vpto/cases/micro-op/reduction/vcgadd/launch.cpp deleted file mode 100644 index 16a1993e8..000000000 --- a/test/vpto/cases/micro-op/reduction/vcgadd/launch.cpp +++ /dev/null @@ -1,70 +0,0 
@@ -// Copyright (c) 2026 Huawei Technologies Co., Ltd. -// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). -// Please refer to the License for details. You may not use this file except in compliance with the License. -// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -// See LICENSE in the root of the software repository for the full text of the License. - -// ----------------------------------------------------------------------------- -// case: micro-op/reduction/vcgadd -// family: reduction -// target_ops: pto.vcgadd -// scenarios: group-reduction, result-placement -// NOTE: bulk-generated coverage skeleton. Parser/verifier/lowering failure is -// still a valid test conclusion in the current coverage-first phase. -// ----------------------------------------------------------------------------- -// --------------------------------------------------------------------------- -// PTOAS compatibility layer -// -// The upstream pto-isa headers reference some FP8/FP4 types and the -// __VEC_SCOPE__ marker that are not available on every AICore arch/toolchain -// combination (e.g. __NPU_ARCH__==2201). -// -// For our PTOAS-generated kernels we don't rely on these types today, but the -// headers still mention them in templates/static_asserts. Provide minimal -// fallbacks to keep compilation working on dav-c220. 
-// --------------------------------------------------------------------------- -#ifndef __VEC_SCOPE__ -#define __VEC_SCOPE__ -#endif - -#if defined(__CCE_AICORE__) && defined(__NPU_ARCH__) && (__NPU_ARCH__ == 2201) -typedef struct { unsigned char v; } hifloat8_t; -typedef struct { unsigned char v; } float8_e4m3_t; -typedef struct { unsigned char v; } float8_e5m2_t; -typedef struct { unsigned char v; } float8_e8m0_t; -typedef struct { unsigned char v; } float4_e1m2x2_t; -typedef struct { unsigned char v; } float4_e2m1x2_t; -#endif -#include - -// AICore printf support is gated behind `--cce-enable-print` on some -// toolchains. When enabled, include the CCE print header so `cce::printf` -// resolves in device compilation. -#if defined(__CCE_AICORE__) && defined(PTOAS_ENABLE_CCE_PRINT) -#include -#endif - -// Some PTO-ISA types are only available in the __CCE_AICORE__ compilation -// path, but `bisheng -xcce` still performs a host-side parse pass. -// Provide minimal fallbacks only when the corresponding header wasn't -// pulled in by the selected arch implementation. -#if !defined(__CCE_AICORE__) && !defined(TMRGSORT_HPP) -struct MrgSortExecutedNumList { - uint16_t mrgSortList0; - uint16_t mrgSortList1; - uint16_t mrgSortList2; - uint16_t mrgSortList3; -}; -#endif -#ifndef __CPU_SIM -#include "acl/acl.h" -#endif - -extern "C" __global__ [aicore] void vcgadd_kernel_2d(__gm__ float *v1, - __gm__ float *v2); - -void LaunchVcgadd_kernel_2d(float *v1, float *v2, void *stream) { - vcgadd_kernel_2d<<<1, nullptr, stream>>>((__gm__ float *)v1, (__gm__ float *)v2); -} diff --git a/test/vpto/cases/micro-op/reduction/vcgadd/main.cpp b/test/vpto/cases/micro-op/reduction/vcgadd/main.cpp deleted file mode 100644 index 712f0755a..000000000 --- a/test/vpto/cases/micro-op/reduction/vcgadd/main.cpp +++ /dev/null @@ -1,130 +0,0 @@ -// Copyright (c) 2026 Huawei Technologies Co., Ltd. 
-// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). -// Please refer to the License for details. You may not use this file except in compliance with the License. -// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -// See LICENSE in the root of the software repository for the full text of the License. - -// ----------------------------------------------------------------------------- -// case: micro-op/reduction/vcgadd -// family: reduction -// target_ops: pto.vcgadd -// scenarios: group-reduction, result-placement -// NOTE: bulk-generated coverage skeleton. Parser/verifier/lowering failure is -// still a valid test conclusion in the current coverage-first phase. -// ----------------------------------------------------------------------------- -/** -Copyright (c) 2025 Huawei Technologies Co., Ltd. -This program is free software, you can redistribute it and/or modify it under the terms and conditions of -CANN Open Software License Agreement Version 2.0 (the "License"). -Please refer to the License for details. You may not use this file except in compliance with the License. -THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -See LICENSE in the root of the software repository for the full text of the License. 
-*/ - -#include "test_common.h" -#include "acl/acl.h" -#include -#include -#include - -using namespace PtoTestCommon; - -#ifndef TMRGSORT_HPP -struct MrgSortExecutedNumList { - uint16_t mrgSortList0; - uint16_t mrgSortList1; - uint16_t mrgSortList2; - uint16_t mrgSortList3; -}; -#endif - -#define ACL_CHECK(expr) \ - do { \ - const aclError _ret = (expr); \ - if (_ret != ACL_SUCCESS) { \ - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", #expr, (int)_ret, __FILE__, __LINE__); \ - const char *_recent = aclGetRecentErrMsg(); \ - if (_recent != nullptr && _recent[0] != '\0') { \ - std::fprintf(stderr, "[ERROR] RecentErrMsg: %s\n", _recent); \ - } \ - rc = 1; \ - goto cleanup; \ - } \ - } while (0) - -void LaunchVcgadd_kernel_2d(float *v1, float *v2, void *stream); - -int main() { - size_t elemCount_v1 = 1024; - size_t fileSize_v1 = elemCount_v1 * sizeof(float); - size_t elemCount_v2 = 1024; - size_t fileSize_v2 = elemCount_v2 * sizeof(float); - float *v1Host = nullptr; - float *v1Device = nullptr; - float *v2Host = nullptr; - float *v2Device = nullptr; - - int rc = 0; - bool aclInited = false; - bool deviceSet = false; - int deviceId = 0; - aclrtStream stream = nullptr; - - ACL_CHECK(aclInit(nullptr)); - aclInited = true; - if (const char *envDevice = std::getenv("ACL_DEVICE_ID")) { - deviceId = std::atoi(envDevice); - } - ACL_CHECK(aclrtSetDevice(deviceId)); - deviceSet = true; - ACL_CHECK(aclrtCreateStream(&stream)); - - ACL_CHECK(aclrtMallocHost((void **)(&v1Host), fileSize_v1)); - ACL_CHECK(aclrtMallocHost((void **)(&v2Host), fileSize_v2)); - ACL_CHECK(aclrtMalloc((void **)&v1Device, fileSize_v1, ACL_MEM_MALLOC_HUGE_FIRST)); - ACL_CHECK(aclrtMalloc((void **)&v2Device, fileSize_v2, ACL_MEM_MALLOC_HUGE_FIRST)); - - ReadFile("./v1.bin", fileSize_v1, v1Host, fileSize_v1); - ReadFile("./v2.bin", fileSize_v2, v2Host, fileSize_v2); - ACL_CHECK(aclrtMemcpy(v1Device, fileSize_v1, v1Host, fileSize_v1, ACL_MEMCPY_HOST_TO_DEVICE)); - ACL_CHECK(aclrtMemcpy(v2Device, 
fileSize_v2, v2Host, fileSize_v2, ACL_MEMCPY_HOST_TO_DEVICE)); - LaunchVcgadd_kernel_2d(v1Device, v2Device, stream); - - ACL_CHECK(aclrtSynchronizeStream(stream)); - ACL_CHECK(aclrtMemcpy(v2Host, fileSize_v2, v2Device, fileSize_v2, ACL_MEMCPY_DEVICE_TO_HOST)); - - WriteFile("./v2.bin", v2Host, fileSize_v2); - -cleanup: - aclrtFree(v1Device); - aclrtFree(v2Device); - aclrtFreeHost(v1Host); - aclrtFreeHost(v2Host); - if (stream != nullptr) { - const aclError _ret = aclrtDestroyStream(stream); - if (_ret != ACL_SUCCESS) { - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", - "aclrtDestroyStream(stream)", (int)_ret, __FILE__, __LINE__); - } - stream = nullptr; - } - if (deviceSet) { - const aclError _ret = aclrtResetDevice(deviceId); - if (_ret != ACL_SUCCESS) { - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", - "aclrtResetDevice(deviceId)", (int)_ret, __FILE__, __LINE__); - } - } - if (aclInited) { - const aclError _ret = aclFinalize(); - if (_ret != ACL_SUCCESS) { - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", - "aclFinalize()", (int)_ret, __FILE__, __LINE__); - } - } - - return rc; -} diff --git a/test/vpto/cases/micro-op/reduction/vcgmax-tie/compare.py b/test/vpto/cases/micro-op/reduction/vcgmax-tie/compare.py deleted file mode 100755 index a4a5c50c3..000000000 --- a/test/vpto/cases/micro-op/reduction/vcgmax-tie/compare.py +++ /dev/null @@ -1,209 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) 2026 Huawei Technologies Co., Ltd. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. 
-# See LICENSE in the root of the software repository for the full text of the License. - -# case: micro-op/reduction/vcgmax-tie -# family: reduction -# target_ops: pto.vcgmax -# scenarios: group-reduction, result-placement -# NOTE: bulk-generated coverage skeleton. -# coding=utf-8 - -import os -import sys -import numpy as np - - -REPEAT_BYTES = 256 - - -def _ceil_div(x, y): - return (x + y - 1) // y - - -def _packed_pred_storage_bytes(logical_elems, src_elem_bytes): - logical_elems = int(logical_elems) - src_elem_bytes = int(src_elem_bytes) - if logical_elems <= 0: - raise ValueError(f"logical_elems must be > 0, got {logical_elems}") - if src_elem_bytes not in (1, 2, 4): - raise ValueError(f"unsupported packed predicate source size: {src_elem_bytes}") - - repeat_elems = REPEAT_BYTES // src_elem_bytes - if src_elem_bytes == 4: - repeat_times = _ceil_div(logical_elems, repeat_elems) + 1 - loop_count = repeat_times // 2 - return loop_count * 16 - - repeat_times = _ceil_div(logical_elems, repeat_elems) - return repeat_times * (repeat_elems // 8) - - -def compare_bin(golden_path, output_path, dtype, eps): - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - dtype_np = np.dtype(dtype) - golden = np.fromfile(golden_path, dtype=dtype_np) - output = np.fromfile(output_path, dtype=dtype_np) - if golden.shape != output.shape: - print(f"[ERROR] Shape mismatch: {golden_path} {golden.shape} vs {output_path} {output.shape}") - return False - if not np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True): - if golden.size: - if np.issubdtype(dtype_np, np.floating): - g = golden.astype(np.float64, copy=False) - o = output.astype(np.float64, copy=False) - elif np.issubdtype(dtype_np, np.integer) or np.issubdtype(dtype_np, np.unsignedinteger): - g = golden.astype(np.int64, copy=False) - o = output.astype(np.int64, 
copy=False) - else: - g = golden.astype(np.float64, copy=False) - o = output.astype(np.float64, copy=False) - abs_diff = np.abs(g - o) - idx = int(np.argmax(abs_diff)) - diff = float(abs_diff[idx]) - print( - f"[ERROR] Mismatch: {golden_path} vs {output_path}, max diff={diff} at idx={idx} " - f"(golden={g[idx]}, out={o[idx]}, dtype={dtype_np})" - ) - else: - print(f"[ERROR] Mismatch: {golden_path} vs {output_path}, empty buffers, dtype={dtype_np}") - return False - return True - - -def compare_bin_prefix(golden_path, output_path, dtype, eps, count): - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - try: - count = int(count) - except Exception: - print(f"[ERROR] Invalid prefix count: {count}") - return False - if count <= 0: - print(f"[ERROR] Invalid prefix count: {count}") - return False - - dtype_np = np.dtype(dtype) - golden = np.fromfile(golden_path, dtype=dtype_np, count=count) - output = np.fromfile(output_path, dtype=dtype_np, count=count) - - if golden.size != count or output.size != count: - print( - f"[ERROR] Prefix read too small: need={count} elems, " - f"golden={golden.size}, out={output.size}" - ) - return False - - if not np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True): - if golden.size: - if np.issubdtype(dtype_np, np.floating): - g = golden.astype(np.float64, copy=False) - o = output.astype(np.float64, copy=False) - elif np.issubdtype(dtype_np, np.integer) or np.issubdtype(dtype_np, np.unsignedinteger): - g = golden.astype(np.int64, copy=False) - o = output.astype(np.int64, copy=False) - else: - g = golden.astype(np.float64, copy=False) - o = output.astype(np.float64, copy=False) - abs_diff = np.abs(g - o) - idx = int(np.argmax(abs_diff)) - diff = float(abs_diff[idx]) - print( - f"[ERROR] Mismatch (prefix): {golden_path} vs {output_path}, max diff={diff} at idx={idx} " - 
f"(golden={g[idx]}, out={o[idx]}, dtype={dtype_np}, count={count})" - ) - else: - print(f"[ERROR] Mismatch (prefix): {golden_path} vs {output_path}, empty buffers, dtype={dtype_np}") - return False - return True - - -def compare_packed_pred_mask(golden_path, output_path, logical_elems, src_elem_bytes): - """ - Compare outputs of pto.tcmp / pto.tcmps. - - PTO-ISA stores packed predicate results as a linear PK byte stream via - `psts`, with the exact written prefix length determined by the typed - TCMP/TCMPS repeat schedule. Compare only that semantic prefix. - """ - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - try: - logical_elems = int(logical_elems) - src_elem_bytes = int(src_elem_bytes) - except Exception: - print( - "[ERROR] Invalid packed mask compare arguments: " - f"logical_elems={logical_elems} src_elem_bytes={src_elem_bytes}" - ) - return False - if logical_elems <= 0 or src_elem_bytes <= 0: - print( - "[ERROR] Invalid packed mask compare arguments: " - f"logical_elems={logical_elems} src_elem_bytes={src_elem_bytes}" - ) - return False - - golden = np.fromfile(golden_path, dtype=np.uint8) - output = np.fromfile(output_path, dtype=np.uint8) - try: - prefix_bytes = _packed_pred_storage_bytes(logical_elems, src_elem_bytes) - except ValueError as exc: - print(f"[ERROR] {exc}") - return False - - if golden.size < prefix_bytes or output.size < prefix_bytes: - print( - f"[ERROR] Packed mask buffer too small: need={prefix_bytes} bytes, " - f"golden={golden.size}, out={output.size}" - ) - return False - - golden_sel = golden[:prefix_bytes] - output_sel = output[:prefix_bytes] - - if not np.array_equal(golden_sel, output_sel): - diff = np.nonzero(golden_sel != output_sel)[0] - idx = int(diff[0]) if diff.size else 0 - print( - f"[ERROR] Mismatch (packed mask): {golden_path} vs {output_path}, first diff at 
idx={idx} " - f"(golden={int(golden_sel[idx])}, out={int(output_sel[idx])}, " - f"logical_elems={logical_elems}, src_elem_bytes={src_elem_bytes}, prefix_bytes={prefix_bytes})" - ) - return False - return True - - -def main(): - strict = os.getenv("COMPARE_STRICT", "1") != "0" - ok = True - ok = compare_bin("golden_v2.bin", "v2.bin", np.float32, 0.0001) and ok - if not ok: - if strict: - print("[ERROR] compare failed") - sys.exit(2) - print("[WARN] compare failed (non-gating)") - return - print("[INFO] compare passed") - - -if __name__ == "__main__": - main() diff --git a/test/vpto/cases/micro-op/reduction/vcgmax-tie/golden.py b/test/vpto/cases/micro-op/reduction/vcgmax-tie/golden.py deleted file mode 100755 index a4d414312..000000000 --- a/test/vpto/cases/micro-op/reduction/vcgmax-tie/golden.py +++ /dev/null @@ -1,58 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) 2026 Huawei Technologies Co., Ltd. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. 
- -# coding=utf-8 - -import argparse -from pathlib import Path - -import numpy as np - - -ROWS = 32 -COLS = 32 -SEED = 19 -LANES = 64 -def generate(output_dir: Path, seed: int) -> None: - rng = np.random.default_rng(seed) - v1 = rng.uniform(-8.0, 8.0, size=(ROWS, COLS)).astype(np.float32) - flat_seed = v1.reshape(-1) - for offset in range(0, flat_seed.size, LANES): - for group in range(0, LANES, 8): - base = offset + group - flat_seed[base:base + 8] = np.array([7.0, 7.0, -3.0, 1.0, 0.5, -2.0, 4.0, 6.0], dtype=np.float32) - - v2 = np.zeros((ROWS, COLS), dtype=np.float32) - golden_v2 = np.zeros((ROWS, COLS), dtype=np.float32) - flat_in = v1.reshape(-1) - flat_out = golden_v2.reshape(-1) - group_elems = 8 - for offset in range(0, flat_in.size, LANES): - chunk = flat_in[offset:offset + LANES] - for gi, group in enumerate(range(0, chunk.size, group_elems)): - # VCGMAX writes one reduced value per 32B block continuously to low lanes. - flat_out[offset + gi] = np.max(chunk[group:group + group_elems]) - - - output_dir.mkdir(parents=True, exist_ok=True) - v1.reshape(-1).tofile(output_dir / "v1.bin") - v2.reshape(-1).tofile(output_dir / "v2.bin") - golden_v2.reshape(-1).tofile(output_dir / "golden_v2.bin") - - -def main() -> None: - parser = argparse.ArgumentParser() - parser.add_argument("--output-dir", type=Path, default=Path(".")) - parser.add_argument("--seed", type=int, default=SEED) - args = parser.parse_args() - generate(args.output_dir, args.seed) - - -if __name__ == "__main__": - main() diff --git a/test/vpto/cases/micro-op/reduction/vcgmax-tie/kernel.pto b/test/vpto/cases/micro-op/reduction/vcgmax-tie/kernel.pto deleted file mode 100644 index cffd43031..000000000 --- a/test/vpto/cases/micro-op/reduction/vcgmax-tie/kernel.pto +++ /dev/null @@ -1,51 +0,0 @@ -// ----------------------------------------------------------------------------- -// case: micro-op/reduction/vcgmax-tie -// family: reduction -// target_ops: pto.vcgmax -// scenarios: group-reduction, 
result-placement -// NOTE: bulk-generated coverage skeleton. Parser/verifier/lowering failure is -// still a valid test conclusion in the current coverage-first phase. -// ----------------------------------------------------------------------------- -module attributes {pto.target_arch = "a5", pto.kernel_kind = #pto.kernel_kind} { - func.func @vcgmax_tie_kernel_2d(%arg0: !pto.ptr, %arg1: !pto.ptr) attributes {pto.kernel} { - %c0 = arith.constant 0 : index - %c1 = arith.constant 1 : index - %c64 = arith.constant 64 : index - %c1024 = arith.constant 1024 : index - %c0_i64 = arith.constant 0 : i64 - %c1_i64 = arith.constant 1 : i64 - %c32_i64 = arith.constant 32 : i64 - %c128_i64 = arith.constant 128 : i64 - %c4096_i64 = arith.constant 4096 : i64 - %c1024_i32 = arith.constant 1024 : i32 - - %ub_in = pto.castptr %c0_i64 : i64 -> !pto.ptr - %ub_out = pto.castptr %c4096_i64 : i64 -> !pto.ptr - - %false = arith.constant false - pto.mte_gm_ub %arg0, %ub_in, %c0_i64, %c128_i64 - nburst(%c32_i64, %c128_i64, %c128_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 - - pto.set_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] - pto.wait_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] - - pto.vecscope { - %_:1 = scf.for %offset = %c0 to %c1024 step %c64 iter_args(%remaining = %c1024_i32) -> (i32) { - %mask, %next_remaining = pto.plt_b32 %remaining : i32 -> !pto.mask, i32 - %vec = pto.vlds %ub_in[%offset] : !pto.ptr -> !pto.vreg<64xf32> - %out = pto.vcgmax %vec, %mask : !pto.vreg<64xf32>, !pto.mask -> !pto.vreg<64xf32> - pto.vsts %out, %ub_out[%offset], %mask : !pto.vreg<64xf32>, !pto.ptr, !pto.mask - scf.yield %next_remaining : i32 - } - } - - pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.mte_ub_gm %ub_out, %arg1, %c128_i64 - nburst(%c32_i64, %c128_i64, %c128_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64 - pto.barrier #pto.pipe - return - } -} diff --git a/test/vpto/cases/micro-op/reduction/vcgmax-tie/launch.cpp 
b/test/vpto/cases/micro-op/reduction/vcgmax-tie/launch.cpp deleted file mode 100644 index 35e5a63b3..000000000 --- a/test/vpto/cases/micro-op/reduction/vcgmax-tie/launch.cpp +++ /dev/null @@ -1,70 +0,0 @@ -// Copyright (c) 2026 Huawei Technologies Co., Ltd. -// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). -// Please refer to the License for details. You may not use this file except in compliance with the License. -// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -// See LICENSE in the root of the software repository for the full text of the License. - -// ----------------------------------------------------------------------------- -// case: micro-op/reduction/vcgmax-tie -// family: reduction -// target_ops: pto.vcgmax -// scenarios: group-reduction, result-placement -// NOTE: bulk-generated coverage skeleton. Parser/verifier/lowering failure is -// still a valid test conclusion in the current coverage-first phase. -// ----------------------------------------------------------------------------- -// --------------------------------------------------------------------------- -// PTOAS compatibility layer -// -// The upstream pto-isa headers reference some FP8/FP4 types and the -// __VEC_SCOPE__ marker that are not available on every AICore arch/toolchain -// combination (e.g. __NPU_ARCH__==2201). -// -// For our PTOAS-generated kernels we don't rely on these types today, but the -// headers still mention them in templates/static_asserts. Provide minimal -// fallbacks to keep compilation working on dav-c220. 
-// --------------------------------------------------------------------------- -#ifndef __VEC_SCOPE__ -#define __VEC_SCOPE__ -#endif - -#if defined(__CCE_AICORE__) && defined(__NPU_ARCH__) && (__NPU_ARCH__ == 2201) -typedef struct { unsigned char v; } hifloat8_t; -typedef struct { unsigned char v; } float8_e4m3_t; -typedef struct { unsigned char v; } float8_e5m2_t; -typedef struct { unsigned char v; } float8_e8m0_t; -typedef struct { unsigned char v; } float4_e1m2x2_t; -typedef struct { unsigned char v; } float4_e2m1x2_t; -#endif -#include - -// AICore printf support is gated behind `--cce-enable-print` on some -// toolchains. When enabled, include the CCE print header so `cce::printf` -// resolves in device compilation. -#if defined(__CCE_AICORE__) && defined(PTOAS_ENABLE_CCE_PRINT) -#include -#endif - -// Some PTO-ISA types are only available in the __CCE_AICORE__ compilation -// path, but `bisheng -xcce` still performs a host-side parse pass. -// Provide minimal fallbacks only when the corresponding header wasn't -// pulled in by the selected arch implementation. -#if !defined(__CCE_AICORE__) && !defined(TMRGSORT_HPP) -struct MrgSortExecutedNumList { - uint16_t mrgSortList0; - uint16_t mrgSortList1; - uint16_t mrgSortList2; - uint16_t mrgSortList3; -}; -#endif -#ifndef __CPU_SIM -#include "acl/acl.h" -#endif - -extern "C" __global__ [aicore] void vcgmax_tie_kernel_2d(__gm__ float *v1, - __gm__ float *v2); - -void LaunchVcgmaxTie_kernel_2d(float *v1, float *v2, void *stream) { - vcgmax_tie_kernel_2d<<<1, nullptr, stream>>>((__gm__ float *)v1, (__gm__ float *)v2); -} diff --git a/test/vpto/cases/micro-op/reduction/vcgmax-tie/main.cpp b/test/vpto/cases/micro-op/reduction/vcgmax-tie/main.cpp deleted file mode 100644 index 79ff13b2e..000000000 --- a/test/vpto/cases/micro-op/reduction/vcgmax-tie/main.cpp +++ /dev/null @@ -1,130 +0,0 @@ -// Copyright (c) 2026 Huawei Technologies Co., Ltd. 
-// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). -// Please refer to the License for details. You may not use this file except in compliance with the License. -// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -// See LICENSE in the root of the software repository for the full text of the License. - -// ----------------------------------------------------------------------------- -// case: micro-op/reduction/vcgmax-tie -// family: reduction -// target_ops: pto.vcgmax -// scenarios: group-reduction, result-placement -// NOTE: bulk-generated coverage skeleton. Parser/verifier/lowering failure is -// still a valid test conclusion in the current coverage-first phase. -// ----------------------------------------------------------------------------- -/** -Copyright (c) 2025 Huawei Technologies Co., Ltd. -This program is free software, you can redistribute it and/or modify it under the terms and conditions of -CANN Open Software License Agreement Version 2.0 (the "License"). -Please refer to the License for details. You may not use this file except in compliance with the License. -THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -See LICENSE in the root of the software repository for the full text of the License. 
-*/ - -#include "test_common.h" -#include "acl/acl.h" -#include -#include -#include - -using namespace PtoTestCommon; - -#ifndef TMRGSORT_HPP -struct MrgSortExecutedNumList { - uint16_t mrgSortList0; - uint16_t mrgSortList1; - uint16_t mrgSortList2; - uint16_t mrgSortList3; -}; -#endif - -#define ACL_CHECK(expr) \ - do { \ - const aclError _ret = (expr); \ - if (_ret != ACL_SUCCESS) { \ - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", #expr, (int)_ret, __FILE__, __LINE__); \ - const char *_recent = aclGetRecentErrMsg(); \ - if (_recent != nullptr && _recent[0] != '\0') { \ - std::fprintf(stderr, "[ERROR] RecentErrMsg: %s\n", _recent); \ - } \ - rc = 1; \ - goto cleanup; \ - } \ - } while (0) - -void LaunchVcgmaxTie_kernel_2d(float *v1, float *v2, void *stream); - -int main() { - size_t elemCount_v1 = 1024; - size_t fileSize_v1 = elemCount_v1 * sizeof(float); - size_t elemCount_v2 = 1024; - size_t fileSize_v2 = elemCount_v2 * sizeof(float); - float *v1Host = nullptr; - float *v1Device = nullptr; - float *v2Host = nullptr; - float *v2Device = nullptr; - - int rc = 0; - bool aclInited = false; - bool deviceSet = false; - int deviceId = 0; - aclrtStream stream = nullptr; - - ACL_CHECK(aclInit(nullptr)); - aclInited = true; - if (const char *envDevice = std::getenv("ACL_DEVICE_ID")) { - deviceId = std::atoi(envDevice); - } - ACL_CHECK(aclrtSetDevice(deviceId)); - deviceSet = true; - ACL_CHECK(aclrtCreateStream(&stream)); - - ACL_CHECK(aclrtMallocHost((void **)(&v1Host), fileSize_v1)); - ACL_CHECK(aclrtMallocHost((void **)(&v2Host), fileSize_v2)); - ACL_CHECK(aclrtMalloc((void **)&v1Device, fileSize_v1, ACL_MEM_MALLOC_HUGE_FIRST)); - ACL_CHECK(aclrtMalloc((void **)&v2Device, fileSize_v2, ACL_MEM_MALLOC_HUGE_FIRST)); - - ReadFile("./v1.bin", fileSize_v1, v1Host, fileSize_v1); - ReadFile("./v2.bin", fileSize_v2, v2Host, fileSize_v2); - ACL_CHECK(aclrtMemcpy(v1Device, fileSize_v1, v1Host, fileSize_v1, ACL_MEMCPY_HOST_TO_DEVICE)); - ACL_CHECK(aclrtMemcpy(v2Device, 
fileSize_v2, v2Host, fileSize_v2, ACL_MEMCPY_HOST_TO_DEVICE)); - LaunchVcgmaxTie_kernel_2d(v1Device, v2Device, stream); - - ACL_CHECK(aclrtSynchronizeStream(stream)); - ACL_CHECK(aclrtMemcpy(v2Host, fileSize_v2, v2Device, fileSize_v2, ACL_MEMCPY_DEVICE_TO_HOST)); - - WriteFile("./v2.bin", v2Host, fileSize_v2); - -cleanup: - aclrtFree(v1Device); - aclrtFree(v2Device); - aclrtFreeHost(v1Host); - aclrtFreeHost(v2Host); - if (stream != nullptr) { - const aclError _ret = aclrtDestroyStream(stream); - if (_ret != ACL_SUCCESS) { - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", - "aclrtDestroyStream(stream)", (int)_ret, __FILE__, __LINE__); - } - stream = nullptr; - } - if (deviceSet) { - const aclError _ret = aclrtResetDevice(deviceId); - if (_ret != ACL_SUCCESS) { - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", - "aclrtResetDevice(deviceId)", (int)_ret, __FILE__, __LINE__); - } - } - if (aclInited) { - const aclError _ret = aclFinalize(); - if (_ret != ACL_SUCCESS) { - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", - "aclFinalize()", (int)_ret, __FILE__, __LINE__); - } - } - - return rc; -} diff --git a/test/vpto/cases/micro-op/reduction/vcgmax/compare.py b/test/vpto/cases/micro-op/reduction/vcgmax/compare.py deleted file mode 100755 index f1f037986..000000000 --- a/test/vpto/cases/micro-op/reduction/vcgmax/compare.py +++ /dev/null @@ -1,209 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) 2026 Huawei Technologies Co., Ltd. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. 
-# See LICENSE in the root of the software repository for the full text of the License. - -# case: micro-op/reduction/vcgmax -# family: reduction -# target_ops: pto.vcgmax -# scenarios: group-reduction, result-placement -# NOTE: bulk-generated coverage skeleton. -# coding=utf-8 - -import os -import sys -import numpy as np - - -REPEAT_BYTES = 256 - - -def _ceil_div(x, y): - return (x + y - 1) // y - - -def _packed_pred_storage_bytes(logical_elems, src_elem_bytes): - logical_elems = int(logical_elems) - src_elem_bytes = int(src_elem_bytes) - if logical_elems <= 0: - raise ValueError(f"logical_elems must be > 0, got {logical_elems}") - if src_elem_bytes not in (1, 2, 4): - raise ValueError(f"unsupported packed predicate source size: {src_elem_bytes}") - - repeat_elems = REPEAT_BYTES // src_elem_bytes - if src_elem_bytes == 4: - repeat_times = _ceil_div(logical_elems, repeat_elems) + 1 - loop_count = repeat_times // 2 - return loop_count * 16 - - repeat_times = _ceil_div(logical_elems, repeat_elems) - return repeat_times * (repeat_elems // 8) - - -def compare_bin(golden_path, output_path, dtype, eps): - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - dtype_np = np.dtype(dtype) - golden = np.fromfile(golden_path, dtype=dtype_np) - output = np.fromfile(output_path, dtype=dtype_np) - if golden.shape != output.shape: - print(f"[ERROR] Shape mismatch: {golden_path} {golden.shape} vs {output_path} {output.shape}") - return False - if not np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True): - if golden.size: - if np.issubdtype(dtype_np, np.floating): - g = golden.astype(np.float64, copy=False) - o = output.astype(np.float64, copy=False) - elif np.issubdtype(dtype_np, np.integer) or np.issubdtype(dtype_np, np.unsignedinteger): - g = golden.astype(np.int64, copy=False) - o = output.astype(np.int64, copy=False) 
- else: - g = golden.astype(np.float64, copy=False) - o = output.astype(np.float64, copy=False) - abs_diff = np.abs(g - o) - idx = int(np.argmax(abs_diff)) - diff = float(abs_diff[idx]) - print( - f"[ERROR] Mismatch: {golden_path} vs {output_path}, max diff={diff} at idx={idx} " - f"(golden={g[idx]}, out={o[idx]}, dtype={dtype_np})" - ) - else: - print(f"[ERROR] Mismatch: {golden_path} vs {output_path}, empty buffers, dtype={dtype_np}") - return False - return True - - -def compare_bin_prefix(golden_path, output_path, dtype, eps, count): - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - try: - count = int(count) - except Exception: - print(f"[ERROR] Invalid prefix count: {count}") - return False - if count <= 0: - print(f"[ERROR] Invalid prefix count: {count}") - return False - - dtype_np = np.dtype(dtype) - golden = np.fromfile(golden_path, dtype=dtype_np, count=count) - output = np.fromfile(output_path, dtype=dtype_np, count=count) - - if golden.size != count or output.size != count: - print( - f"[ERROR] Prefix read too small: need={count} elems, " - f"golden={golden.size}, out={output.size}" - ) - return False - - if not np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True): - if golden.size: - if np.issubdtype(dtype_np, np.floating): - g = golden.astype(np.float64, copy=False) - o = output.astype(np.float64, copy=False) - elif np.issubdtype(dtype_np, np.integer) or np.issubdtype(dtype_np, np.unsignedinteger): - g = golden.astype(np.int64, copy=False) - o = output.astype(np.int64, copy=False) - else: - g = golden.astype(np.float64, copy=False) - o = output.astype(np.float64, copy=False) - abs_diff = np.abs(g - o) - idx = int(np.argmax(abs_diff)) - diff = float(abs_diff[idx]) - print( - f"[ERROR] Mismatch (prefix): {golden_path} vs {output_path}, max diff={diff} at idx={idx} " - f"(golden={g[idx]}, 
out={o[idx]}, dtype={dtype_np}, count={count})" - ) - else: - print(f"[ERROR] Mismatch (prefix): {golden_path} vs {output_path}, empty buffers, dtype={dtype_np}") - return False - return True - - -def compare_packed_pred_mask(golden_path, output_path, logical_elems, src_elem_bytes): - """ - Compare outputs of pto.tcmp / pto.tcmps. - - PTO-ISA stores packed predicate results as a linear PK byte stream via - `psts`, with the exact written prefix length determined by the typed - TCMP/TCMPS repeat schedule. Compare only that semantic prefix. - """ - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - try: - logical_elems = int(logical_elems) - src_elem_bytes = int(src_elem_bytes) - except Exception: - print( - "[ERROR] Invalid packed mask compare arguments: " - f"logical_elems={logical_elems} src_elem_bytes={src_elem_bytes}" - ) - return False - if logical_elems <= 0 or src_elem_bytes <= 0: - print( - "[ERROR] Invalid packed mask compare arguments: " - f"logical_elems={logical_elems} src_elem_bytes={src_elem_bytes}" - ) - return False - - golden = np.fromfile(golden_path, dtype=np.uint8) - output = np.fromfile(output_path, dtype=np.uint8) - try: - prefix_bytes = _packed_pred_storage_bytes(logical_elems, src_elem_bytes) - except ValueError as exc: - print(f"[ERROR] {exc}") - return False - - if golden.size < prefix_bytes or output.size < prefix_bytes: - print( - f"[ERROR] Packed mask buffer too small: need={prefix_bytes} bytes, " - f"golden={golden.size}, out={output.size}" - ) - return False - - golden_sel = golden[:prefix_bytes] - output_sel = output[:prefix_bytes] - - if not np.array_equal(golden_sel, output_sel): - diff = np.nonzero(golden_sel != output_sel)[0] - idx = int(diff[0]) if diff.size else 0 - print( - f"[ERROR] Mismatch (packed mask): {golden_path} vs {output_path}, first diff at idx={idx} " - 
f"(golden={int(golden_sel[idx])}, out={int(output_sel[idx])}, " - f"logical_elems={logical_elems}, src_elem_bytes={src_elem_bytes}, prefix_bytes={prefix_bytes})" - ) - return False - return True - - -def main(): - strict = os.getenv("COMPARE_STRICT", "1") != "0" - ok = True - ok = compare_bin("golden_v2.bin", "v2.bin", np.float32, 0.0001) and ok - if not ok: - if strict: - print("[ERROR] compare failed") - sys.exit(2) - print("[WARN] compare failed (non-gating)") - return - print("[INFO] compare passed") - - -if __name__ == "__main__": - main() diff --git a/test/vpto/cases/micro-op/reduction/vcgmax/golden.py b/test/vpto/cases/micro-op/reduction/vcgmax/golden.py deleted file mode 100755 index d807ff1e0..000000000 --- a/test/vpto/cases/micro-op/reduction/vcgmax/golden.py +++ /dev/null @@ -1,53 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) 2026 Huawei Technologies Co., Ltd. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. 
- -# coding=utf-8 - -import argparse -from pathlib import Path - -import numpy as np - - -ROWS = 32 -COLS = 32 -SEED = 19 -LANES = 64 -def generate(output_dir: Path, seed: int) -> None: - rng = np.random.default_rng(seed) - v1 = rng.uniform(-8.0, 8.0, size=(ROWS, COLS)).astype(np.float32) - - v2 = np.zeros((ROWS, COLS), dtype=np.float32) - golden_v2 = np.zeros((ROWS, COLS), dtype=np.float32) - flat_in = v1.reshape(-1) - flat_out = golden_v2.reshape(-1) - group_elems = 8 - for offset in range(0, flat_in.size, LANES): - chunk = flat_in[offset:offset + LANES] - for gi, group in enumerate(range(0, chunk.size, group_elems)): - # VCGMAX writes one reduced value per 32B block continuously to low lanes. - flat_out[offset + gi] = np.max(chunk[group:group + group_elems]) - - - output_dir.mkdir(parents=True, exist_ok=True) - v1.reshape(-1).tofile(output_dir / "v1.bin") - v2.reshape(-1).tofile(output_dir / "v2.bin") - golden_v2.reshape(-1).tofile(output_dir / "golden_v2.bin") - - -def main() -> None: - parser = argparse.ArgumentParser() - parser.add_argument("--output-dir", type=Path, default=Path(".")) - parser.add_argument("--seed", type=int, default=SEED) - args = parser.parse_args() - generate(args.output_dir, args.seed) - - -if __name__ == "__main__": - main() diff --git a/test/vpto/cases/micro-op/reduction/vcgmax/kernel.pto b/test/vpto/cases/micro-op/reduction/vcgmax/kernel.pto deleted file mode 100644 index 12f289720..000000000 --- a/test/vpto/cases/micro-op/reduction/vcgmax/kernel.pto +++ /dev/null @@ -1,51 +0,0 @@ -// ----------------------------------------------------------------------------- -// case: micro-op/reduction/vcgmax -// family: reduction -// target_ops: pto.vcgmax -// scenarios: group-reduction, result-placement -// NOTE: bulk-generated coverage skeleton. Parser/verifier/lowering failure is -// still a valid test conclusion in the current coverage-first phase. 
-// ----------------------------------------------------------------------------- -module attributes {pto.target_arch = "a5", pto.kernel_kind = #pto.kernel_kind} { - func.func @vcgmax_kernel_2d(%arg0: !pto.ptr, %arg1: !pto.ptr) attributes {pto.kernel} { - %c0 = arith.constant 0 : index - %c1 = arith.constant 1 : index - %c64 = arith.constant 64 : index - %c1024 = arith.constant 1024 : index - %c0_i64 = arith.constant 0 : i64 - %c1_i64 = arith.constant 1 : i64 - %c32_i64 = arith.constant 32 : i64 - %c128_i64 = arith.constant 128 : i64 - %c4096_i64 = arith.constant 4096 : i64 - %c1024_i32 = arith.constant 1024 : i32 - - %ub_in = pto.castptr %c0_i64 : i64 -> !pto.ptr - %ub_out = pto.castptr %c4096_i64 : i64 -> !pto.ptr - - %false = arith.constant false - pto.mte_gm_ub %arg0, %ub_in, %c0_i64, %c128_i64 - nburst(%c32_i64, %c128_i64, %c128_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 - - pto.set_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] - pto.wait_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] - - pto.vecscope { - %_:1 = scf.for %offset = %c0 to %c1024 step %c64 iter_args(%remaining = %c1024_i32) -> (i32) { - %mask, %next_remaining = pto.plt_b32 %remaining : i32 -> !pto.mask, i32 - %vec = pto.vlds %ub_in[%offset] : !pto.ptr -> !pto.vreg<64xf32> - %out = pto.vcgmax %vec, %mask : !pto.vreg<64xf32>, !pto.mask -> !pto.vreg<64xf32> - pto.vsts %out, %ub_out[%offset], %mask : !pto.vreg<64xf32>, !pto.ptr, !pto.mask - scf.yield %next_remaining : i32 - } - } - - pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.mte_ub_gm %ub_out, %arg1, %c128_i64 - nburst(%c32_i64, %c128_i64, %c128_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64 - pto.barrier #pto.pipe - return - } -} diff --git a/test/vpto/cases/micro-op/reduction/vcgmax/launch.cpp b/test/vpto/cases/micro-op/reduction/vcgmax/launch.cpp deleted file mode 100644 index 33855f496..000000000 --- a/test/vpto/cases/micro-op/reduction/vcgmax/launch.cpp +++ /dev/null @@ -1,70 +0,0 
@@ -// Copyright (c) 2026 Huawei Technologies Co., Ltd. -// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). -// Please refer to the License for details. You may not use this file except in compliance with the License. -// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -// See LICENSE in the root of the software repository for the full text of the License. - -// ----------------------------------------------------------------------------- -// case: micro-op/reduction/vcgmax -// family: reduction -// target_ops: pto.vcgmax -// scenarios: group-reduction, result-placement -// NOTE: bulk-generated coverage skeleton. Parser/verifier/lowering failure is -// still a valid test conclusion in the current coverage-first phase. -// ----------------------------------------------------------------------------- -// --------------------------------------------------------------------------- -// PTOAS compatibility layer -// -// The upstream pto-isa headers reference some FP8/FP4 types and the -// __VEC_SCOPE__ marker that are not available on every AICore arch/toolchain -// combination (e.g. __NPU_ARCH__==2201). -// -// For our PTOAS-generated kernels we don't rely on these types today, but the -// headers still mention them in templates/static_asserts. Provide minimal -// fallbacks to keep compilation working on dav-c220. 
-// --------------------------------------------------------------------------- -#ifndef __VEC_SCOPE__ -#define __VEC_SCOPE__ -#endif - -#if defined(__CCE_AICORE__) && defined(__NPU_ARCH__) && (__NPU_ARCH__ == 2201) -typedef struct { unsigned char v; } hifloat8_t; -typedef struct { unsigned char v; } float8_e4m3_t; -typedef struct { unsigned char v; } float8_e5m2_t; -typedef struct { unsigned char v; } float8_e8m0_t; -typedef struct { unsigned char v; } float4_e1m2x2_t; -typedef struct { unsigned char v; } float4_e2m1x2_t; -#endif -#include - -// AICore printf support is gated behind `--cce-enable-print` on some -// toolchains. When enabled, include the CCE print header so `cce::printf` -// resolves in device compilation. -#if defined(__CCE_AICORE__) && defined(PTOAS_ENABLE_CCE_PRINT) -#include -#endif - -// Some PTO-ISA types are only available in the __CCE_AICORE__ compilation -// path, but `bisheng -xcce` still performs a host-side parse pass. -// Provide minimal fallbacks only when the corresponding header wasn't -// pulled in by the selected arch implementation. -#if !defined(__CCE_AICORE__) && !defined(TMRGSORT_HPP) -struct MrgSortExecutedNumList { - uint16_t mrgSortList0; - uint16_t mrgSortList1; - uint16_t mrgSortList2; - uint16_t mrgSortList3; -}; -#endif -#ifndef __CPU_SIM -#include "acl/acl.h" -#endif - -extern "C" __global__ [aicore] void vcgmax_kernel_2d(__gm__ float *v1, - __gm__ float *v2); - -void LaunchVcgmax_kernel_2d(float *v1, float *v2, void *stream) { - vcgmax_kernel_2d<<<1, nullptr, stream>>>((__gm__ float *)v1, (__gm__ float *)v2); -} diff --git a/test/vpto/cases/micro-op/reduction/vcgmax/main.cpp b/test/vpto/cases/micro-op/reduction/vcgmax/main.cpp deleted file mode 100644 index f51aa0ebe..000000000 --- a/test/vpto/cases/micro-op/reduction/vcgmax/main.cpp +++ /dev/null @@ -1,130 +0,0 @@ -// Copyright (c) 2026 Huawei Technologies Co., Ltd. 
-// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). -// Please refer to the License for details. You may not use this file except in compliance with the License. -// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -// See LICENSE in the root of the software repository for the full text of the License. - -// ----------------------------------------------------------------------------- -// case: micro-op/reduction/vcgmax -// family: reduction -// target_ops: pto.vcgmax -// scenarios: group-reduction, result-placement -// NOTE: bulk-generated coverage skeleton. Parser/verifier/lowering failure is -// still a valid test conclusion in the current coverage-first phase. -// ----------------------------------------------------------------------------- -/** -Copyright (c) 2025 Huawei Technologies Co., Ltd. -This program is free software, you can redistribute it and/or modify it under the terms and conditions of -CANN Open Software License Agreement Version 2.0 (the "License"). -Please refer to the License for details. You may not use this file except in compliance with the License. -THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -See LICENSE in the root of the software repository for the full text of the License. 
-*/ - -#include "test_common.h" -#include "acl/acl.h" -#include -#include -#include - -using namespace PtoTestCommon; - -#ifndef TMRGSORT_HPP -struct MrgSortExecutedNumList { - uint16_t mrgSortList0; - uint16_t mrgSortList1; - uint16_t mrgSortList2; - uint16_t mrgSortList3; -}; -#endif - -#define ACL_CHECK(expr) \ - do { \ - const aclError _ret = (expr); \ - if (_ret != ACL_SUCCESS) { \ - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", #expr, (int)_ret, __FILE__, __LINE__); \ - const char *_recent = aclGetRecentErrMsg(); \ - if (_recent != nullptr && _recent[0] != '\0') { \ - std::fprintf(stderr, "[ERROR] RecentErrMsg: %s\n", _recent); \ - } \ - rc = 1; \ - goto cleanup; \ - } \ - } while (0) - -void LaunchVcgmax_kernel_2d(float *v1, float *v2, void *stream); - -int main() { - size_t elemCount_v1 = 1024; - size_t fileSize_v1 = elemCount_v1 * sizeof(float); - size_t elemCount_v2 = 1024; - size_t fileSize_v2 = elemCount_v2 * sizeof(float); - float *v1Host = nullptr; - float *v1Device = nullptr; - float *v2Host = nullptr; - float *v2Device = nullptr; - - int rc = 0; - bool aclInited = false; - bool deviceSet = false; - int deviceId = 0; - aclrtStream stream = nullptr; - - ACL_CHECK(aclInit(nullptr)); - aclInited = true; - if (const char *envDevice = std::getenv("ACL_DEVICE_ID")) { - deviceId = std::atoi(envDevice); - } - ACL_CHECK(aclrtSetDevice(deviceId)); - deviceSet = true; - ACL_CHECK(aclrtCreateStream(&stream)); - - ACL_CHECK(aclrtMallocHost((void **)(&v1Host), fileSize_v1)); - ACL_CHECK(aclrtMallocHost((void **)(&v2Host), fileSize_v2)); - ACL_CHECK(aclrtMalloc((void **)&v1Device, fileSize_v1, ACL_MEM_MALLOC_HUGE_FIRST)); - ACL_CHECK(aclrtMalloc((void **)&v2Device, fileSize_v2, ACL_MEM_MALLOC_HUGE_FIRST)); - - ReadFile("./v1.bin", fileSize_v1, v1Host, fileSize_v1); - ReadFile("./v2.bin", fileSize_v2, v2Host, fileSize_v2); - ACL_CHECK(aclrtMemcpy(v1Device, fileSize_v1, v1Host, fileSize_v1, ACL_MEMCPY_HOST_TO_DEVICE)); - ACL_CHECK(aclrtMemcpy(v2Device, 
fileSize_v2, v2Host, fileSize_v2, ACL_MEMCPY_HOST_TO_DEVICE)); - LaunchVcgmax_kernel_2d(v1Device, v2Device, stream); - - ACL_CHECK(aclrtSynchronizeStream(stream)); - ACL_CHECK(aclrtMemcpy(v2Host, fileSize_v2, v2Device, fileSize_v2, ACL_MEMCPY_DEVICE_TO_HOST)); - - WriteFile("./v2.bin", v2Host, fileSize_v2); - -cleanup: - aclrtFree(v1Device); - aclrtFree(v2Device); - aclrtFreeHost(v1Host); - aclrtFreeHost(v2Host); - if (stream != nullptr) { - const aclError _ret = aclrtDestroyStream(stream); - if (_ret != ACL_SUCCESS) { - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", - "aclrtDestroyStream(stream)", (int)_ret, __FILE__, __LINE__); - } - stream = nullptr; - } - if (deviceSet) { - const aclError _ret = aclrtResetDevice(deviceId); - if (_ret != ACL_SUCCESS) { - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", - "aclrtResetDevice(deviceId)", (int)_ret, __FILE__, __LINE__); - } - } - if (aclInited) { - const aclError _ret = aclFinalize(); - if (_ret != ACL_SUCCESS) { - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", - "aclFinalize()", (int)_ret, __FILE__, __LINE__); - } - } - - return rc; -} diff --git a/test/vpto/cases/micro-op/reduction/vcgmin-tie/compare.py b/test/vpto/cases/micro-op/reduction/vcgmin-tie/compare.py deleted file mode 100755 index 05b8ee45c..000000000 --- a/test/vpto/cases/micro-op/reduction/vcgmin-tie/compare.py +++ /dev/null @@ -1,209 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) 2026 Huawei Technologies Co., Ltd. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. 
-# See LICENSE in the root of the software repository for the full text of the License. - -# case: micro-op/reduction/vcgmin-tie -# family: reduction -# target_ops: pto.vcgmin -# scenarios: group-reduction, result-placement -# NOTE: bulk-generated coverage skeleton. -# coding=utf-8 - -import os -import sys -import numpy as np - - -REPEAT_BYTES = 256 - - -def _ceil_div(x, y): - return (x + y - 1) // y - - -def _packed_pred_storage_bytes(logical_elems, src_elem_bytes): - logical_elems = int(logical_elems) - src_elem_bytes = int(src_elem_bytes) - if logical_elems <= 0: - raise ValueError(f"logical_elems must be > 0, got {logical_elems}") - if src_elem_bytes not in (1, 2, 4): - raise ValueError(f"unsupported packed predicate source size: {src_elem_bytes}") - - repeat_elems = REPEAT_BYTES // src_elem_bytes - if src_elem_bytes == 4: - repeat_times = _ceil_div(logical_elems, repeat_elems) + 1 - loop_count = repeat_times // 2 - return loop_count * 16 - - repeat_times = _ceil_div(logical_elems, repeat_elems) - return repeat_times * (repeat_elems // 8) - - -def compare_bin(golden_path, output_path, dtype, eps): - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - dtype_np = np.dtype(dtype) - golden = np.fromfile(golden_path, dtype=dtype_np) - output = np.fromfile(output_path, dtype=dtype_np) - if golden.shape != output.shape: - print(f"[ERROR] Shape mismatch: {golden_path} {golden.shape} vs {output_path} {output.shape}") - return False - if not np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True): - if golden.size: - if np.issubdtype(dtype_np, np.floating): - g = golden.astype(np.float64, copy=False) - o = output.astype(np.float64, copy=False) - elif np.issubdtype(dtype_np, np.integer) or np.issubdtype(dtype_np, np.unsignedinteger): - g = golden.astype(np.int64, copy=False) - o = output.astype(np.int64, 
copy=False) - else: - g = golden.astype(np.float64, copy=False) - o = output.astype(np.float64, copy=False) - abs_diff = np.abs(g - o) - idx = int(np.argmax(abs_diff)) - diff = float(abs_diff[idx]) - print( - f"[ERROR] Mismatch: {golden_path} vs {output_path}, max diff={diff} at idx={idx} " - f"(golden={g[idx]}, out={o[idx]}, dtype={dtype_np})" - ) - else: - print(f"[ERROR] Mismatch: {golden_path} vs {output_path}, empty buffers, dtype={dtype_np}") - return False - return True - - -def compare_bin_prefix(golden_path, output_path, dtype, eps, count): - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - try: - count = int(count) - except Exception: - print(f"[ERROR] Invalid prefix count: {count}") - return False - if count <= 0: - print(f"[ERROR] Invalid prefix count: {count}") - return False - - dtype_np = np.dtype(dtype) - golden = np.fromfile(golden_path, dtype=dtype_np, count=count) - output = np.fromfile(output_path, dtype=dtype_np, count=count) - - if golden.size != count or output.size != count: - print( - f"[ERROR] Prefix read too small: need={count} elems, " - f"golden={golden.size}, out={output.size}" - ) - return False - - if not np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True): - if golden.size: - if np.issubdtype(dtype_np, np.floating): - g = golden.astype(np.float64, copy=False) - o = output.astype(np.float64, copy=False) - elif np.issubdtype(dtype_np, np.integer) or np.issubdtype(dtype_np, np.unsignedinteger): - g = golden.astype(np.int64, copy=False) - o = output.astype(np.int64, copy=False) - else: - g = golden.astype(np.float64, copy=False) - o = output.astype(np.float64, copy=False) - abs_diff = np.abs(g - o) - idx = int(np.argmax(abs_diff)) - diff = float(abs_diff[idx]) - print( - f"[ERROR] Mismatch (prefix): {golden_path} vs {output_path}, max diff={diff} at idx={idx} " - 
f"(golden={g[idx]}, out={o[idx]}, dtype={dtype_np}, count={count})" - ) - else: - print(f"[ERROR] Mismatch (prefix): {golden_path} vs {output_path}, empty buffers, dtype={dtype_np}") - return False - return True - - -def compare_packed_pred_mask(golden_path, output_path, logical_elems, src_elem_bytes): - """ - Compare outputs of pto.tcmp / pto.tcmps. - - PTO-ISA stores packed predicate results as a linear PK byte stream via - `psts`, with the exact written prefix length determined by the typed - TCMP/TCMPS repeat schedule. Compare only that semantic prefix. - """ - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - try: - logical_elems = int(logical_elems) - src_elem_bytes = int(src_elem_bytes) - except Exception: - print( - "[ERROR] Invalid packed mask compare arguments: " - f"logical_elems={logical_elems} src_elem_bytes={src_elem_bytes}" - ) - return False - if logical_elems <= 0 or src_elem_bytes <= 0: - print( - "[ERROR] Invalid packed mask compare arguments: " - f"logical_elems={logical_elems} src_elem_bytes={src_elem_bytes}" - ) - return False - - golden = np.fromfile(golden_path, dtype=np.uint8) - output = np.fromfile(output_path, dtype=np.uint8) - try: - prefix_bytes = _packed_pred_storage_bytes(logical_elems, src_elem_bytes) - except ValueError as exc: - print(f"[ERROR] {exc}") - return False - - if golden.size < prefix_bytes or output.size < prefix_bytes: - print( - f"[ERROR] Packed mask buffer too small: need={prefix_bytes} bytes, " - f"golden={golden.size}, out={output.size}" - ) - return False - - golden_sel = golden[:prefix_bytes] - output_sel = output[:prefix_bytes] - - if not np.array_equal(golden_sel, output_sel): - diff = np.nonzero(golden_sel != output_sel)[0] - idx = int(diff[0]) if diff.size else 0 - print( - f"[ERROR] Mismatch (packed mask): {golden_path} vs {output_path}, first diff at 
idx={idx} " - f"(golden={int(golden_sel[idx])}, out={int(output_sel[idx])}, " - f"logical_elems={logical_elems}, src_elem_bytes={src_elem_bytes}, prefix_bytes={prefix_bytes})" - ) - return False - return True - - -def main(): - strict = os.getenv("COMPARE_STRICT", "1") != "0" - ok = True - ok = compare_bin("golden_v2.bin", "v2.bin", np.float32, 0.0001) and ok - if not ok: - if strict: - print("[ERROR] compare failed") - sys.exit(2) - print("[WARN] compare failed (non-gating)") - return - print("[INFO] compare passed") - - -if __name__ == "__main__": - main() diff --git a/test/vpto/cases/micro-op/reduction/vcgmin-tie/golden.py b/test/vpto/cases/micro-op/reduction/vcgmin-tie/golden.py deleted file mode 100755 index 62a18cd0d..000000000 --- a/test/vpto/cases/micro-op/reduction/vcgmin-tie/golden.py +++ /dev/null @@ -1,58 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) 2026 Huawei Technologies Co., Ltd. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. 
- -# coding=utf-8 - -import argparse -from pathlib import Path - -import numpy as np - - -ROWS = 32 -COLS = 32 -SEED = 19 -LANES = 64 -def generate(output_dir: Path, seed: int) -> None: - rng = np.random.default_rng(seed) - v1 = rng.uniform(-8.0, 8.0, size=(ROWS, COLS)).astype(np.float32) - flat_seed = v1.reshape(-1) - for offset in range(0, flat_seed.size, LANES): - for group in range(0, LANES, 8): - base = offset + group - flat_seed[base:base + 8] = np.array([-7.0, -7.0, 3.0, -1.0, 0.5, 2.0, -4.0, -6.0], dtype=np.float32) - - v2 = np.zeros((ROWS, COLS), dtype=np.float32) - golden_v2 = np.zeros((ROWS, COLS), dtype=np.float32) - flat_in = v1.reshape(-1) - flat_out = golden_v2.reshape(-1) - group_elems = 8 - for offset in range(0, flat_in.size, LANES): - chunk = flat_in[offset:offset + LANES] - for gi, group in enumerate(range(0, chunk.size, group_elems)): - # VCGMIN writes one reduced value per 32B block continuously to low lanes. - flat_out[offset + gi] = np.min(chunk[group:group + group_elems]) - - - output_dir.mkdir(parents=True, exist_ok=True) - v1.reshape(-1).tofile(output_dir / "v1.bin") - v2.reshape(-1).tofile(output_dir / "v2.bin") - golden_v2.reshape(-1).tofile(output_dir / "golden_v2.bin") - - -def main() -> None: - parser = argparse.ArgumentParser() - parser.add_argument("--output-dir", type=Path, default=Path(".")) - parser.add_argument("--seed", type=int, default=SEED) - args = parser.parse_args() - generate(args.output_dir, args.seed) - - -if __name__ == "__main__": - main() diff --git a/test/vpto/cases/micro-op/reduction/vcgmin-tie/kernel.pto b/test/vpto/cases/micro-op/reduction/vcgmin-tie/kernel.pto deleted file mode 100644 index af0430119..000000000 --- a/test/vpto/cases/micro-op/reduction/vcgmin-tie/kernel.pto +++ /dev/null @@ -1,51 +0,0 @@ -// ----------------------------------------------------------------------------- -// case: micro-op/reduction/vcgmin-tie -// family: reduction -// target_ops: pto.vcgmin -// scenarios: group-reduction, 
result-placement -// NOTE: bulk-generated coverage skeleton. Parser/verifier/lowering failure is -// still a valid test conclusion in the current coverage-first phase. -// ----------------------------------------------------------------------------- -module attributes {pto.target_arch = "a5", pto.kernel_kind = #pto.kernel_kind} { - func.func @vcgmin_tie_kernel_2d(%arg0: !pto.ptr, %arg1: !pto.ptr) attributes {pto.kernel} { - %c0 = arith.constant 0 : index - %c1 = arith.constant 1 : index - %c64 = arith.constant 64 : index - %c1024 = arith.constant 1024 : index - %c0_i64 = arith.constant 0 : i64 - %c1_i64 = arith.constant 1 : i64 - %c32_i64 = arith.constant 32 : i64 - %c128_i64 = arith.constant 128 : i64 - %c4096_i64 = arith.constant 4096 : i64 - %c1024_i32 = arith.constant 1024 : i32 - - %ub_in = pto.castptr %c0_i64 : i64 -> !pto.ptr - %ub_out = pto.castptr %c4096_i64 : i64 -> !pto.ptr - - %false = arith.constant false - pto.mte_gm_ub %arg0, %ub_in, %c0_i64, %c128_i64 - nburst(%c32_i64, %c128_i64, %c128_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 - - pto.set_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] - pto.wait_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] - - pto.vecscope { - %_:1 = scf.for %offset = %c0 to %c1024 step %c64 iter_args(%remaining = %c1024_i32) -> (i32) { - %mask, %next_remaining = pto.plt_b32 %remaining : i32 -> !pto.mask, i32 - %vec = pto.vlds %ub_in[%offset] : !pto.ptr -> !pto.vreg<64xf32> - %out = pto.vcgmin %vec, %mask : !pto.vreg<64xf32>, !pto.mask -> !pto.vreg<64xf32> - pto.vsts %out, %ub_out[%offset], %mask : !pto.vreg<64xf32>, !pto.ptr, !pto.mask - scf.yield %next_remaining : i32 - } - } - - pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.mte_ub_gm %ub_out, %arg1, %c128_i64 - nburst(%c32_i64, %c128_i64, %c128_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64 - pto.barrier #pto.pipe - return - } -} diff --git a/test/vpto/cases/micro-op/reduction/vcgmin-tie/launch.cpp 
b/test/vpto/cases/micro-op/reduction/vcgmin-tie/launch.cpp deleted file mode 100644 index 35f95d660..000000000 --- a/test/vpto/cases/micro-op/reduction/vcgmin-tie/launch.cpp +++ /dev/null @@ -1,70 +0,0 @@ -// Copyright (c) 2026 Huawei Technologies Co., Ltd. -// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). -// Please refer to the License for details. You may not use this file except in compliance with the License. -// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -// See LICENSE in the root of the software repository for the full text of the License. - -// ----------------------------------------------------------------------------- -// case: micro-op/reduction/vcgmin-tie -// family: reduction -// target_ops: pto.vcgmin -// scenarios: group-reduction, result-placement -// NOTE: bulk-generated coverage skeleton. Parser/verifier/lowering failure is -// still a valid test conclusion in the current coverage-first phase. -// ----------------------------------------------------------------------------- -// --------------------------------------------------------------------------- -// PTOAS compatibility layer -// -// The upstream pto-isa headers reference some FP8/FP4 types and the -// __VEC_SCOPE__ marker that are not available on every AICore arch/toolchain -// combination (e.g. __NPU_ARCH__==2201). -// -// For our PTOAS-generated kernels we don't rely on these types today, but the -// headers still mention them in templates/static_asserts. Provide minimal -// fallbacks to keep compilation working on dav-c220. 
-// --------------------------------------------------------------------------- -#ifndef __VEC_SCOPE__ -#define __VEC_SCOPE__ -#endif - -#if defined(__CCE_AICORE__) && defined(__NPU_ARCH__) && (__NPU_ARCH__ == 2201) -typedef struct { unsigned char v; } hifloat8_t; -typedef struct { unsigned char v; } float8_e4m3_t; -typedef struct { unsigned char v; } float8_e5m2_t; -typedef struct { unsigned char v; } float8_e8m0_t; -typedef struct { unsigned char v; } float4_e1m2x2_t; -typedef struct { unsigned char v; } float4_e2m1x2_t; -#endif -#include - -// AICore printf support is gated behind `--cce-enable-print` on some -// toolchains. When enabled, include the CCE print header so `cce::printf` -// resolves in device compilation. -#if defined(__CCE_AICORE__) && defined(PTOAS_ENABLE_CCE_PRINT) -#include -#endif - -// Some PTO-ISA types are only available in the __CCE_AICORE__ compilation -// path, but `bisheng -xcce` still performs a host-side parse pass. -// Provide minimal fallbacks only when the corresponding header wasn't -// pulled in by the selected arch implementation. -#if !defined(__CCE_AICORE__) && !defined(TMRGSORT_HPP) -struct MrgSortExecutedNumList { - uint16_t mrgSortList0; - uint16_t mrgSortList1; - uint16_t mrgSortList2; - uint16_t mrgSortList3; -}; -#endif -#ifndef __CPU_SIM -#include "acl/acl.h" -#endif - -extern "C" __global__ [aicore] void vcgmin_tie_kernel_2d(__gm__ float *v1, - __gm__ float *v2); - -void LaunchVcgminTie_kernel_2d(float *v1, float *v2, void *stream) { - vcgmin_tie_kernel_2d<<<1, nullptr, stream>>>((__gm__ float *)v1, (__gm__ float *)v2); -} diff --git a/test/vpto/cases/micro-op/reduction/vcgmin-tie/main.cpp b/test/vpto/cases/micro-op/reduction/vcgmin-tie/main.cpp deleted file mode 100644 index 3a940457b..000000000 --- a/test/vpto/cases/micro-op/reduction/vcgmin-tie/main.cpp +++ /dev/null @@ -1,130 +0,0 @@ -// Copyright (c) 2026 Huawei Technologies Co., Ltd. 
-// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). -// Please refer to the License for details. You may not use this file except in compliance with the License. -// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -// See LICENSE in the root of the software repository for the full text of the License. - -// ----------------------------------------------------------------------------- -// case: micro-op/reduction/vcgmin-tie -// family: reduction -// target_ops: pto.vcgmin -// scenarios: group-reduction, result-placement -// NOTE: bulk-generated coverage skeleton. Parser/verifier/lowering failure is -// still a valid test conclusion in the current coverage-first phase. -// ----------------------------------------------------------------------------- -/** -Copyright (c) 2025 Huawei Technologies Co., Ltd. -This program is free software, you can redistribute it and/or modify it under the terms and conditions of -CANN Open Software License Agreement Version 2.0 (the "License"). -Please refer to the License for details. You may not use this file except in compliance with the License. -THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -See LICENSE in the root of the software repository for the full text of the License. 
-*/ - -#include "test_common.h" -#include "acl/acl.h" -#include -#include -#include - -using namespace PtoTestCommon; - -#ifndef TMRGSORT_HPP -struct MrgSortExecutedNumList { - uint16_t mrgSortList0; - uint16_t mrgSortList1; - uint16_t mrgSortList2; - uint16_t mrgSortList3; -}; -#endif - -#define ACL_CHECK(expr) \ - do { \ - const aclError _ret = (expr); \ - if (_ret != ACL_SUCCESS) { \ - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", #expr, (int)_ret, __FILE__, __LINE__); \ - const char *_recent = aclGetRecentErrMsg(); \ - if (_recent != nullptr && _recent[0] != '\0') { \ - std::fprintf(stderr, "[ERROR] RecentErrMsg: %s\n", _recent); \ - } \ - rc = 1; \ - goto cleanup; \ - } \ - } while (0) - -void LaunchVcgminTie_kernel_2d(float *v1, float *v2, void *stream); - -int main() { - size_t elemCount_v1 = 1024; - size_t fileSize_v1 = elemCount_v1 * sizeof(float); - size_t elemCount_v2 = 1024; - size_t fileSize_v2 = elemCount_v2 * sizeof(float); - float *v1Host = nullptr; - float *v1Device = nullptr; - float *v2Host = nullptr; - float *v2Device = nullptr; - - int rc = 0; - bool aclInited = false; - bool deviceSet = false; - int deviceId = 0; - aclrtStream stream = nullptr; - - ACL_CHECK(aclInit(nullptr)); - aclInited = true; - if (const char *envDevice = std::getenv("ACL_DEVICE_ID")) { - deviceId = std::atoi(envDevice); - } - ACL_CHECK(aclrtSetDevice(deviceId)); - deviceSet = true; - ACL_CHECK(aclrtCreateStream(&stream)); - - ACL_CHECK(aclrtMallocHost((void **)(&v1Host), fileSize_v1)); - ACL_CHECK(aclrtMallocHost((void **)(&v2Host), fileSize_v2)); - ACL_CHECK(aclrtMalloc((void **)&v1Device, fileSize_v1, ACL_MEM_MALLOC_HUGE_FIRST)); - ACL_CHECK(aclrtMalloc((void **)&v2Device, fileSize_v2, ACL_MEM_MALLOC_HUGE_FIRST)); - - ReadFile("./v1.bin", fileSize_v1, v1Host, fileSize_v1); - ReadFile("./v2.bin", fileSize_v2, v2Host, fileSize_v2); - ACL_CHECK(aclrtMemcpy(v1Device, fileSize_v1, v1Host, fileSize_v1, ACL_MEMCPY_HOST_TO_DEVICE)); - ACL_CHECK(aclrtMemcpy(v2Device, 
fileSize_v2, v2Host, fileSize_v2, ACL_MEMCPY_HOST_TO_DEVICE)); - LaunchVcgminTie_kernel_2d(v1Device, v2Device, stream); - - ACL_CHECK(aclrtSynchronizeStream(stream)); - ACL_CHECK(aclrtMemcpy(v2Host, fileSize_v2, v2Device, fileSize_v2, ACL_MEMCPY_DEVICE_TO_HOST)); - - WriteFile("./v2.bin", v2Host, fileSize_v2); - -cleanup: - aclrtFree(v1Device); - aclrtFree(v2Device); - aclrtFreeHost(v1Host); - aclrtFreeHost(v2Host); - if (stream != nullptr) { - const aclError _ret = aclrtDestroyStream(stream); - if (_ret != ACL_SUCCESS) { - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", - "aclrtDestroyStream(stream)", (int)_ret, __FILE__, __LINE__); - } - stream = nullptr; - } - if (deviceSet) { - const aclError _ret = aclrtResetDevice(deviceId); - if (_ret != ACL_SUCCESS) { - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", - "aclrtResetDevice(deviceId)", (int)_ret, __FILE__, __LINE__); - } - } - if (aclInited) { - const aclError _ret = aclFinalize(); - if (_ret != ACL_SUCCESS) { - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", - "aclFinalize()", (int)_ret, __FILE__, __LINE__); - } - } - - return rc; -} diff --git a/test/vpto/cases/micro-op/reduction/vcgmin/compare.py b/test/vpto/cases/micro-op/reduction/vcgmin/compare.py deleted file mode 100755 index 57ac3a528..000000000 --- a/test/vpto/cases/micro-op/reduction/vcgmin/compare.py +++ /dev/null @@ -1,209 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) 2026 Huawei Technologies Co., Ltd. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. 
-# See LICENSE in the root of the software repository for the full text of the License. - -# case: micro-op/reduction/vcgmin -# family: reduction -# target_ops: pto.vcgmin -# scenarios: group-reduction, result-placement -# NOTE: bulk-generated coverage skeleton. -# coding=utf-8 - -import os -import sys -import numpy as np - - -REPEAT_BYTES = 256 - - -def _ceil_div(x, y): - return (x + y - 1) // y - - -def _packed_pred_storage_bytes(logical_elems, src_elem_bytes): - logical_elems = int(logical_elems) - src_elem_bytes = int(src_elem_bytes) - if logical_elems <= 0: - raise ValueError(f"logical_elems must be > 0, got {logical_elems}") - if src_elem_bytes not in (1, 2, 4): - raise ValueError(f"unsupported packed predicate source size: {src_elem_bytes}") - - repeat_elems = REPEAT_BYTES // src_elem_bytes - if src_elem_bytes == 4: - repeat_times = _ceil_div(logical_elems, repeat_elems) + 1 - loop_count = repeat_times // 2 - return loop_count * 16 - - repeat_times = _ceil_div(logical_elems, repeat_elems) - return repeat_times * (repeat_elems // 8) - - -def compare_bin(golden_path, output_path, dtype, eps): - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - dtype_np = np.dtype(dtype) - golden = np.fromfile(golden_path, dtype=dtype_np) - output = np.fromfile(output_path, dtype=dtype_np) - if golden.shape != output.shape: - print(f"[ERROR] Shape mismatch: {golden_path} {golden.shape} vs {output_path} {output.shape}") - return False - if not np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True): - if golden.size: - if np.issubdtype(dtype_np, np.floating): - g = golden.astype(np.float64, copy=False) - o = output.astype(np.float64, copy=False) - elif np.issubdtype(dtype_np, np.integer) or np.issubdtype(dtype_np, np.unsignedinteger): - g = golden.astype(np.int64, copy=False) - o = output.astype(np.int64, copy=False) 
- else: - g = golden.astype(np.float64, copy=False) - o = output.astype(np.float64, copy=False) - abs_diff = np.abs(g - o) - idx = int(np.argmax(abs_diff)) - diff = float(abs_diff[idx]) - print( - f"[ERROR] Mismatch: {golden_path} vs {output_path}, max diff={diff} at idx={idx} " - f"(golden={g[idx]}, out={o[idx]}, dtype={dtype_np})" - ) - else: - print(f"[ERROR] Mismatch: {golden_path} vs {output_path}, empty buffers, dtype={dtype_np}") - return False - return True - - -def compare_bin_prefix(golden_path, output_path, dtype, eps, count): - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - try: - count = int(count) - except Exception: - print(f"[ERROR] Invalid prefix count: {count}") - return False - if count <= 0: - print(f"[ERROR] Invalid prefix count: {count}") - return False - - dtype_np = np.dtype(dtype) - golden = np.fromfile(golden_path, dtype=dtype_np, count=count) - output = np.fromfile(output_path, dtype=dtype_np, count=count) - - if golden.size != count or output.size != count: - print( - f"[ERROR] Prefix read too small: need={count} elems, " - f"golden={golden.size}, out={output.size}" - ) - return False - - if not np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True): - if golden.size: - if np.issubdtype(dtype_np, np.floating): - g = golden.astype(np.float64, copy=False) - o = output.astype(np.float64, copy=False) - elif np.issubdtype(dtype_np, np.integer) or np.issubdtype(dtype_np, np.unsignedinteger): - g = golden.astype(np.int64, copy=False) - o = output.astype(np.int64, copy=False) - else: - g = golden.astype(np.float64, copy=False) - o = output.astype(np.float64, copy=False) - abs_diff = np.abs(g - o) - idx = int(np.argmax(abs_diff)) - diff = float(abs_diff[idx]) - print( - f"[ERROR] Mismatch (prefix): {golden_path} vs {output_path}, max diff={diff} at idx={idx} " - f"(golden={g[idx]}, 
out={o[idx]}, dtype={dtype_np}, count={count})" - ) - else: - print(f"[ERROR] Mismatch (prefix): {golden_path} vs {output_path}, empty buffers, dtype={dtype_np}") - return False - return True - - -def compare_packed_pred_mask(golden_path, output_path, logical_elems, src_elem_bytes): - """ - Compare outputs of pto.tcmp / pto.tcmps. - - PTO-ISA stores packed predicate results as a linear PK byte stream via - `psts`, with the exact written prefix length determined by the typed - TCMP/TCMPS repeat schedule. Compare only that semantic prefix. - """ - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - try: - logical_elems = int(logical_elems) - src_elem_bytes = int(src_elem_bytes) - except Exception: - print( - "[ERROR] Invalid packed mask compare arguments: " - f"logical_elems={logical_elems} src_elem_bytes={src_elem_bytes}" - ) - return False - if logical_elems <= 0 or src_elem_bytes <= 0: - print( - "[ERROR] Invalid packed mask compare arguments: " - f"logical_elems={logical_elems} src_elem_bytes={src_elem_bytes}" - ) - return False - - golden = np.fromfile(golden_path, dtype=np.uint8) - output = np.fromfile(output_path, dtype=np.uint8) - try: - prefix_bytes = _packed_pred_storage_bytes(logical_elems, src_elem_bytes) - except ValueError as exc: - print(f"[ERROR] {exc}") - return False - - if golden.size < prefix_bytes or output.size < prefix_bytes: - print( - f"[ERROR] Packed mask buffer too small: need={prefix_bytes} bytes, " - f"golden={golden.size}, out={output.size}" - ) - return False - - golden_sel = golden[:prefix_bytes] - output_sel = output[:prefix_bytes] - - if not np.array_equal(golden_sel, output_sel): - diff = np.nonzero(golden_sel != output_sel)[0] - idx = int(diff[0]) if diff.size else 0 - print( - f"[ERROR] Mismatch (packed mask): {golden_path} vs {output_path}, first diff at idx={idx} " - 
f"(golden={int(golden_sel[idx])}, out={int(output_sel[idx])}, " - f"logical_elems={logical_elems}, src_elem_bytes={src_elem_bytes}, prefix_bytes={prefix_bytes})" - ) - return False - return True - - -def main(): - strict = os.getenv("COMPARE_STRICT", "1") != "0" - ok = True - ok = compare_bin("golden_v2.bin", "v2.bin", np.float32, 0.0001) and ok - if not ok: - if strict: - print("[ERROR] compare failed") - sys.exit(2) - print("[WARN] compare failed (non-gating)") - return - print("[INFO] compare passed") - - -if __name__ == "__main__": - main() diff --git a/test/vpto/cases/micro-op/reduction/vcgmin/golden.py b/test/vpto/cases/micro-op/reduction/vcgmin/golden.py deleted file mode 100755 index 5f2413af5..000000000 --- a/test/vpto/cases/micro-op/reduction/vcgmin/golden.py +++ /dev/null @@ -1,53 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) 2026 Huawei Technologies Co., Ltd. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. 
- -# coding=utf-8 - -import argparse -from pathlib import Path - -import numpy as np - - -ROWS = 32 -COLS = 32 -SEED = 19 -LANES = 64 -def generate(output_dir: Path, seed: int) -> None: - rng = np.random.default_rng(seed) - v1 = rng.uniform(-8.0, 8.0, size=(ROWS, COLS)).astype(np.float32) - - v2 = np.zeros((ROWS, COLS), dtype=np.float32) - golden_v2 = np.zeros((ROWS, COLS), dtype=np.float32) - flat_in = v1.reshape(-1) - flat_out = golden_v2.reshape(-1) - group_elems = 8 - for offset in range(0, flat_in.size, LANES): - chunk = flat_in[offset:offset + LANES] - for gi, group in enumerate(range(0, chunk.size, group_elems)): - # VCGMIN writes one reduced value per 32B block continuously to low lanes. - flat_out[offset + gi] = np.min(chunk[group:group + group_elems]) - - - output_dir.mkdir(parents=True, exist_ok=True) - v1.reshape(-1).tofile(output_dir / "v1.bin") - v2.reshape(-1).tofile(output_dir / "v2.bin") - golden_v2.reshape(-1).tofile(output_dir / "golden_v2.bin") - - -def main() -> None: - parser = argparse.ArgumentParser() - parser.add_argument("--output-dir", type=Path, default=Path(".")) - parser.add_argument("--seed", type=int, default=SEED) - args = parser.parse_args() - generate(args.output_dir, args.seed) - - -if __name__ == "__main__": - main() diff --git a/test/vpto/cases/micro-op/reduction/vcgmin/kernel.pto b/test/vpto/cases/micro-op/reduction/vcgmin/kernel.pto deleted file mode 100644 index 6d7c9d14c..000000000 --- a/test/vpto/cases/micro-op/reduction/vcgmin/kernel.pto +++ /dev/null @@ -1,51 +0,0 @@ -// ----------------------------------------------------------------------------- -// case: micro-op/reduction/vcgmin -// family: reduction -// target_ops: pto.vcgmin -// scenarios: group-reduction, result-placement -// NOTE: bulk-generated coverage skeleton. Parser/verifier/lowering failure is -// still a valid test conclusion in the current coverage-first phase. 
-// ----------------------------------------------------------------------------- -module attributes {pto.target_arch = "a5", pto.kernel_kind = #pto.kernel_kind} { - func.func @vcgmin_kernel_2d(%arg0: !pto.ptr, %arg1: !pto.ptr) attributes {pto.kernel} { - %c0 = arith.constant 0 : index - %c1 = arith.constant 1 : index - %c64 = arith.constant 64 : index - %c1024 = arith.constant 1024 : index - %c0_i64 = arith.constant 0 : i64 - %c1_i64 = arith.constant 1 : i64 - %c32_i64 = arith.constant 32 : i64 - %c128_i64 = arith.constant 128 : i64 - %c4096_i64 = arith.constant 4096 : i64 - %c1024_i32 = arith.constant 1024 : i32 - - %ub_in = pto.castptr %c0_i64 : i64 -> !pto.ptr - %ub_out = pto.castptr %c4096_i64 : i64 -> !pto.ptr - - %false = arith.constant false - pto.mte_gm_ub %arg0, %ub_in, %c0_i64, %c128_i64 - nburst(%c32_i64, %c128_i64, %c128_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 - - pto.set_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] - pto.wait_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] - - pto.vecscope { - %_:1 = scf.for %offset = %c0 to %c1024 step %c64 iter_args(%remaining = %c1024_i32) -> (i32) { - %mask, %next_remaining = pto.plt_b32 %remaining : i32 -> !pto.mask, i32 - %vec = pto.vlds %ub_in[%offset] : !pto.ptr -> !pto.vreg<64xf32> - %out = pto.vcgmin %vec, %mask : !pto.vreg<64xf32>, !pto.mask -> !pto.vreg<64xf32> - pto.vsts %out, %ub_out[%offset], %mask : !pto.vreg<64xf32>, !pto.ptr, !pto.mask - scf.yield %next_remaining : i32 - } - } - - pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.mte_ub_gm %ub_out, %arg1, %c128_i64 - nburst(%c32_i64, %c128_i64, %c128_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64 - pto.barrier #pto.pipe - return - } -} diff --git a/test/vpto/cases/micro-op/reduction/vcgmin/launch.cpp b/test/vpto/cases/micro-op/reduction/vcgmin/launch.cpp deleted file mode 100644 index b6787415c..000000000 --- a/test/vpto/cases/micro-op/reduction/vcgmin/launch.cpp +++ /dev/null @@ -1,70 +0,0 
@@ -// Copyright (c) 2026 Huawei Technologies Co., Ltd. -// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). -// Please refer to the License for details. You may not use this file except in compliance with the License. -// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -// See LICENSE in the root of the software repository for the full text of the License. - -// ----------------------------------------------------------------------------- -// case: micro-op/reduction/vcgmin -// family: reduction -// target_ops: pto.vcgmin -// scenarios: group-reduction, result-placement -// NOTE: bulk-generated coverage skeleton. Parser/verifier/lowering failure is -// still a valid test conclusion in the current coverage-first phase. -// ----------------------------------------------------------------------------- -// --------------------------------------------------------------------------- -// PTOAS compatibility layer -// -// The upstream pto-isa headers reference some FP8/FP4 types and the -// __VEC_SCOPE__ marker that are not available on every AICore arch/toolchain -// combination (e.g. __NPU_ARCH__==2201). -// -// For our PTOAS-generated kernels we don't rely on these types today, but the -// headers still mention them in templates/static_asserts. Provide minimal -// fallbacks to keep compilation working on dav-c220. 
-// --------------------------------------------------------------------------- -#ifndef __VEC_SCOPE__ -#define __VEC_SCOPE__ -#endif - -#if defined(__CCE_AICORE__) && defined(__NPU_ARCH__) && (__NPU_ARCH__ == 2201) -typedef struct { unsigned char v; } hifloat8_t; -typedef struct { unsigned char v; } float8_e4m3_t; -typedef struct { unsigned char v; } float8_e5m2_t; -typedef struct { unsigned char v; } float8_e8m0_t; -typedef struct { unsigned char v; } float4_e1m2x2_t; -typedef struct { unsigned char v; } float4_e2m1x2_t; -#endif -#include - -// AICore printf support is gated behind `--cce-enable-print` on some -// toolchains. When enabled, include the CCE print header so `cce::printf` -// resolves in device compilation. -#if defined(__CCE_AICORE__) && defined(PTOAS_ENABLE_CCE_PRINT) -#include -#endif - -// Some PTO-ISA types are only available in the __CCE_AICORE__ compilation -// path, but `bisheng -xcce` still performs a host-side parse pass. -// Provide minimal fallbacks only when the corresponding header wasn't -// pulled in by the selected arch implementation. -#if !defined(__CCE_AICORE__) && !defined(TMRGSORT_HPP) -struct MrgSortExecutedNumList { - uint16_t mrgSortList0; - uint16_t mrgSortList1; - uint16_t mrgSortList2; - uint16_t mrgSortList3; -}; -#endif -#ifndef __CPU_SIM -#include "acl/acl.h" -#endif - -extern "C" __global__ [aicore] void vcgmin_kernel_2d(__gm__ float *v1, - __gm__ float *v2); - -void LaunchVcgmin_kernel_2d(float *v1, float *v2, void *stream) { - vcgmin_kernel_2d<<<1, nullptr, stream>>>((__gm__ float *)v1, (__gm__ float *)v2); -} diff --git a/test/vpto/cases/micro-op/reduction/vcgmin/main.cpp b/test/vpto/cases/micro-op/reduction/vcgmin/main.cpp deleted file mode 100644 index 1c4fc7676..000000000 --- a/test/vpto/cases/micro-op/reduction/vcgmin/main.cpp +++ /dev/null @@ -1,130 +0,0 @@ -// Copyright (c) 2026 Huawei Technologies Co., Ltd. 
-// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). -// Please refer to the License for details. You may not use this file except in compliance with the License. -// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -// See LICENSE in the root of the software repository for the full text of the License. - -// ----------------------------------------------------------------------------- -// case: micro-op/reduction/vcgmin -// family: reduction -// target_ops: pto.vcgmin -// scenarios: group-reduction, result-placement -// NOTE: bulk-generated coverage skeleton. Parser/verifier/lowering failure is -// still a valid test conclusion in the current coverage-first phase. -// ----------------------------------------------------------------------------- -/** -Copyright (c) 2025 Huawei Technologies Co., Ltd. -This program is free software, you can redistribute it and/or modify it under the terms and conditions of -CANN Open Software License Agreement Version 2.0 (the "License"). -Please refer to the License for details. You may not use this file except in compliance with the License. -THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -See LICENSE in the root of the software repository for the full text of the License. 
-*/ - -#include "test_common.h" -#include "acl/acl.h" -#include -#include -#include - -using namespace PtoTestCommon; - -#ifndef TMRGSORT_HPP -struct MrgSortExecutedNumList { - uint16_t mrgSortList0; - uint16_t mrgSortList1; - uint16_t mrgSortList2; - uint16_t mrgSortList3; -}; -#endif - -#define ACL_CHECK(expr) \ - do { \ - const aclError _ret = (expr); \ - if (_ret != ACL_SUCCESS) { \ - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", #expr, (int)_ret, __FILE__, __LINE__); \ - const char *_recent = aclGetRecentErrMsg(); \ - if (_recent != nullptr && _recent[0] != '\0') { \ - std::fprintf(stderr, "[ERROR] RecentErrMsg: %s\n", _recent); \ - } \ - rc = 1; \ - goto cleanup; \ - } \ - } while (0) - -void LaunchVcgmin_kernel_2d(float *v1, float *v2, void *stream); - -int main() { - size_t elemCount_v1 = 1024; - size_t fileSize_v1 = elemCount_v1 * sizeof(float); - size_t elemCount_v2 = 1024; - size_t fileSize_v2 = elemCount_v2 * sizeof(float); - float *v1Host = nullptr; - float *v1Device = nullptr; - float *v2Host = nullptr; - float *v2Device = nullptr; - - int rc = 0; - bool aclInited = false; - bool deviceSet = false; - int deviceId = 0; - aclrtStream stream = nullptr; - - ACL_CHECK(aclInit(nullptr)); - aclInited = true; - if (const char *envDevice = std::getenv("ACL_DEVICE_ID")) { - deviceId = std::atoi(envDevice); - } - ACL_CHECK(aclrtSetDevice(deviceId)); - deviceSet = true; - ACL_CHECK(aclrtCreateStream(&stream)); - - ACL_CHECK(aclrtMallocHost((void **)(&v1Host), fileSize_v1)); - ACL_CHECK(aclrtMallocHost((void **)(&v2Host), fileSize_v2)); - ACL_CHECK(aclrtMalloc((void **)&v1Device, fileSize_v1, ACL_MEM_MALLOC_HUGE_FIRST)); - ACL_CHECK(aclrtMalloc((void **)&v2Device, fileSize_v2, ACL_MEM_MALLOC_HUGE_FIRST)); - - ReadFile("./v1.bin", fileSize_v1, v1Host, fileSize_v1); - ReadFile("./v2.bin", fileSize_v2, v2Host, fileSize_v2); - ACL_CHECK(aclrtMemcpy(v1Device, fileSize_v1, v1Host, fileSize_v1, ACL_MEMCPY_HOST_TO_DEVICE)); - ACL_CHECK(aclrtMemcpy(v2Device, 
fileSize_v2, v2Host, fileSize_v2, ACL_MEMCPY_HOST_TO_DEVICE)); - LaunchVcgmin_kernel_2d(v1Device, v2Device, stream); - - ACL_CHECK(aclrtSynchronizeStream(stream)); - ACL_CHECK(aclrtMemcpy(v2Host, fileSize_v2, v2Device, fileSize_v2, ACL_MEMCPY_DEVICE_TO_HOST)); - - WriteFile("./v2.bin", v2Host, fileSize_v2); - -cleanup: - aclrtFree(v1Device); - aclrtFree(v2Device); - aclrtFreeHost(v1Host); - aclrtFreeHost(v2Host); - if (stream != nullptr) { - const aclError _ret = aclrtDestroyStream(stream); - if (_ret != ACL_SUCCESS) { - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", - "aclrtDestroyStream(stream)", (int)_ret, __FILE__, __LINE__); - } - stream = nullptr; - } - if (deviceSet) { - const aclError _ret = aclrtResetDevice(deviceId); - if (_ret != ACL_SUCCESS) { - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", - "aclrtResetDevice(deviceId)", (int)_ret, __FILE__, __LINE__); - } - } - if (aclInited) { - const aclError _ret = aclFinalize(); - if (_ret != ACL_SUCCESS) { - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", - "aclFinalize()", (int)_ret, __FILE__, __LINE__); - } - } - - return rc; -} diff --git a/test/vpto/cases/micro-op/reduction/vcmax/kernel.pto b/test/vpto/cases/micro-op/reduction/vcmax/kernel.pto index 93aced72d..bead02a63 100644 --- a/test/vpto/cases/micro-op/reduction/vcmax/kernel.pto +++ b/test/vpto/cases/micro-op/reduction/vcmax/kernel.pto @@ -38,6 +38,50 @@ module attributes {pto.target_arch = "a5", pto.kernel_kind = #pto.kernel_kind, !pto.ptr, i64, i64, i64, i64 pto.barrier #pto.pipe + %__case_merge_guard = arith.constant false + + // inactive cross-case merge from micro-op/reduction/vcmin + scf.if %__case_merge_guard { + + %c0_cmg15_1 = arith.constant 0 : index + %c1_cmg15_1 = arith.constant 1 : index + %c64_cmg15_1 = arith.constant 64 : index + %c1024_cmg15_1 = arith.constant 1024 : index + %c0_i64_cmg15_1 = arith.constant 0 : i64 + %c1_i64_cmg15_1 = arith.constant 1 : i64 + %c32_i64_cmg15_1 = arith.constant 32 : i64 + 
%c128_i64_cmg15_1 = arith.constant 128 : i64 + %c4096_i64_cmg15_1 = arith.constant 4096 : i64 + %c1024_i32_cmg15_1 = arith.constant 1024 : i32 + + %ub_in_cmg15_1 = pto.castptr %c0_i64_cmg15_1 : i64 -> !pto.ptr + %ub_out_cmg15_1 = pto.castptr %c4096_i64_cmg15_1 : i64 -> !pto.ptr + + %false_cmg15_1 = arith.constant false + pto.mte_gm_ub %arg0, %ub_in_cmg15_1, %c0_i64_cmg15_1, %c128_i64_cmg15_1 + nburst(%c32_i64_cmg15_1, %c128_i64_cmg15_1, %c128_i64_cmg15_1) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 + + pto.set_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] + pto.wait_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] + + pto.vecscope { + %__cmg15_1:1 = scf.for %offset_cmg15_1 = %c0_cmg15_1 to %c1024_cmg15_1 step %c64_cmg15_1 iter_args(%remaining_cmg15_1 = %c1024_i32_cmg15_1) -> (i32) { + %mask_cmg15_1, %next_remaining_cmg15_1 = pto.plt_b32 %remaining_cmg15_1 : i32 -> !pto.mask, i32 + %vec_cmg15_1 = pto.vlds %ub_in_cmg15_1[%offset_cmg15_1] : !pto.ptr -> !pto.vreg<64xf32> + %out_cmg15_1 = pto.vcmin %vec_cmg15_1, %mask_cmg15_1 : !pto.vreg<64xf32>, !pto.mask -> !pto.vreg<64xf32> + pto.vsts %out_cmg15_1, %ub_out_cmg15_1[%offset_cmg15_1], %mask_cmg15_1 : !pto.vreg<64xf32>, !pto.ptr, !pto.mask + scf.yield %next_remaining_cmg15_1 : i32 + } + } + + pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] + pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] + pto.mte_ub_gm %ub_out_cmg15_1, %arg1, %c128_i64_cmg15_1 + nburst(%c32_i64_cmg15_1, %c128_i64_cmg15_1, %c128_i64_cmg15_1) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64 + pto.barrier #pto.pipe + } return } } diff --git a/test/vpto/cases/micro-op/reduction/vcmin/compare.py b/test/vpto/cases/micro-op/reduction/vcmin/compare.py deleted file mode 100644 index 962985a24..000000000 --- a/test/vpto/cases/micro-op/reduction/vcmin/compare.py +++ /dev/null @@ -1,204 +0,0 @@ -#!/usr/bin/python3 -# Copyright (c) 2026 Huawei Technologies Co., Ltd. 
-# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. - -# coding=utf-8 - -import os -import sys -import numpy as np - - -REPEAT_BYTES = 256 - - -def _ceil_div(x, y): - return (x + y - 1) // y - - -def _packed_pred_storage_bytes(logical_elems, src_elem_bytes): - logical_elems = int(logical_elems) - src_elem_bytes = int(src_elem_bytes) - if logical_elems <= 0: - raise ValueError(f"logical_elems must be > 0, got {logical_elems}") - if src_elem_bytes not in (1, 2, 4): - raise ValueError(f"unsupported packed predicate source size: {src_elem_bytes}") - - repeat_elems = REPEAT_BYTES // src_elem_bytes - if src_elem_bytes == 4: - repeat_times = _ceil_div(logical_elems, repeat_elems) + 1 - loop_count = repeat_times // 2 - return loop_count * 16 - - repeat_times = _ceil_div(logical_elems, repeat_elems) - return repeat_times * (repeat_elems // 8) - - -def compare_bin(golden_path, output_path, dtype, eps): - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - dtype_np = np.dtype(dtype) - golden = np.fromfile(golden_path, dtype=dtype_np) - output = np.fromfile(output_path, dtype=dtype_np) - if golden.shape != output.shape: - print(f"[ERROR] Shape mismatch: {golden_path} {golden.shape} vs {output_path} {output.shape}") - return False - if not np.allclose(golden, output, atol=eps, rtol=eps, 
equal_nan=True): - if golden.size: - if np.issubdtype(dtype_np, np.floating): - g = golden.astype(np.float64, copy=False) - o = output.astype(np.float64, copy=False) - elif np.issubdtype(dtype_np, np.integer) or np.issubdtype(dtype_np, np.unsignedinteger): - g = golden.astype(np.int64, copy=False) - o = output.astype(np.int64, copy=False) - else: - g = golden.astype(np.float64, copy=False) - o = output.astype(np.float64, copy=False) - abs_diff = np.abs(g - o) - idx = int(np.argmax(abs_diff)) - diff = float(abs_diff[idx]) - print( - f"[ERROR] Mismatch: {golden_path} vs {output_path}, max diff={diff} at idx={idx} " - f"(golden={g[idx]}, out={o[idx]}, dtype={dtype_np})" - ) - else: - print(f"[ERROR] Mismatch: {golden_path} vs {output_path}, empty buffers, dtype={dtype_np}") - return False - return True - - -def compare_bin_prefix(golden_path, output_path, dtype, eps, count): - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - try: - count = int(count) - except Exception: - print(f"[ERROR] Invalid prefix count: {count}") - return False - if count <= 0: - print(f"[ERROR] Invalid prefix count: {count}") - return False - - dtype_np = np.dtype(dtype) - golden = np.fromfile(golden_path, dtype=dtype_np, count=count) - output = np.fromfile(output_path, dtype=dtype_np, count=count) - - if golden.size != count or output.size != count: - print( - f"[ERROR] Prefix read too small: need={count} elems, " - f"golden={golden.size}, out={output.size}" - ) - return False - - if not np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True): - if golden.size: - if np.issubdtype(dtype_np, np.floating): - g = golden.astype(np.float64, copy=False) - o = output.astype(np.float64, copy=False) - elif np.issubdtype(dtype_np, np.integer) or np.issubdtype(dtype_np, np.unsignedinteger): - g = golden.astype(np.int64, copy=False) - o = 
output.astype(np.int64, copy=False) - else: - g = golden.astype(np.float64, copy=False) - o = output.astype(np.float64, copy=False) - abs_diff = np.abs(g - o) - idx = int(np.argmax(abs_diff)) - diff = float(abs_diff[idx]) - print( - f"[ERROR] Mismatch (prefix): {golden_path} vs {output_path}, max diff={diff} at idx={idx} " - f"(golden={g[idx]}, out={o[idx]}, dtype={dtype_np}, count={count})" - ) - else: - print(f"[ERROR] Mismatch (prefix): {golden_path} vs {output_path}, empty buffers, dtype={dtype_np}") - return False - return True - - -def compare_packed_pred_mask(golden_path, output_path, logical_elems, src_elem_bytes): - """ - Compare outputs of pto.tcmp / pto.tcmps. - - PTO-ISA stores packed predicate results as a linear PK byte stream via - `psts`, with the exact written prefix length determined by the typed - TCMP/TCMPS repeat schedule. Compare only that semantic prefix. - """ - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - try: - logical_elems = int(logical_elems) - src_elem_bytes = int(src_elem_bytes) - except Exception: - print( - "[ERROR] Invalid packed mask compare arguments: " - f"logical_elems={logical_elems} src_elem_bytes={src_elem_bytes}" - ) - return False - if logical_elems <= 0 or src_elem_bytes <= 0: - print( - "[ERROR] Invalid packed mask compare arguments: " - f"logical_elems={logical_elems} src_elem_bytes={src_elem_bytes}" - ) - return False - - golden = np.fromfile(golden_path, dtype=np.uint8) - output = np.fromfile(output_path, dtype=np.uint8) - try: - prefix_bytes = _packed_pred_storage_bytes(logical_elems, src_elem_bytes) - except ValueError as exc: - print(f"[ERROR] {exc}") - return False - - if golden.size < prefix_bytes or output.size < prefix_bytes: - print( - f"[ERROR] Packed mask buffer too small: need={prefix_bytes} bytes, " - f"golden={golden.size}, out={output.size}" - ) 
- return False - - golden_sel = golden[:prefix_bytes] - output_sel = output[:prefix_bytes] - - if not np.array_equal(golden_sel, output_sel): - diff = np.nonzero(golden_sel != output_sel)[0] - idx = int(diff[0]) if diff.size else 0 - print( - f"[ERROR] Mismatch (packed mask): {golden_path} vs {output_path}, first diff at idx={idx} " - f"(golden={int(golden_sel[idx])}, out={int(output_sel[idx])}, " - f"logical_elems={logical_elems}, src_elem_bytes={src_elem_bytes}, prefix_bytes={prefix_bytes})" - ) - return False - return True - - -def main(): - strict = os.getenv("COMPARE_STRICT", "1") != "0" - ok = True - ok = compare_bin("golden_v2.bin", "v2.bin", np.float32, 0.0001) and ok - if not ok: - if strict: - print("[ERROR] compare failed") - sys.exit(2) - print("[WARN] compare failed (non-gating)") - return - print("[INFO] compare passed") - - -if __name__ == "__main__": - main() diff --git a/test/vpto/cases/micro-op/reduction/vcmin/golden.py b/test/vpto/cases/micro-op/reduction/vcmin/golden.py deleted file mode 100644 index bbbfe8d57..000000000 --- a/test/vpto/cases/micro-op/reduction/vcmin/golden.py +++ /dev/null @@ -1,52 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) 2026 Huawei Technologies Co., Ltd. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. 
- -# coding=utf-8 - -import argparse -from pathlib import Path - -import numpy as np - - -ROWS = 32 -COLS = 32 -SEED = 19 -LANES = 64 -def generate(output_dir: Path, seed: int) -> None: - rng = np.random.default_rng(seed) - v1 = rng.uniform(-8.0, 8.0, size=(ROWS, COLS)).astype(np.float32) - - v2 = np.zeros((ROWS, COLS), dtype=np.float32) - golden_v2 = np.zeros((ROWS, COLS), dtype=np.float32) - flat_in = v1.reshape(-1) - flat_out = golden_v2.reshape(-1) - flat_out_u32 = flat_out.view(np.uint32) - for offset in range(0, flat_in.size, LANES): - chunk = flat_in[offset:offset + LANES] - idx = int(np.argmin(chunk)) - flat_out[offset] = chunk[idx] - flat_out_u32[offset + 1] = np.uint32(idx) - - output_dir.mkdir(parents=True, exist_ok=True) - v1.reshape(-1).tofile(output_dir / "v1.bin") - v2.reshape(-1).tofile(output_dir / "v2.bin") - golden_v2.reshape(-1).tofile(output_dir / "golden_v2.bin") - - -def main() -> None: - parser = argparse.ArgumentParser() - parser.add_argument("--output-dir", type=Path, default=Path(".")) - parser.add_argument("--seed", type=int, default=SEED) - args = parser.parse_args() - generate(args.output_dir, args.seed) - - -if __name__ == "__main__": - main() diff --git a/test/vpto/cases/micro-op/reduction/vcmin/kernel.pto b/test/vpto/cases/micro-op/reduction/vcmin/kernel.pto deleted file mode 100644 index 9d0e34332..000000000 --- a/test/vpto/cases/micro-op/reduction/vcmin/kernel.pto +++ /dev/null @@ -1,43 +0,0 @@ -module attributes {pto.target_arch = "a5", pto.kernel_kind = #pto.kernel_kind} { - func.func @vabs_kernel_2d(%arg0: !pto.ptr, %arg1: !pto.ptr) attributes {pto.kernel} { - %c0 = arith.constant 0 : index - %c1 = arith.constant 1 : index - %c64 = arith.constant 64 : index - %c1024 = arith.constant 1024 : index - %c0_i64 = arith.constant 0 : i64 - %c1_i64 = arith.constant 1 : i64 - %c32_i64 = arith.constant 32 : i64 - %c128_i64 = arith.constant 128 : i64 - %c4096_i64 = arith.constant 4096 : i64 - %c1024_i32 = arith.constant 1024 : i32 - - 
%ub_in = pto.castptr %c0_i64 : i64 -> !pto.ptr - %ub_out = pto.castptr %c4096_i64 : i64 -> !pto.ptr - - %false = arith.constant false - pto.mte_gm_ub %arg0, %ub_in, %c0_i64, %c128_i64 - nburst(%c32_i64, %c128_i64, %c128_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 - - pto.set_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] - pto.wait_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] - - pto.vecscope { - %_:1 = scf.for %offset = %c0 to %c1024 step %c64 iter_args(%remaining = %c1024_i32) -> (i32) { - %mask, %next_remaining = pto.plt_b32 %remaining : i32 -> !pto.mask, i32 - %vec = pto.vlds %ub_in[%offset] : !pto.ptr -> !pto.vreg<64xf32> - %out = pto.vcmin %vec, %mask : !pto.vreg<64xf32>, !pto.mask -> !pto.vreg<64xf32> - pto.vsts %out, %ub_out[%offset], %mask : !pto.vreg<64xf32>, !pto.ptr, !pto.mask - scf.yield %next_remaining : i32 - } - } - - pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.mte_ub_gm %ub_out, %arg1, %c128_i64 - nburst(%c32_i64, %c128_i64, %c128_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64 - pto.barrier #pto.pipe - return - } -} diff --git a/test/vpto/cases/micro-op/reduction/vcmin/launch.cpp b/test/vpto/cases/micro-op/reduction/vcmin/launch.cpp deleted file mode 100644 index 9002bcd67..000000000 --- a/test/vpto/cases/micro-op/reduction/vcmin/launch.cpp +++ /dev/null @@ -1,62 +0,0 @@ -// Copyright (c) 2026 Huawei Technologies Co., Ltd. -// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). -// Please refer to the License for details. You may not use this file except in compliance with the License. -// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. 
-// See LICENSE in the root of the software repository for the full text of the License. - -// --------------------------------------------------------------------------- -// PTOAS compatibility layer -// -// The upstream pto-isa headers reference some FP8/FP4 types and the -// __VEC_SCOPE__ marker that are not available on every AICore arch/toolchain -// combination (e.g. __NPU_ARCH__==2201). -// -// For our PTOAS-generated kernels we don't rely on these types today, but the -// headers still mention them in templates/static_asserts. Provide minimal -// fallbacks to keep compilation working on dav-c220. -// --------------------------------------------------------------------------- -#ifndef __VEC_SCOPE__ -#define __VEC_SCOPE__ -#endif - -#if defined(__CCE_AICORE__) && defined(__NPU_ARCH__) && (__NPU_ARCH__ == 2201) -typedef struct { unsigned char v; } hifloat8_t; -typedef struct { unsigned char v; } float8_e4m3_t; -typedef struct { unsigned char v; } float8_e5m2_t; -typedef struct { unsigned char v; } float8_e8m0_t; -typedef struct { unsigned char v; } float4_e1m2x2_t; -typedef struct { unsigned char v; } float4_e2m1x2_t; -#endif -#include - -// AICore printf support is gated behind `--cce-enable-print` on some -// toolchains. When enabled, include the CCE print header so `cce::printf` -// resolves in device compilation. -#if defined(__CCE_AICORE__) && defined(PTOAS_ENABLE_CCE_PRINT) -#include -#endif - -// Some PTO-ISA types are only available in the __CCE_AICORE__ compilation -// path, but `bisheng -xcce` still performs a host-side parse pass. -// Provide minimal fallbacks only when the corresponding header wasn't -// pulled in by the selected arch implementation. 
-#if !defined(__CCE_AICORE__) && !defined(TMRGSORT_HPP) -struct MrgSortExecutedNumList { - uint16_t mrgSortList0; - uint16_t mrgSortList1; - uint16_t mrgSortList2; - uint16_t mrgSortList3; -}; -#endif -#ifndef __CPU_SIM -#include "acl/acl.h" -#endif - -extern "C" __global__ [aicore] void vabs_kernel_2d(__gm__ float *v1, - __gm__ float *v2); - -void LaunchVabs_kernel_2d(float *v1, float *v2, void *stream) { - vabs_kernel_2d<<<1, nullptr, stream>>>((__gm__ float *)v1, (__gm__ float *)v2); -} diff --git a/test/vpto/cases/micro-op/reduction/vcmin/main.cpp b/test/vpto/cases/micro-op/reduction/vcmin/main.cpp deleted file mode 100644 index 29454461f..000000000 --- a/test/vpto/cases/micro-op/reduction/vcmin/main.cpp +++ /dev/null @@ -1,122 +0,0 @@ -// Copyright (c) 2026 Huawei Technologies Co., Ltd. -// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). -// Please refer to the License for details. You may not use this file except in compliance with the License. -// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -// See LICENSE in the root of the software repository for the full text of the License. - -/** -Copyright (c) 2025 Huawei Technologies Co., Ltd. -This program is free software, you can redistribute it and/or modify it under the terms and conditions of -CANN Open Software License Agreement Version 2.0 (the "License"). -Please refer to the License for details. You may not use this file except in compliance with the License. -THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. 
-See LICENSE in the root of the software repository for the full text of the License. -*/ - -#include "test_common.h" -#include "acl/acl.h" -#include -#include -#include - -using namespace PtoTestCommon; - -#ifndef TMRGSORT_HPP -struct MrgSortExecutedNumList { - uint16_t mrgSortList0; - uint16_t mrgSortList1; - uint16_t mrgSortList2; - uint16_t mrgSortList3; -}; -#endif - -#define ACL_CHECK(expr) \ - do { \ - const aclError _ret = (expr); \ - if (_ret != ACL_SUCCESS) { \ - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", #expr, (int)_ret, __FILE__, __LINE__); \ - const char *_recent = aclGetRecentErrMsg(); \ - if (_recent != nullptr && _recent[0] != '\0') { \ - std::fprintf(stderr, "[ERROR] RecentErrMsg: %s\n", _recent); \ - } \ - rc = 1; \ - goto cleanup; \ - } \ - } while (0) - -void LaunchVabs_kernel_2d(float *v1, float *v2, void *stream); - -int main() { - size_t elemCount_v1 = 1024; - size_t fileSize_v1 = elemCount_v1 * sizeof(float); - size_t elemCount_v2 = 1024; - size_t fileSize_v2 = elemCount_v2 * sizeof(float); - float *v1Host = nullptr; - float *v1Device = nullptr; - float *v2Host = nullptr; - float *v2Device = nullptr; - - int rc = 0; - bool aclInited = false; - bool deviceSet = false; - int deviceId = 0; - aclrtStream stream = nullptr; - - ACL_CHECK(aclInit(nullptr)); - aclInited = true; - if (const char *envDevice = std::getenv("ACL_DEVICE_ID")) { - deviceId = std::atoi(envDevice); - } - ACL_CHECK(aclrtSetDevice(deviceId)); - deviceSet = true; - ACL_CHECK(aclrtCreateStream(&stream)); - - ACL_CHECK(aclrtMallocHost((void **)(&v1Host), fileSize_v1)); - ACL_CHECK(aclrtMallocHost((void **)(&v2Host), fileSize_v2)); - ACL_CHECK(aclrtMalloc((void **)&v1Device, fileSize_v1, ACL_MEM_MALLOC_HUGE_FIRST)); - ACL_CHECK(aclrtMalloc((void **)&v2Device, fileSize_v2, ACL_MEM_MALLOC_HUGE_FIRST)); - - ReadFile("./v1.bin", fileSize_v1, v1Host, fileSize_v1); - ReadFile("./v2.bin", fileSize_v2, v2Host, fileSize_v2); - ACL_CHECK(aclrtMemcpy(v1Device, fileSize_v1, 
v1Host, fileSize_v1, ACL_MEMCPY_HOST_TO_DEVICE)); - ACL_CHECK(aclrtMemcpy(v2Device, fileSize_v2, v2Host, fileSize_v2, ACL_MEMCPY_HOST_TO_DEVICE)); - LaunchVabs_kernel_2d(v1Device, v2Device, stream); - - ACL_CHECK(aclrtSynchronizeStream(stream)); - ACL_CHECK(aclrtMemcpy(v2Host, fileSize_v2, v2Device, fileSize_v2, ACL_MEMCPY_DEVICE_TO_HOST)); - - WriteFile("./v2.bin", v2Host, fileSize_v2); - -cleanup: - aclrtFree(v1Device); - aclrtFree(v2Device); - aclrtFreeHost(v1Host); - aclrtFreeHost(v2Host); - if (stream != nullptr) { - const aclError _ret = aclrtDestroyStream(stream); - if (_ret != ACL_SUCCESS) { - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", - "aclrtDestroyStream(stream)", (int)_ret, __FILE__, __LINE__); - } - stream = nullptr; - } - if (deviceSet) { - const aclError _ret = aclrtResetDevice(deviceId); - if (_ret != ACL_SUCCESS) { - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", - "aclrtResetDevice(deviceId)", (int)_ret, __FILE__, __LINE__); - } - } - if (aclInited) { - const aclError _ret = aclFinalize(); - if (_ret != ACL_SUCCESS) { - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", - "aclFinalize()", (int)_ret, __FILE__, __LINE__); - } - } - - return rc; -} diff --git a/test/vpto/cases/micro-op/reduction/vcpadd-tail/compare.py b/test/vpto/cases/micro-op/reduction/vcpadd-tail/compare.py deleted file mode 100755 index 59d97b87c..000000000 --- a/test/vpto/cases/micro-op/reduction/vcpadd-tail/compare.py +++ /dev/null @@ -1,211 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) 2026 Huawei Technologies Co., Ltd. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. 
-# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. - -# case: micro-op/reduction/vcpadd-tail -# family: reduction -# target_ops: pto.vcpadd -# scenarios: prefix-op, tail-mask -# NOTE: bulk-generated coverage skeleton. -# coding=utf-8 - -import os -import sys -import numpy as np - - -REPEAT_BYTES = 256 -LOGICAL_ELEMS = 1000 - - -def _ceil_div(x, y): - return (x + y - 1) // y - - -def _packed_pred_storage_bytes(logical_elems, src_elem_bytes): - logical_elems = int(logical_elems) - src_elem_bytes = int(src_elem_bytes) - if logical_elems <= 0: - raise ValueError(f"logical_elems must be > 0, got {logical_elems}") - if src_elem_bytes not in (1, 2, 4): - raise ValueError(f"unsupported packed predicate source size: {src_elem_bytes}") - - repeat_elems = REPEAT_BYTES // src_elem_bytes - if src_elem_bytes == 4: - repeat_times = _ceil_div(logical_elems, repeat_elems) + 1 - loop_count = repeat_times // 2 - return loop_count * 16 - - repeat_times = _ceil_div(logical_elems, repeat_elems) - return repeat_times * (repeat_elems // 8) - - -def compare_bin(golden_path, output_path, dtype, eps): - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - dtype_np = np.dtype(dtype) - golden = np.fromfile(golden_path, dtype=dtype_np) - output = np.fromfile(output_path, dtype=dtype_np) - if golden.shape != output.shape: - print(f"[ERROR] Shape mismatch: {golden_path} {golden.shape} vs {output_path} {output.shape}") - return False - if not np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True): - if golden.size: - if np.issubdtype(dtype_np, np.floating): - g = golden.astype(np.float64, 
copy=False) - o = output.astype(np.float64, copy=False) - elif np.issubdtype(dtype_np, np.integer) or np.issubdtype(dtype_np, np.unsignedinteger): - g = golden.astype(np.int64, copy=False) - o = output.astype(np.int64, copy=False) - else: - g = golden.astype(np.float64, copy=False) - o = output.astype(np.float64, copy=False) - abs_diff = np.abs(g - o) - idx = int(np.argmax(abs_diff)) - diff = float(abs_diff[idx]) - print( - f"[ERROR] Mismatch: {golden_path} vs {output_path}, max diff={diff} at idx={idx} " - f"(golden={g[idx]}, out={o[idx]}, dtype={dtype_np})" - ) - else: - print(f"[ERROR] Mismatch: {golden_path} vs {output_path}, empty buffers, dtype={dtype_np}") - return False - return True - - -def compare_bin_prefix(golden_path, output_path, dtype, eps, count): - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - try: - count = int(count) - except Exception: - print(f"[ERROR] Invalid prefix count: {count}") - return False - if count <= 0: - print(f"[ERROR] Invalid prefix count: {count}") - return False - - dtype_np = np.dtype(dtype) - golden = np.fromfile(golden_path, dtype=dtype_np, count=count) - output = np.fromfile(output_path, dtype=dtype_np, count=count) - - if golden.size != count or output.size != count: - print( - f"[ERROR] Prefix read too small: need={count} elems, " - f"golden={golden.size}, out={output.size}" - ) - return False - - if not np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True): - if golden.size: - if np.issubdtype(dtype_np, np.floating): - g = golden.astype(np.float64, copy=False) - o = output.astype(np.float64, copy=False) - elif np.issubdtype(dtype_np, np.integer) or np.issubdtype(dtype_np, np.unsignedinteger): - g = golden.astype(np.int64, copy=False) - o = output.astype(np.int64, copy=False) - else: - g = golden.astype(np.float64, copy=False) - o = 
output.astype(np.float64, copy=False) - abs_diff = np.abs(g - o) - idx = int(np.argmax(abs_diff)) - diff = float(abs_diff[idx]) - print( - f"[ERROR] Mismatch (prefix): {golden_path} vs {output_path}, max diff={diff} at idx={idx} " - f"(golden={g[idx]}, out={o[idx]}, dtype={dtype_np}, count={count})" - ) - else: - print(f"[ERROR] Mismatch (prefix): {golden_path} vs {output_path}, empty buffers, dtype={dtype_np}") - return False - return True - - -def compare_packed_pred_mask(golden_path, output_path, logical_elems, src_elem_bytes): - """ - Compare outputs of pto.tcmp / pto.tcmps. - - PTO-ISA stores packed predicate results as a linear PK byte stream via - `psts`, with the exact written prefix length determined by the typed - TCMP/TCMPS repeat schedule. Compare only that semantic prefix. - """ - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - try: - logical_elems = int(logical_elems) - src_elem_bytes = int(src_elem_bytes) - except Exception: - print( - "[ERROR] Invalid packed mask compare arguments: " - f"logical_elems={logical_elems} src_elem_bytes={src_elem_bytes}" - ) - return False - if logical_elems <= 0 or src_elem_bytes <= 0: - print( - "[ERROR] Invalid packed mask compare arguments: " - f"logical_elems={logical_elems} src_elem_bytes={src_elem_bytes}" - ) - return False - - golden = np.fromfile(golden_path, dtype=np.uint8) - output = np.fromfile(output_path, dtype=np.uint8) - try: - prefix_bytes = _packed_pred_storage_bytes(logical_elems, src_elem_bytes) - except ValueError as exc: - print(f"[ERROR] {exc}") - return False - - if golden.size < prefix_bytes or output.size < prefix_bytes: - print( - f"[ERROR] Packed mask buffer too small: need={prefix_bytes} bytes, " - f"golden={golden.size}, out={output.size}" - ) - return False - - golden_sel = golden[:prefix_bytes] - output_sel = output[:prefix_bytes] - - 
if not np.array_equal(golden_sel, output_sel): - diff = np.nonzero(golden_sel != output_sel)[0] - idx = int(diff[0]) if diff.size else 0 - print( - f"[ERROR] Mismatch (packed mask): {golden_path} vs {output_path}, first diff at idx={idx} " - f"(golden={int(golden_sel[idx])}, out={int(output_sel[idx])}, " - f"logical_elems={logical_elems}, src_elem_bytes={src_elem_bytes}, prefix_bytes={prefix_bytes})" - ) - return False - return True - - -def main(): - strict = os.getenv("COMPARE_STRICT", "1") != "0" - ok = True - ok = compare_bin_prefix("golden_v2.bin", "v2.bin", np.float32, 0.0001, - LOGICAL_ELEMS) and ok - if not ok: - if strict: - print("[ERROR] compare failed") - sys.exit(2) - print("[WARN] compare failed (non-gating)") - return - print("[INFO] compare passed") - - -if __name__ == "__main__": - main() diff --git a/test/vpto/cases/micro-op/reduction/vcpadd-tail/golden.py b/test/vpto/cases/micro-op/reduction/vcpadd-tail/golden.py deleted file mode 100755 index 08dc83922..000000000 --- a/test/vpto/cases/micro-op/reduction/vcpadd-tail/golden.py +++ /dev/null @@ -1,56 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) 2026 Huawei Technologies Co., Ltd. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. 
- -# coding=utf-8 - -import argparse -from pathlib import Path - -import numpy as np - - -ROWS = 32 -COLS = 32 -SEED = 19 -LANES = 64 -LOGICAL_ELEMS = 1000 -def generate(output_dir: Path, seed: int) -> None: - rng = np.random.default_rng(seed) - v1 = rng.uniform(-8.0, 8.0, size=(ROWS, COLS)).astype(np.float32) - - v2 = np.zeros((ROWS, COLS), dtype=np.float32) - golden_v2 = np.zeros((ROWS, COLS), dtype=np.float32) - flat_in = v1.reshape(-1) - flat_out = golden_v2.reshape(-1) - for offset in range(0, LOGICAL_ELEMS, LANES): - chunk = flat_in[offset:min(offset + LANES, LOGICAL_ELEMS)] - pair_count = (chunk.size + 1) // 2 - for i in range(pair_count): - a = chunk[2 * i] - b = chunk[2 * i + 1] if (2 * i + 1) < chunk.size else np.float32(0.0) - # VCPADD writes pair-reduction results to low half lanes. - flat_out[offset + i] = np.float32(a + b) - - - output_dir.mkdir(parents=True, exist_ok=True) - v1.reshape(-1).tofile(output_dir / "v1.bin") - v2.reshape(-1).tofile(output_dir / "v2.bin") - golden_v2.reshape(-1).tofile(output_dir / "golden_v2.bin") - - -def main() -> None: - parser = argparse.ArgumentParser() - parser.add_argument("--output-dir", type=Path, default=Path(".")) - parser.add_argument("--seed", type=int, default=SEED) - args = parser.parse_args() - generate(args.output_dir, args.seed) - - -if __name__ == "__main__": - main() diff --git a/test/vpto/cases/micro-op/reduction/vcpadd-tail/kernel.pto b/test/vpto/cases/micro-op/reduction/vcpadd-tail/kernel.pto deleted file mode 100644 index 84f25e634..000000000 --- a/test/vpto/cases/micro-op/reduction/vcpadd-tail/kernel.pto +++ /dev/null @@ -1,51 +0,0 @@ -// ----------------------------------------------------------------------------- -// case: micro-op/reduction/vcpadd-tail -// family: reduction -// target_ops: pto.vcpadd -// scenarios: prefix-op, tail-mask -// NOTE: bulk-generated coverage skeleton. Parser/verifier/lowering failure is -// still a valid test conclusion in the current coverage-first phase. 
-// ----------------------------------------------------------------------------- -module attributes {pto.target_arch = "a5", pto.kernel_kind = #pto.kernel_kind} { - func.func @vcpadd_tail_kernel_2d(%arg0: !pto.ptr, %arg1: !pto.ptr) attributes {pto.kernel} { - %c0 = arith.constant 0 : index - %c1 = arith.constant 1 : index - %c64 = arith.constant 64 : index - %c1024 = arith.constant 1024 : index - %c0_i64 = arith.constant 0 : i64 - %c1_i64 = arith.constant 1 : i64 - %c32_i64 = arith.constant 32 : i64 - %c128_i64 = arith.constant 128 : i64 - %c4096_i64 = arith.constant 4096 : i64 - %c1000_i32 = arith.constant 1000 : i32 - - %ub_in = pto.castptr %c0_i64 : i64 -> !pto.ptr - %ub_out = pto.castptr %c4096_i64 : i64 -> !pto.ptr - - %false = arith.constant false - pto.mte_gm_ub %arg0, %ub_in, %c0_i64, %c128_i64 - nburst(%c32_i64, %c128_i64, %c128_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 - - pto.set_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] - pto.wait_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] - - pto.vecscope { - %_:1 = scf.for %offset = %c0 to %c1024 step %c64 iter_args(%remaining = %c1000_i32) -> (i32) { - %mask, %next_remaining = pto.plt_b32 %remaining : i32 -> !pto.mask, i32 - %vec = pto.vlds %ub_in[%offset] : !pto.ptr -> !pto.vreg<64xf32> - %out = pto.vcpadd %vec, %mask : !pto.vreg<64xf32>, !pto.mask -> !pto.vreg<64xf32> - pto.vsts %out, %ub_out[%offset], %mask : !pto.vreg<64xf32>, !pto.ptr, !pto.mask - scf.yield %next_remaining : i32 - } - } - - pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.mte_ub_gm %ub_out, %arg1, %c128_i64 - nburst(%c32_i64, %c128_i64, %c128_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64 - pto.barrier #pto.pipe - return - } -} diff --git a/test/vpto/cases/micro-op/reduction/vcpadd-tail/launch.cpp b/test/vpto/cases/micro-op/reduction/vcpadd-tail/launch.cpp deleted file mode 100644 index 08c0b9ad5..000000000 --- a/test/vpto/cases/micro-op/reduction/vcpadd-tail/launch.cpp +++ 
/dev/null @@ -1,70 +0,0 @@ -// Copyright (c) 2026 Huawei Technologies Co., Ltd. -// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). -// Please refer to the License for details. You may not use this file except in compliance with the License. -// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -// See LICENSE in the root of the software repository for the full text of the License. - -// ----------------------------------------------------------------------------- -// case: micro-op/reduction/vcpadd-tail -// family: reduction -// target_ops: pto.vcpadd -// scenarios: prefix-op, tail-mask -// NOTE: bulk-generated coverage skeleton. Parser/verifier/lowering failure is -// still a valid test conclusion in the current coverage-first phase. -// ----------------------------------------------------------------------------- -// --------------------------------------------------------------------------- -// PTOAS compatibility layer -// -// The upstream pto-isa headers reference some FP8/FP4 types and the -// __VEC_SCOPE__ marker that are not available on every AICore arch/toolchain -// combination (e.g. __NPU_ARCH__==2201). -// -// For our PTOAS-generated kernels we don't rely on these types today, but the -// headers still mention them in templates/static_asserts. Provide minimal -// fallbacks to keep compilation working on dav-c220. 
-// --------------------------------------------------------------------------- -#ifndef __VEC_SCOPE__ -#define __VEC_SCOPE__ -#endif - -#if defined(__CCE_AICORE__) && defined(__NPU_ARCH__) && (__NPU_ARCH__ == 2201) -typedef struct { unsigned char v; } hifloat8_t; -typedef struct { unsigned char v; } float8_e4m3_t; -typedef struct { unsigned char v; } float8_e5m2_t; -typedef struct { unsigned char v; } float8_e8m0_t; -typedef struct { unsigned char v; } float4_e1m2x2_t; -typedef struct { unsigned char v; } float4_e2m1x2_t; -#endif -#include - -// AICore printf support is gated behind `--cce-enable-print` on some -// toolchains. When enabled, include the CCE print header so `cce::printf` -// resolves in device compilation. -#if defined(__CCE_AICORE__) && defined(PTOAS_ENABLE_CCE_PRINT) -#include -#endif - -// Some PTO-ISA types are only available in the __CCE_AICORE__ compilation -// path, but `bisheng -xcce` still performs a host-side parse pass. -// Provide minimal fallbacks only when the corresponding header wasn't -// pulled in by the selected arch implementation. -#if !defined(__CCE_AICORE__) && !defined(TMRGSORT_HPP) -struct MrgSortExecutedNumList { - uint16_t mrgSortList0; - uint16_t mrgSortList1; - uint16_t mrgSortList2; - uint16_t mrgSortList3; -}; -#endif -#ifndef __CPU_SIM -#include "acl/acl.h" -#endif - -extern "C" __global__ [aicore] void vcpadd_tail_kernel_2d(__gm__ float *v1, - __gm__ float *v2); - -void LaunchVcpaddTail_kernel_2d(float *v1, float *v2, void *stream) { - vcpadd_tail_kernel_2d<<<1, nullptr, stream>>>((__gm__ float *)v1, (__gm__ float *)v2); -} diff --git a/test/vpto/cases/micro-op/reduction/vcpadd-tail/main.cpp b/test/vpto/cases/micro-op/reduction/vcpadd-tail/main.cpp deleted file mode 100644 index d571471dc..000000000 --- a/test/vpto/cases/micro-op/reduction/vcpadd-tail/main.cpp +++ /dev/null @@ -1,130 +0,0 @@ -// Copyright (c) 2026 Huawei Technologies Co., Ltd. 
-// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). -// Please refer to the License for details. You may not use this file except in compliance with the License. -// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -// See LICENSE in the root of the software repository for the full text of the License. - -// ----------------------------------------------------------------------------- -// case: micro-op/reduction/vcpadd-tail -// family: reduction -// target_ops: pto.vcpadd -// scenarios: prefix-op, tail-mask -// NOTE: bulk-generated coverage skeleton. Parser/verifier/lowering failure is -// still a valid test conclusion in the current coverage-first phase. -// ----------------------------------------------------------------------------- -/** -Copyright (c) 2025 Huawei Technologies Co., Ltd. -This program is free software, you can redistribute it and/or modify it under the terms and conditions of -CANN Open Software License Agreement Version 2.0 (the "License"). -Please refer to the License for details. You may not use this file except in compliance with the License. -THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -See LICENSE in the root of the software repository for the full text of the License. 
-*/ - -#include "test_common.h" -#include "acl/acl.h" -#include -#include -#include - -using namespace PtoTestCommon; - -#ifndef TMRGSORT_HPP -struct MrgSortExecutedNumList { - uint16_t mrgSortList0; - uint16_t mrgSortList1; - uint16_t mrgSortList2; - uint16_t mrgSortList3; -}; -#endif - -#define ACL_CHECK(expr) \ - do { \ - const aclError _ret = (expr); \ - if (_ret != ACL_SUCCESS) { \ - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", #expr, (int)_ret, __FILE__, __LINE__); \ - const char *_recent = aclGetRecentErrMsg(); \ - if (_recent != nullptr && _recent[0] != '\0') { \ - std::fprintf(stderr, "[ERROR] RecentErrMsg: %s\n", _recent); \ - } \ - rc = 1; \ - goto cleanup; \ - } \ - } while (0) - -void LaunchVcpaddTail_kernel_2d(float *v1, float *v2, void *stream); - -int main() { - size_t elemCount_v1 = 1024; - size_t fileSize_v1 = elemCount_v1 * sizeof(float); - size_t elemCount_v2 = 1024; - size_t fileSize_v2 = elemCount_v2 * sizeof(float); - float *v1Host = nullptr; - float *v1Device = nullptr; - float *v2Host = nullptr; - float *v2Device = nullptr; - - int rc = 0; - bool aclInited = false; - bool deviceSet = false; - int deviceId = 0; - aclrtStream stream = nullptr; - - ACL_CHECK(aclInit(nullptr)); - aclInited = true; - if (const char *envDevice = std::getenv("ACL_DEVICE_ID")) { - deviceId = std::atoi(envDevice); - } - ACL_CHECK(aclrtSetDevice(deviceId)); - deviceSet = true; - ACL_CHECK(aclrtCreateStream(&stream)); - - ACL_CHECK(aclrtMallocHost((void **)(&v1Host), fileSize_v1)); - ACL_CHECK(aclrtMallocHost((void **)(&v2Host), fileSize_v2)); - ACL_CHECK(aclrtMalloc((void **)&v1Device, fileSize_v1, ACL_MEM_MALLOC_HUGE_FIRST)); - ACL_CHECK(aclrtMalloc((void **)&v2Device, fileSize_v2, ACL_MEM_MALLOC_HUGE_FIRST)); - - ReadFile("./v1.bin", fileSize_v1, v1Host, fileSize_v1); - ReadFile("./v2.bin", fileSize_v2, v2Host, fileSize_v2); - ACL_CHECK(aclrtMemcpy(v1Device, fileSize_v1, v1Host, fileSize_v1, ACL_MEMCPY_HOST_TO_DEVICE)); - ACL_CHECK(aclrtMemcpy(v2Device, 
fileSize_v2, v2Host, fileSize_v2, ACL_MEMCPY_HOST_TO_DEVICE)); - LaunchVcpaddTail_kernel_2d(v1Device, v2Device, stream); - - ACL_CHECK(aclrtSynchronizeStream(stream)); - ACL_CHECK(aclrtMemcpy(v2Host, fileSize_v2, v2Device, fileSize_v2, ACL_MEMCPY_DEVICE_TO_HOST)); - - WriteFile("./v2.bin", v2Host, fileSize_v2); - -cleanup: - aclrtFree(v1Device); - aclrtFree(v2Device); - aclrtFreeHost(v1Host); - aclrtFreeHost(v2Host); - if (stream != nullptr) { - const aclError _ret = aclrtDestroyStream(stream); - if (_ret != ACL_SUCCESS) { - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", - "aclrtDestroyStream(stream)", (int)_ret, __FILE__, __LINE__); - } - stream = nullptr; - } - if (deviceSet) { - const aclError _ret = aclrtResetDevice(deviceId); - if (_ret != ACL_SUCCESS) { - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", - "aclrtResetDevice(deviceId)", (int)_ret, __FILE__, __LINE__); - } - } - if (aclInited) { - const aclError _ret = aclFinalize(); - if (_ret != ACL_SUCCESS) { - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", - "aclFinalize()", (int)_ret, __FILE__, __LINE__); - } - } - - return rc; -} diff --git a/test/vpto/cases/micro-op/reduction/vcpadd/compare.py b/test/vpto/cases/micro-op/reduction/vcpadd/compare.py deleted file mode 100755 index 8094ed94e..000000000 --- a/test/vpto/cases/micro-op/reduction/vcpadd/compare.py +++ /dev/null @@ -1,209 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) 2026 Huawei Technologies Co., Ltd. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. 
-# See LICENSE in the root of the software repository for the full text of the License. - -# case: micro-op/reduction/vcpadd -# family: reduction -# target_ops: pto.vcpadd -# scenarios: prefix-op, full-mask -# NOTE: bulk-generated coverage skeleton. -# coding=utf-8 - -import os -import sys -import numpy as np - - -REPEAT_BYTES = 256 - - -def _ceil_div(x, y): - return (x + y - 1) // y - - -def _packed_pred_storage_bytes(logical_elems, src_elem_bytes): - logical_elems = int(logical_elems) - src_elem_bytes = int(src_elem_bytes) - if logical_elems <= 0: - raise ValueError(f"logical_elems must be > 0, got {logical_elems}") - if src_elem_bytes not in (1, 2, 4): - raise ValueError(f"unsupported packed predicate source size: {src_elem_bytes}") - - repeat_elems = REPEAT_BYTES // src_elem_bytes - if src_elem_bytes == 4: - repeat_times = _ceil_div(logical_elems, repeat_elems) + 1 - loop_count = repeat_times // 2 - return loop_count * 16 - - repeat_times = _ceil_div(logical_elems, repeat_elems) - return repeat_times * (repeat_elems // 8) - - -def compare_bin(golden_path, output_path, dtype, eps): - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - dtype_np = np.dtype(dtype) - golden = np.fromfile(golden_path, dtype=dtype_np) - output = np.fromfile(output_path, dtype=dtype_np) - if golden.shape != output.shape: - print(f"[ERROR] Shape mismatch: {golden_path} {golden.shape} vs {output_path} {output.shape}") - return False - if not np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True): - if golden.size: - if np.issubdtype(dtype_np, np.floating): - g = golden.astype(np.float64, copy=False) - o = output.astype(np.float64, copy=False) - elif np.issubdtype(dtype_np, np.integer) or np.issubdtype(dtype_np, np.unsignedinteger): - g = golden.astype(np.int64, copy=False) - o = output.astype(np.int64, copy=False) - else: - g = 
golden.astype(np.float64, copy=False) - o = output.astype(np.float64, copy=False) - abs_diff = np.abs(g - o) - idx = int(np.argmax(abs_diff)) - diff = float(abs_diff[idx]) - print( - f"[ERROR] Mismatch: {golden_path} vs {output_path}, max diff={diff} at idx={idx} " - f"(golden={g[idx]}, out={o[idx]}, dtype={dtype_np})" - ) - else: - print(f"[ERROR] Mismatch: {golden_path} vs {output_path}, empty buffers, dtype={dtype_np}") - return False - return True - - -def compare_bin_prefix(golden_path, output_path, dtype, eps, count): - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - try: - count = int(count) - except Exception: - print(f"[ERROR] Invalid prefix count: {count}") - return False - if count <= 0: - print(f"[ERROR] Invalid prefix count: {count}") - return False - - dtype_np = np.dtype(dtype) - golden = np.fromfile(golden_path, dtype=dtype_np, count=count) - output = np.fromfile(output_path, dtype=dtype_np, count=count) - - if golden.size != count or output.size != count: - print( - f"[ERROR] Prefix read too small: need={count} elems, " - f"golden={golden.size}, out={output.size}" - ) - return False - - if not np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True): - if golden.size: - if np.issubdtype(dtype_np, np.floating): - g = golden.astype(np.float64, copy=False) - o = output.astype(np.float64, copy=False) - elif np.issubdtype(dtype_np, np.integer) or np.issubdtype(dtype_np, np.unsignedinteger): - g = golden.astype(np.int64, copy=False) - o = output.astype(np.int64, copy=False) - else: - g = golden.astype(np.float64, copy=False) - o = output.astype(np.float64, copy=False) - abs_diff = np.abs(g - o) - idx = int(np.argmax(abs_diff)) - diff = float(abs_diff[idx]) - print( - f"[ERROR] Mismatch (prefix): {golden_path} vs {output_path}, max diff={diff} at idx={idx} " - f"(golden={g[idx]}, out={o[idx]}, 
dtype={dtype_np}, count={count})" - ) - else: - print(f"[ERROR] Mismatch (prefix): {golden_path} vs {output_path}, empty buffers, dtype={dtype_np}") - return False - return True - - -def compare_packed_pred_mask(golden_path, output_path, logical_elems, src_elem_bytes): - """ - Compare outputs of pto.tcmp / pto.tcmps. - - PTO-ISA stores packed predicate results as a linear PK byte stream via - `psts`, with the exact written prefix length determined by the typed - TCMP/TCMPS repeat schedule. Compare only that semantic prefix. - """ - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - try: - logical_elems = int(logical_elems) - src_elem_bytes = int(src_elem_bytes) - except Exception: - print( - "[ERROR] Invalid packed mask compare arguments: " - f"logical_elems={logical_elems} src_elem_bytes={src_elem_bytes}" - ) - return False - if logical_elems <= 0 or src_elem_bytes <= 0: - print( - "[ERROR] Invalid packed mask compare arguments: " - f"logical_elems={logical_elems} src_elem_bytes={src_elem_bytes}" - ) - return False - - golden = np.fromfile(golden_path, dtype=np.uint8) - output = np.fromfile(output_path, dtype=np.uint8) - try: - prefix_bytes = _packed_pred_storage_bytes(logical_elems, src_elem_bytes) - except ValueError as exc: - print(f"[ERROR] {exc}") - return False - - if golden.size < prefix_bytes or output.size < prefix_bytes: - print( - f"[ERROR] Packed mask buffer too small: need={prefix_bytes} bytes, " - f"golden={golden.size}, out={output.size}" - ) - return False - - golden_sel = golden[:prefix_bytes] - output_sel = output[:prefix_bytes] - - if not np.array_equal(golden_sel, output_sel): - diff = np.nonzero(golden_sel != output_sel)[0] - idx = int(diff[0]) if diff.size else 0 - print( - f"[ERROR] Mismatch (packed mask): {golden_path} vs {output_path}, first diff at idx={idx} " - 
f"(golden={int(golden_sel[idx])}, out={int(output_sel[idx])}, " - f"logical_elems={logical_elems}, src_elem_bytes={src_elem_bytes}, prefix_bytes={prefix_bytes})" - ) - return False - return True - - -def main(): - strict = os.getenv("COMPARE_STRICT", "1") != "0" - ok = True - ok = compare_bin("golden_v2.bin", "v2.bin", np.float32, 0.0001) and ok - if not ok: - if strict: - print("[ERROR] compare failed") - sys.exit(2) - print("[WARN] compare failed (non-gating)") - return - print("[INFO] compare passed") - - -if __name__ == "__main__": - main() diff --git a/test/vpto/cases/micro-op/reduction/vcpadd/golden.py b/test/vpto/cases/micro-op/reduction/vcpadd/golden.py deleted file mode 100755 index eb41c69f0..000000000 --- a/test/vpto/cases/micro-op/reduction/vcpadd/golden.py +++ /dev/null @@ -1,55 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) 2026 Huawei Technologies Co., Ltd. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. 
- -# coding=utf-8 - -import argparse -from pathlib import Path - -import numpy as np - - -ROWS = 32 -COLS = 32 -SEED = 19 -LANES = 64 -def generate(output_dir: Path, seed: int) -> None: - rng = np.random.default_rng(seed) - v1 = rng.uniform(-8.0, 8.0, size=(ROWS, COLS)).astype(np.float32) - - v2 = np.zeros((ROWS, COLS), dtype=np.float32) - golden_v2 = np.zeros((ROWS, COLS), dtype=np.float32) - flat_in = v1.reshape(-1) - flat_out = golden_v2.reshape(-1) - for offset in range(0, flat_in.size, LANES): - chunk = flat_in[offset:offset + LANES] - pair_count = (chunk.size + 1) // 2 - for i in range(pair_count): - a = chunk[2 * i] - b = chunk[2 * i + 1] if (2 * i + 1) < chunk.size else np.float32(0.0) - # VCPADD writes pair-reduction results to low half lanes. - flat_out[offset + i] = np.float32(a + b) - - - output_dir.mkdir(parents=True, exist_ok=True) - v1.reshape(-1).tofile(output_dir / "v1.bin") - v2.reshape(-1).tofile(output_dir / "v2.bin") - golden_v2.reshape(-1).tofile(output_dir / "golden_v2.bin") - - -def main() -> None: - parser = argparse.ArgumentParser() - parser.add_argument("--output-dir", type=Path, default=Path(".")) - parser.add_argument("--seed", type=int, default=SEED) - args = parser.parse_args() - generate(args.output_dir, args.seed) - - -if __name__ == "__main__": - main() diff --git a/test/vpto/cases/micro-op/reduction/vcpadd/kernel.pto b/test/vpto/cases/micro-op/reduction/vcpadd/kernel.pto deleted file mode 100644 index 3b4bf2f9d..000000000 --- a/test/vpto/cases/micro-op/reduction/vcpadd/kernel.pto +++ /dev/null @@ -1,51 +0,0 @@ -// ----------------------------------------------------------------------------- -// case: micro-op/reduction/vcpadd -// family: reduction -// target_ops: pto.vcpadd -// scenarios: prefix-op, full-mask -// NOTE: bulk-generated coverage skeleton. Parser/verifier/lowering failure is -// still a valid test conclusion in the current coverage-first phase. 
-// ----------------------------------------------------------------------------- -module attributes {pto.target_arch = "a5", pto.kernel_kind = #pto.kernel_kind} { - func.func @vcpadd_kernel_2d(%arg0: !pto.ptr, %arg1: !pto.ptr) attributes {pto.kernel} { - %c0 = arith.constant 0 : index - %c1 = arith.constant 1 : index - %c64 = arith.constant 64 : index - %c1024 = arith.constant 1024 : index - %c0_i64 = arith.constant 0 : i64 - %c1_i64 = arith.constant 1 : i64 - %c32_i64 = arith.constant 32 : i64 - %c128_i64 = arith.constant 128 : i64 - %c4096_i64 = arith.constant 4096 : i64 - %c1024_i32 = arith.constant 1024 : i32 - - %ub_in = pto.castptr %c0_i64 : i64 -> !pto.ptr - %ub_out = pto.castptr %c4096_i64 : i64 -> !pto.ptr - - %false = arith.constant false - pto.mte_gm_ub %arg0, %ub_in, %c0_i64, %c128_i64 - nburst(%c32_i64, %c128_i64, %c128_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 - - pto.set_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] - pto.wait_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] - - pto.vecscope { - %_:1 = scf.for %offset = %c0 to %c1024 step %c64 iter_args(%remaining = %c1024_i32) -> (i32) { - %mask, %next_remaining = pto.plt_b32 %remaining : i32 -> !pto.mask, i32 - %vec = pto.vlds %ub_in[%offset] : !pto.ptr -> !pto.vreg<64xf32> - %out = pto.vcpadd %vec, %mask : !pto.vreg<64xf32>, !pto.mask -> !pto.vreg<64xf32> - pto.vsts %out, %ub_out[%offset], %mask : !pto.vreg<64xf32>, !pto.ptr, !pto.mask - scf.yield %next_remaining : i32 - } - } - - pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.mte_ub_gm %ub_out, %arg1, %c128_i64 - nburst(%c32_i64, %c128_i64, %c128_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64 - pto.barrier #pto.pipe - return - } -} diff --git a/test/vpto/cases/micro-op/reduction/vcpadd/launch.cpp b/test/vpto/cases/micro-op/reduction/vcpadd/launch.cpp deleted file mode 100644 index ad26d59b2..000000000 --- a/test/vpto/cases/micro-op/reduction/vcpadd/launch.cpp +++ /dev/null @@ -1,70 +0,0 
@@ -// Copyright (c) 2026 Huawei Technologies Co., Ltd. -// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). -// Please refer to the License for details. You may not use this file except in compliance with the License. -// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -// See LICENSE in the root of the software repository for the full text of the License. - -// ----------------------------------------------------------------------------- -// case: micro-op/reduction/vcpadd -// family: reduction -// target_ops: pto.vcpadd -// scenarios: prefix-op, full-mask -// NOTE: bulk-generated coverage skeleton. Parser/verifier/lowering failure is -// still a valid test conclusion in the current coverage-first phase. -// ----------------------------------------------------------------------------- -// --------------------------------------------------------------------------- -// PTOAS compatibility layer -// -// The upstream pto-isa headers reference some FP8/FP4 types and the -// __VEC_SCOPE__ marker that are not available on every AICore arch/toolchain -// combination (e.g. __NPU_ARCH__==2201). -// -// For our PTOAS-generated kernels we don't rely on these types today, but the -// headers still mention them in templates/static_asserts. Provide minimal -// fallbacks to keep compilation working on dav-c220. 
-// --------------------------------------------------------------------------- -#ifndef __VEC_SCOPE__ -#define __VEC_SCOPE__ -#endif - -#if defined(__CCE_AICORE__) && defined(__NPU_ARCH__) && (__NPU_ARCH__ == 2201) -typedef struct { unsigned char v; } hifloat8_t; -typedef struct { unsigned char v; } float8_e4m3_t; -typedef struct { unsigned char v; } float8_e5m2_t; -typedef struct { unsigned char v; } float8_e8m0_t; -typedef struct { unsigned char v; } float4_e1m2x2_t; -typedef struct { unsigned char v; } float4_e2m1x2_t; -#endif -#include - -// AICore printf support is gated behind `--cce-enable-print` on some -// toolchains. When enabled, include the CCE print header so `cce::printf` -// resolves in device compilation. -#if defined(__CCE_AICORE__) && defined(PTOAS_ENABLE_CCE_PRINT) -#include -#endif - -// Some PTO-ISA types are only available in the __CCE_AICORE__ compilation -// path, but `bisheng -xcce` still performs a host-side parse pass. -// Provide minimal fallbacks only when the corresponding header wasn't -// pulled in by the selected arch implementation. -#if !defined(__CCE_AICORE__) && !defined(TMRGSORT_HPP) -struct MrgSortExecutedNumList { - uint16_t mrgSortList0; - uint16_t mrgSortList1; - uint16_t mrgSortList2; - uint16_t mrgSortList3; -}; -#endif -#ifndef __CPU_SIM -#include "acl/acl.h" -#endif - -extern "C" __global__ [aicore] void vcpadd_kernel_2d(__gm__ float *v1, - __gm__ float *v2); - -void LaunchVcpadd_kernel_2d(float *v1, float *v2, void *stream) { - vcpadd_kernel_2d<<<1, nullptr, stream>>>((__gm__ float *)v1, (__gm__ float *)v2); -} diff --git a/test/vpto/cases/micro-op/reduction/vcpadd/main.cpp b/test/vpto/cases/micro-op/reduction/vcpadd/main.cpp deleted file mode 100644 index 7f62d2606..000000000 --- a/test/vpto/cases/micro-op/reduction/vcpadd/main.cpp +++ /dev/null @@ -1,130 +0,0 @@ -// Copyright (c) 2026 Huawei Technologies Co., Ltd. 
-// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). -// Please refer to the License for details. You may not use this file except in compliance with the License. -// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -// See LICENSE in the root of the software repository for the full text of the License. - -// ----------------------------------------------------------------------------- -// case: micro-op/reduction/vcpadd -// family: reduction -// target_ops: pto.vcpadd -// scenarios: prefix-op, full-mask -// NOTE: bulk-generated coverage skeleton. Parser/verifier/lowering failure is -// still a valid test conclusion in the current coverage-first phase. -// ----------------------------------------------------------------------------- -/** -Copyright (c) 2025 Huawei Technologies Co., Ltd. -This program is free software, you can redistribute it and/or modify it under the terms and conditions of -CANN Open Software License Agreement Version 2.0 (the "License"). -Please refer to the License for details. You may not use this file except in compliance with the License. -THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -See LICENSE in the root of the software repository for the full text of the License. 
-*/ - -#include "test_common.h" -#include "acl/acl.h" -#include -#include -#include - -using namespace PtoTestCommon; - -#ifndef TMRGSORT_HPP -struct MrgSortExecutedNumList { - uint16_t mrgSortList0; - uint16_t mrgSortList1; - uint16_t mrgSortList2; - uint16_t mrgSortList3; -}; -#endif - -#define ACL_CHECK(expr) \ - do { \ - const aclError _ret = (expr); \ - if (_ret != ACL_SUCCESS) { \ - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", #expr, (int)_ret, __FILE__, __LINE__); \ - const char *_recent = aclGetRecentErrMsg(); \ - if (_recent != nullptr && _recent[0] != '\0') { \ - std::fprintf(stderr, "[ERROR] RecentErrMsg: %s\n", _recent); \ - } \ - rc = 1; \ - goto cleanup; \ - } \ - } while (0) - -void LaunchVcpadd_kernel_2d(float *v1, float *v2, void *stream); - -int main() { - size_t elemCount_v1 = 1024; - size_t fileSize_v1 = elemCount_v1 * sizeof(float); - size_t elemCount_v2 = 1024; - size_t fileSize_v2 = elemCount_v2 * sizeof(float); - float *v1Host = nullptr; - float *v1Device = nullptr; - float *v2Host = nullptr; - float *v2Device = nullptr; - - int rc = 0; - bool aclInited = false; - bool deviceSet = false; - int deviceId = 0; - aclrtStream stream = nullptr; - - ACL_CHECK(aclInit(nullptr)); - aclInited = true; - if (const char *envDevice = std::getenv("ACL_DEVICE_ID")) { - deviceId = std::atoi(envDevice); - } - ACL_CHECK(aclrtSetDevice(deviceId)); - deviceSet = true; - ACL_CHECK(aclrtCreateStream(&stream)); - - ACL_CHECK(aclrtMallocHost((void **)(&v1Host), fileSize_v1)); - ACL_CHECK(aclrtMallocHost((void **)(&v2Host), fileSize_v2)); - ACL_CHECK(aclrtMalloc((void **)&v1Device, fileSize_v1, ACL_MEM_MALLOC_HUGE_FIRST)); - ACL_CHECK(aclrtMalloc((void **)&v2Device, fileSize_v2, ACL_MEM_MALLOC_HUGE_FIRST)); - - ReadFile("./v1.bin", fileSize_v1, v1Host, fileSize_v1); - ReadFile("./v2.bin", fileSize_v2, v2Host, fileSize_v2); - ACL_CHECK(aclrtMemcpy(v1Device, fileSize_v1, v1Host, fileSize_v1, ACL_MEMCPY_HOST_TO_DEVICE)); - ACL_CHECK(aclrtMemcpy(v2Device, 
fileSize_v2, v2Host, fileSize_v2, ACL_MEMCPY_HOST_TO_DEVICE)); - LaunchVcpadd_kernel_2d(v1Device, v2Device, stream); - - ACL_CHECK(aclrtSynchronizeStream(stream)); - ACL_CHECK(aclrtMemcpy(v2Host, fileSize_v2, v2Device, fileSize_v2, ACL_MEMCPY_DEVICE_TO_HOST)); - - WriteFile("./v2.bin", v2Host, fileSize_v2); - -cleanup: - aclrtFree(v1Device); - aclrtFree(v2Device); - aclrtFreeHost(v1Host); - aclrtFreeHost(v2Host); - if (stream != nullptr) { - const aclError _ret = aclrtDestroyStream(stream); - if (_ret != ACL_SUCCESS) { - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", - "aclrtDestroyStream(stream)", (int)_ret, __FILE__, __LINE__); - } - stream = nullptr; - } - if (deviceSet) { - const aclError _ret = aclrtResetDevice(deviceId); - if (_ret != ACL_SUCCESS) { - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", - "aclrtResetDevice(deviceId)", (int)_ret, __FILE__, __LINE__); - } - } - if (aclInited) { - const aclError _ret = aclFinalize(); - if (_ret != ACL_SUCCESS) { - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", - "aclFinalize()", (int)_ret, __FILE__, __LINE__); - } - } - - return rc; -} diff --git a/test/vpto/cases/micro-op/scalar-load-store/load-store-scalar-ub/golden.py b/test/vpto/cases/micro-op/scalar-load-store/load-store-scalar-ub/golden.py index a4fb8dd28..42f4c71e4 100644 --- a/test/vpto/cases/micro-op/scalar-load-store/load-store-scalar-ub/golden.py +++ b/test/vpto/cases/micro-op/scalar-load-store/load-store-scalar-ub/golden.py @@ -15,7 +15,8 @@ import numpy as np -ELEMS = 1024 +ELEMS = 32 +ACTIVE_ELEMS = 8 SEED = 19 @@ -23,7 +24,8 @@ def generate(output_dir: Path, seed: int) -> None: rng = np.random.default_rng(seed) v1 = rng.integers(-20000, 20000, size=ELEMS, dtype=np.int16) v2 = np.zeros(ELEMS, dtype=np.int16) - golden_v2 = (v1.astype(np.int32) + 4).astype(np.int16) + golden_v2 = v1.copy() + golden_v2[:ACTIVE_ELEMS] = (v1[:ACTIVE_ELEMS].astype(np.int32) + 4).astype(np.int16) output_dir.mkdir(parents=True, exist_ok=True) 
v1.tofile(output_dir / "v1.bin") diff --git a/test/vpto/cases/micro-op/scalar-load-store/load-store-scalar-ub/kernel.pto b/test/vpto/cases/micro-op/scalar-load-store/load-store-scalar-ub/kernel.pto index 5957546cc..0e952e74e 100644 --- a/test/vpto/cases/micro-op/scalar-load-store/load-store-scalar-ub/kernel.pto +++ b/test/vpto/cases/micro-op/scalar-load-store/load-store-scalar-ub/kernel.pto @@ -8,10 +8,9 @@ module attributes {pto.target_arch = "a5", pto.kernel_kind = #pto.kernel_kind, %arg1: !pto.ptr) attributes {pto.aicore} { %c0 = arith.constant 0 : index %c1 = arith.constant 1 : index - %c1024 = arith.constant 1024 : index + %c8 = arith.constant 8 : index %c0_i64 = arith.constant 0 : i64 %c1_i64 = arith.constant 1 : i64 - %c32_i64 = arith.constant 32 : i64 %c64_i64 = arith.constant 64 : i64 %c2048_i64 = arith.constant 2048 : i64 %c7_i16 = arith.constant 7 : i16 @@ -20,16 +19,18 @@ module attributes {pto.target_arch = "a5", pto.kernel_kind = #pto.kernel_kind !pto.ptr %ub_out = pto.castptr %c2048_i64 : i64 -> !pto.ptr - %false = arith.constant false pto.mte_gm_ub %arg0, %ub_in, %c0_i64, %c64_i64 - nburst(%c32_i64, %c64_i64, %c64_i64) + nburst(%c1_i64, %c64_i64, %c64_i64) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 + pto.mte_gm_ub %arg0, %ub_out, %c0_i64, %c64_i64 + nburst(%c1_i64, %c64_i64, %c64_i64) : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 pto.set_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] pto.wait_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] pto.vecscope { - scf.for %offset = %c0 to %c1024 step %c1 { + scf.for %offset = %c0 to %c8 step %c1 { %loaded = pto.load_scalar %ub_in[%offset] : !pto.ptr -> i16 %biased = arith.addi %loaded, %c7_i16 : i16 pto.store_scalar %biased, %ub_out[%offset] : !pto.ptr, i16 @@ -42,7 +43,7 @@ module attributes {pto.target_arch = "a5", pto.kernel_kind = #pto.kernel_kind, !pto.ptr, i64, i64, i64, i64 pto.barrier #pto.pipe return diff --git a/test/vpto/cases/micro-op/scalar-load-store/load-store-scalar-ub/main.cpp 
b/test/vpto/cases/micro-op/scalar-load-store/load-store-scalar-ub/main.cpp index ff7a3e98b..49e34aeec 100644 --- a/test/vpto/cases/micro-op/scalar-load-store/load-store-scalar-ub/main.cpp +++ b/test/vpto/cases/micro-op/scalar-load-store/load-store-scalar-ub/main.cpp @@ -50,7 +50,7 @@ struct MrgSortExecutedNumList { void LaunchLoad_store_scalar_ub_kernel(int16_t *v1, int16_t *v2, void *stream); int main() { - size_t elemCount_v1 = 1024; + size_t elemCount_v1 = 32; size_t fileSize_v1 = elemCount_v1 * sizeof(int16_t); int16_t *v1Host = nullptr; int16_t *v2Host = nullptr; diff --git a/test/vpto/cases/micro-op/simt/simt-atomic-cas-f32-core/golden.py b/test/vpto/cases/micro-op/simt/simt-atomic-cas-f32-core/golden.py deleted file mode 100644 index 1a6176230..000000000 --- a/test/vpto/cases/micro-op/simt/simt-atomic-cas-f32-core/golden.py +++ /dev/null @@ -1,40 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) 2026 Huawei Technologies Co., Ltd. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. 
- -import argparse -from pathlib import Path - -import numpy as np - -ELEMS = 1024 - - -def generate(output_dir: Path) -> None: - output_dir.mkdir(parents=True, exist_ok=True) - v1 = np.full(ELEMS, -1.0, dtype=np.float32) - golden_v1 = np.full(ELEMS, -1.0, dtype=np.float32) - v1[:8] = np.full(8, 10.0, dtype=np.float32) - v1[8:16] = np.full(8, 11.0, dtype=np.float32) - golden_v1[:8] = np.full(8, 15.0, dtype=np.float32) - golden_v1[8:16] = np.full(8, 11.0, dtype=np.float32) - golden_v1[16:24] = np.full(8, 10.0, dtype=np.float32) - golden_v1[24:32] = np.full(8, 11.0, dtype=np.float32) - v1.tofile(output_dir / "v1.bin") - golden_v1.tofile(output_dir / "golden_v1.bin") - - -def main(): - parser = argparse.ArgumentParser() - parser.add_argument("--output-dir", type=Path, default=Path(".")) - args = parser.parse_args() - generate(args.output_dir) - - -if __name__ == "__main__": - main() diff --git a/test/vpto/cases/micro-op/simt/simt-atomic-cas-f32-core/kernel.pto b/test/vpto/cases/micro-op/simt/simt-atomic-cas-f32-core/kernel.pto deleted file mode 100644 index 676878c4d..000000000 --- a/test/vpto/cases/micro-op/simt/simt-atomic-cas-f32-core/kernel.pto +++ /dev/null @@ -1,90 +0,0 @@ -module attributes {pto.target_arch = "a5", pto.kernel_kind = #pto.kernel_kind} { - func.func @simt_atomic_cas_f32_core_kernel(%arg0: !pto.ptr) attributes {pto.aicore} { - %c0 = arith.constant 0 : index - %c1 = arith.constant 1 : index - %c2 = arith.constant 2 : index - %c3 = arith.constant 3 : index - %c4 = arith.constant 4 : index - %c5 = arith.constant 5 : index - %c6 = arith.constant 6 : index - %c7 = arith.constant 7 : index - %c8 = arith.constant 8 : index - %c9 = arith.constant 9 : index - %c10 = arith.constant 10 : index - %c11 = arith.constant 11 : index - %c12 = arith.constant 12 : index - %c13 = arith.constant 13 : index - %c14 = arith.constant 14 : index - %c15 = arith.constant 15 : index - %c16 = arith.constant 16 : index - %c17 = arith.constant 17 : index - %c18 = arith.constant 
18 : index - %c19 = arith.constant 19 : index - %c20 = arith.constant 20 : index - %c21 = arith.constant 21 : index - %c22 = arith.constant 22 : index - %c23 = arith.constant 23 : index - %c24 = arith.constant 24 : index - %c25 = arith.constant 25 : index - %c26 = arith.constant 26 : index - %c27 = arith.constant 27 : index - %c28 = arith.constant 28 : index - %c29 = arith.constant 29 : index - %c30 = arith.constant 30 : index - %c31 = arith.constant 31 : index - %v10 = arith.constant 1.000000e+01 : f32 - %v15 = arith.constant 1.500000e+01 : f32 - - %ptr0 = pto.addptr %arg0, %c0 : !pto.ptr -> !pto.ptr - %old0 = pto.atomic_cas %ptr0, %v10, %v15 l2cache(nmfv) : !pto.ptr, f32 -> f32 - %ptr1 = pto.addptr %arg0, %c1 : !pto.ptr -> !pto.ptr - %old1 = pto.atomic_cas %ptr1, %v10, %v15 l2cache(nmlv) : !pto.ptr, f32 -> f32 - %ptr2 = pto.addptr %arg0, %c2 : !pto.ptr -> !pto.ptr - %old2 = pto.atomic_cas %ptr2, %v10, %v15 l2cache(nmprs) : !pto.ptr, f32 -> f32 - %ptr3 = pto.addptr %arg0, %c3 : !pto.ptr -> !pto.ptr - %old3 = pto.atomic_cas %ptr3, %v10, %v15 l2cache(nmred) : !pto.ptr, f32 -> f32 - %ptr4 = pto.addptr %arg0, %c4 : !pto.ptr -> !pto.ptr - %old4 = pto.atomic_cas %ptr4, %v10, %v15 l2cache(naci) : !pto.ptr, f32 -> f32 - %ptr5 = pto.addptr %arg0, %c5 : !pto.ptr -> !pto.ptr - %old5 = pto.atomic_cas %ptr5, %v10, %v15 l2cache(napw) : !pto.ptr, f32 -> f32 - %ptr6 = pto.addptr %arg0, %c6 : !pto.ptr -> !pto.ptr - %old6 = pto.atomic_cas %ptr6, %v10, %v15 l2cache(napi) : !pto.ptr, f32 -> f32 - %ptr7 = pto.addptr %arg0, %c7 : !pto.ptr -> !pto.ptr - %old7 = pto.atomic_cas %ptr7, %v10, %v15 l2cache(nared) : !pto.ptr, f32 -> f32 - %ptr8 = pto.addptr %arg0, %c8 : !pto.ptr -> !pto.ptr - %old8 = pto.atomic_cas %ptr8, %v10, %v15 l2cache(wbhfv) : !pto.ptr, f32 -> f32 - %ptr9 = pto.addptr %arg0, %c9 : !pto.ptr -> !pto.ptr - %old9 = pto.atomic_cas %ptr9, %v10, %v15 l2cache(wbhlv) : !pto.ptr, f32 -> f32 - %ptr10 = pto.addptr %arg0, %c10 : !pto.ptr -> !pto.ptr - %old10 = pto.atomic_cas %ptr10, 
%v10, %v15 l2cache(wbhprs) : !pto.ptr, f32 -> f32 - %ptr11 = pto.addptr %arg0, %c11 : !pto.ptr -> !pto.ptr - %old11 = pto.atomic_cas %ptr11, %v10, %v15 l2cache(wbhred) : !pto.ptr, f32 -> f32 - %ptr12 = pto.addptr %arg0, %c12 : !pto.ptr -> !pto.ptr - %old12 = pto.atomic_cas %ptr12, %v10, %v15 l2cache(wtsfv) : !pto.ptr, f32 -> f32 - %ptr13 = pto.addptr %arg0, %c13 : !pto.ptr -> !pto.ptr - %old13 = pto.atomic_cas %ptr13, %v10, %v15 l2cache(wtslv) : !pto.ptr, f32 -> f32 - %ptr14 = pto.addptr %arg0, %c14 : !pto.ptr -> !pto.ptr - %old14 = pto.atomic_cas %ptr14, %v10, %v15 l2cache(wtsprs) : !pto.ptr, f32 -> f32 - %ptr15 = pto.addptr %arg0, %c15 : !pto.ptr -> !pto.ptr - %old15 = pto.atomic_cas %ptr15, %v10, %v15 l2cache(wtsred) : !pto.ptr, f32 -> f32 - - pto.store %old0, %arg0[%c16] : !pto.ptr, f32 - pto.store %old1, %arg0[%c17] : !pto.ptr, f32 - pto.store %old2, %arg0[%c18] : !pto.ptr, f32 - pto.store %old3, %arg0[%c19] : !pto.ptr, f32 - pto.store %old4, %arg0[%c20] : !pto.ptr, f32 - pto.store %old5, %arg0[%c21] : !pto.ptr, f32 - pto.store %old6, %arg0[%c22] : !pto.ptr, f32 - pto.store %old7, %arg0[%c23] : !pto.ptr, f32 - pto.store %old8, %arg0[%c24] : !pto.ptr, f32 - pto.store %old9, %arg0[%c25] : !pto.ptr, f32 - pto.store %old10, %arg0[%c26] : !pto.ptr, f32 - pto.store %old11, %arg0[%c27] : !pto.ptr, f32 - pto.store %old12, %arg0[%c28] : !pto.ptr, f32 - pto.store %old13, %arg0[%c29] : !pto.ptr, f32 - pto.store %old14, %arg0[%c30] : !pto.ptr, f32 - pto.store %old15, %arg0[%c31] : !pto.ptr, f32 - pto.barrier #pto.pipe - return - } -} diff --git a/test/vpto/cases/micro-op/simt/simt-atomic-cas-f32-core/launch.cpp b/test/vpto/cases/micro-op/simt/simt-atomic-cas-f32-core/launch.cpp deleted file mode 100644 index 6f780fe69..000000000 --- a/test/vpto/cases/micro-op/simt/simt-atomic-cas-f32-core/launch.cpp +++ /dev/null @@ -1,19 +0,0 @@ -// Copyright (c) 2026 Huawei Technologies Co., Ltd. 
-// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). -// Please refer to the License for details. You may not use this file except in compliance with the License. -// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -// See LICENSE in the root of the software repository for the full text of the License. - -#ifndef __VEC_SCOPE__ -#define __VEC_SCOPE__ -#endif -#include -#ifndef __CPU_SIM -#include "acl/acl.h" -#endif -extern "C" __global__ [aicore] void simt_atomic_cas_f32_core_kernel(__gm__ float *v1); -void LaunchSimt_atomic_cas_f32_core_kernel(float *v1, void *stream) { - simt_atomic_cas_f32_core_kernel<<<1, nullptr, stream>>>((__gm__ float *)v1); -} diff --git a/test/vpto/cases/micro-op/simt/simt-atomic-cas-f32-core/main.cpp b/test/vpto/cases/micro-op/simt/simt-atomic-cas-f32-core/main.cpp deleted file mode 100644 index 2217c0901..000000000 --- a/test/vpto/cases/micro-op/simt/simt-atomic-cas-f32-core/main.cpp +++ /dev/null @@ -1,57 +0,0 @@ -// Copyright (c) 2026 Huawei Technologies Co., Ltd. -// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). -// Please refer to the License for details. You may not use this file except in compliance with the License. -// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -// See LICENSE in the root of the software repository for the full text of the License. 
- -#include "test_common.h" -#include "acl/acl.h" -#include -#include - -using namespace PtoTestCommon; - -#define ACL_CHECK(expr) do { const aclError _ret = (expr); if (_ret != ACL_SUCCESS) { std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", #expr, (int)_ret, __FILE__, __LINE__); rc = 1; goto cleanup; } } while (0) - -void LaunchSimt_atomic_cas_f32_core_kernel(float *v1, void *stream); - -int main() { - size_t elemCount_v1 = 1024; - size_t fileSize_v1 = elemCount_v1 * sizeof(float); - float *v1Host = nullptr; - float *v1Device = nullptr; - int rc = 0; - bool aclInited = false; - bool deviceSet = false; - int deviceId = 0; - aclrtStream stream = nullptr; - - ACL_CHECK(aclInit(nullptr)); - aclInited = true; - if (const char *envDevice = std::getenv("ACL_DEVICE_ID")) - deviceId = std::atoi(envDevice); - ACL_CHECK(aclrtSetDevice(deviceId)); - deviceSet = true; - ACL_CHECK(aclrtCreateStream(&stream)); - ACL_CHECK(aclrtMallocHost((void **)(&v1Host), fileSize_v1)); - ACL_CHECK(aclrtMalloc((void **)&v1Device, fileSize_v1, ACL_MEM_MALLOC_HUGE_FIRST)); - ReadFile("./v1.bin", fileSize_v1, v1Host, fileSize_v1); - ACL_CHECK(aclrtMemcpy(v1Device, fileSize_v1, v1Host, fileSize_v1, ACL_MEMCPY_HOST_TO_DEVICE)); - LaunchSimt_atomic_cas_f32_core_kernel(v1Device, stream); - ACL_CHECK(aclrtSynchronizeStream(stream)); - ACL_CHECK(aclrtMemcpy(v1Host, fileSize_v1, v1Device, fileSize_v1, ACL_MEMCPY_DEVICE_TO_HOST)); - WriteFile("./v1.bin", v1Host, fileSize_v1); - -cleanup: - aclrtFree(v1Device); - aclrtFreeHost(v1Host); - if (stream) - aclrtDestroyStream(stream); - if (deviceSet) - aclrtResetDevice(deviceId); - if (aclInited) - aclFinalize(); - return rc; -} diff --git a/test/vpto/cases/micro-op/simt/simt-atomic-exch-f32-core/compare.py b/test/vpto/cases/micro-op/simt/simt-atomic-exch-f32-core/compare.py deleted file mode 100644 index 4766e865b..000000000 --- a/test/vpto/cases/micro-op/simt/simt-atomic-exch-f32-core/compare.py +++ /dev/null @@ -1,33 +0,0 @@ -#!/usr/bin/env python3 
-# Copyright (c) 2026 Huawei Technologies Co., Ltd. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. - -import os -import sys - -import numpy as np - - -def main(): - strict = os.getenv("COMPARE_STRICT", "1") != "0" - golden = np.fromfile("golden_v1.bin", dtype=np.float32) - out = np.fromfile("v1.bin", dtype=np.float32) - ok = golden.shape == out.shape and np.array_equal(golden, out) - if not ok: - idxs = np.nonzero(golden != out)[0] - idx = int(idxs[0]) if idxs.size else 0 - print( - f"[ERROR] mismatch at idx={idx}, golden={float(golden[idx])}, out={float(out[idx])}" - ) - if strict: - sys.exit(2) - print("[INFO] compare passed" if ok else "[WARN] compare failed (non-gating)") - - -if __name__ == "__main__": - main() diff --git a/test/vpto/cases/micro-op/simt/simt-atomic-exch-f32-core/golden.py b/test/vpto/cases/micro-op/simt/simt-atomic-exch-f32-core/golden.py deleted file mode 100644 index 403c392e5..000000000 --- a/test/vpto/cases/micro-op/simt/simt-atomic-exch-f32-core/golden.py +++ /dev/null @@ -1,37 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) 2026 Huawei Technologies Co., Ltd. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. 
-# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. - -import argparse -from pathlib import Path - -import numpy as np - -ELEMS = 1024 - - -def generate(output_dir: Path) -> None: - output_dir.mkdir(parents=True, exist_ok=True) - v1 = np.full(ELEMS, -1.0, dtype=np.float32) - golden_v1 = np.full(ELEMS, -1.0, dtype=np.float32) - v1[:16] = np.full(16, 10.0, dtype=np.float32) - golden_v1[:16] = np.full(16, 15.0, dtype=np.float32) - golden_v1[16:32] = np.full(16, 10.0, dtype=np.float32) - v1.tofile(output_dir / "v1.bin") - golden_v1.tofile(output_dir / "golden_v1.bin") - - -def main(): - parser = argparse.ArgumentParser() - parser.add_argument("--output-dir", type=Path, default=Path(".")) - args = parser.parse_args() - generate(args.output_dir) - - -if __name__ == "__main__": - main() diff --git a/test/vpto/cases/micro-op/simt/simt-atomic-exch-f32-core/kernel.pto b/test/vpto/cases/micro-op/simt/simt-atomic-exch-f32-core/kernel.pto deleted file mode 100644 index d1154707f..000000000 --- a/test/vpto/cases/micro-op/simt/simt-atomic-exch-f32-core/kernel.pto +++ /dev/null @@ -1,89 +0,0 @@ -module attributes {pto.target_arch = "a5", pto.kernel_kind = #pto.kernel_kind} { - func.func @simt_atomic_exch_f32_core_kernel(%arg0: !pto.ptr) attributes {pto.aicore} { - %c0 = arith.constant 0 : index - %c1 = arith.constant 1 : index - %c2 = arith.constant 2 : index - %c3 = arith.constant 3 : index - %c4 = arith.constant 4 : index - %c5 = arith.constant 5 : index - %c6 = arith.constant 6 : index - %c7 = arith.constant 7 : index - %c8 = arith.constant 8 : index - %c9 = arith.constant 9 : index - %c10 = arith.constant 10 : index - %c11 = arith.constant 11 : index - %c12 = arith.constant 12 : index - %c13 = arith.constant 13 : index - %c14 = 
arith.constant 14 : index - %c15 = arith.constant 15 : index - %c16 = arith.constant 16 : index - %c17 = arith.constant 17 : index - %c18 = arith.constant 18 : index - %c19 = arith.constant 19 : index - %c20 = arith.constant 20 : index - %c21 = arith.constant 21 : index - %c22 = arith.constant 22 : index - %c23 = arith.constant 23 : index - %c24 = arith.constant 24 : index - %c25 = arith.constant 25 : index - %c26 = arith.constant 26 : index - %c27 = arith.constant 27 : index - %c28 = arith.constant 28 : index - %c29 = arith.constant 29 : index - %c30 = arith.constant 30 : index - %c31 = arith.constant 31 : index - %v15 = arith.constant 1.500000e+01 : f32 - - %ptr0 = pto.addptr %arg0, %c0 : !pto.ptr -> !pto.ptr - %old0 = pto.atomic_exch %ptr0, %v15 l2cache(nmfv) : !pto.ptr, f32 -> f32 - %ptr1 = pto.addptr %arg0, %c1 : !pto.ptr -> !pto.ptr - %old1 = pto.atomic_exch %ptr1, %v15 l2cache(nmlv) : !pto.ptr, f32 -> f32 - %ptr2 = pto.addptr %arg0, %c2 : !pto.ptr -> !pto.ptr - %old2 = pto.atomic_exch %ptr2, %v15 l2cache(nmprs) : !pto.ptr, f32 -> f32 - %ptr3 = pto.addptr %arg0, %c3 : !pto.ptr -> !pto.ptr - %old3 = pto.atomic_exch %ptr3, %v15 l2cache(nmred) : !pto.ptr, f32 -> f32 - %ptr4 = pto.addptr %arg0, %c4 : !pto.ptr -> !pto.ptr - %old4 = pto.atomic_exch %ptr4, %v15 l2cache(naci) : !pto.ptr, f32 -> f32 - %ptr5 = pto.addptr %arg0, %c5 : !pto.ptr -> !pto.ptr - %old5 = pto.atomic_exch %ptr5, %v15 l2cache(napw) : !pto.ptr, f32 -> f32 - %ptr6 = pto.addptr %arg0, %c6 : !pto.ptr -> !pto.ptr - %old6 = pto.atomic_exch %ptr6, %v15 l2cache(napi) : !pto.ptr, f32 -> f32 - %ptr7 = pto.addptr %arg0, %c7 : !pto.ptr -> !pto.ptr - %old7 = pto.atomic_exch %ptr7, %v15 l2cache(nared) : !pto.ptr, f32 -> f32 - %ptr8 = pto.addptr %arg0, %c8 : !pto.ptr -> !pto.ptr - %old8 = pto.atomic_exch %ptr8, %v15 l2cache(wbhfv) : !pto.ptr, f32 -> f32 - %ptr9 = pto.addptr %arg0, %c9 : !pto.ptr -> !pto.ptr - %old9 = pto.atomic_exch %ptr9, %v15 l2cache(wbhlv) : !pto.ptr, f32 -> f32 - %ptr10 = pto.addptr %arg0, 
%c10 : !pto.ptr -> !pto.ptr - %old10 = pto.atomic_exch %ptr10, %v15 l2cache(wbhprs) : !pto.ptr, f32 -> f32 - %ptr11 = pto.addptr %arg0, %c11 : !pto.ptr -> !pto.ptr - %old11 = pto.atomic_exch %ptr11, %v15 l2cache(wbhred) : !pto.ptr, f32 -> f32 - %ptr12 = pto.addptr %arg0, %c12 : !pto.ptr -> !pto.ptr - %old12 = pto.atomic_exch %ptr12, %v15 l2cache(wtsfv) : !pto.ptr, f32 -> f32 - %ptr13 = pto.addptr %arg0, %c13 : !pto.ptr -> !pto.ptr - %old13 = pto.atomic_exch %ptr13, %v15 l2cache(wtslv) : !pto.ptr, f32 -> f32 - %ptr14 = pto.addptr %arg0, %c14 : !pto.ptr -> !pto.ptr - %old14 = pto.atomic_exch %ptr14, %v15 l2cache(wtsprs) : !pto.ptr, f32 -> f32 - %ptr15 = pto.addptr %arg0, %c15 : !pto.ptr -> !pto.ptr - %old15 = pto.atomic_exch %ptr15, %v15 l2cache(wtsred) : !pto.ptr, f32 -> f32 - - pto.store %old0, %arg0[%c16] : !pto.ptr, f32 - pto.store %old1, %arg0[%c17] : !pto.ptr, f32 - pto.store %old2, %arg0[%c18] : !pto.ptr, f32 - pto.store %old3, %arg0[%c19] : !pto.ptr, f32 - pto.store %old4, %arg0[%c20] : !pto.ptr, f32 - pto.store %old5, %arg0[%c21] : !pto.ptr, f32 - pto.store %old6, %arg0[%c22] : !pto.ptr, f32 - pto.store %old7, %arg0[%c23] : !pto.ptr, f32 - pto.store %old8, %arg0[%c24] : !pto.ptr, f32 - pto.store %old9, %arg0[%c25] : !pto.ptr, f32 - pto.store %old10, %arg0[%c26] : !pto.ptr, f32 - pto.store %old11, %arg0[%c27] : !pto.ptr, f32 - pto.store %old12, %arg0[%c28] : !pto.ptr, f32 - pto.store %old13, %arg0[%c29] : !pto.ptr, f32 - pto.store %old14, %arg0[%c30] : !pto.ptr, f32 - pto.store %old15, %arg0[%c31] : !pto.ptr, f32 - pto.barrier #pto.pipe - return - } -} diff --git a/test/vpto/cases/micro-op/simt/simt-atomic-exch-f32-core/launch.cpp b/test/vpto/cases/micro-op/simt/simt-atomic-exch-f32-core/launch.cpp deleted file mode 100644 index 31bca100d..000000000 --- a/test/vpto/cases/micro-op/simt/simt-atomic-exch-f32-core/launch.cpp +++ /dev/null @@ -1,18 +0,0 @@ -// Copyright (c) 2026 Huawei Technologies Co., Ltd. 
-// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). -// Please refer to the License for details. You may not use this file except in compliance with the License. -// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -// See LICENSE in the root of the software repository for the full text of the License. -#ifndef __VEC_SCOPE__ -#define __VEC_SCOPE__ -#endif -#include -#ifndef __CPU_SIM -#include "acl/acl.h" -#endif -extern "C" __global__ [aicore] void simt_atomic_exch_f32_core_kernel(__gm__ float *v1); -void LaunchSimt_atomic_exch_f32_core_kernel(float *v1, void *stream) { - simt_atomic_exch_f32_core_kernel<<<1, nullptr, stream>>>((__gm__ float *)v1); -} diff --git a/test/vpto/cases/micro-op/simt/simt-atomic-exch-f32-core/main.cpp b/test/vpto/cases/micro-op/simt/simt-atomic-exch-f32-core/main.cpp deleted file mode 100644 index f4f7fecaa..000000000 --- a/test/vpto/cases/micro-op/simt/simt-atomic-exch-f32-core/main.cpp +++ /dev/null @@ -1,57 +0,0 @@ -// Copyright (c) 2026 Huawei Technologies Co., Ltd. -// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). -// Please refer to the License for details. You may not use this file except in compliance with the License. -// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -// See LICENSE in the root of the software repository for the full text of the License. 
- -#include "test_common.h" -#include "acl/acl.h" -#include -#include - -using namespace PtoTestCommon; - -#define ACL_CHECK(expr) do { const aclError _ret = (expr); if (_ret != ACL_SUCCESS) { std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", #expr, (int)_ret, __FILE__, __LINE__); rc = 1; goto cleanup; } } while (0) - -void LaunchSimt_atomic_exch_f32_core_kernel(float *v1, void *stream); - -int main() { - size_t elemCount_v1 = 1024; - size_t fileSize_v1 = elemCount_v1 * sizeof(float); - float *v1Host = nullptr; - float *v1Device = nullptr; - int rc = 0; - bool aclInited = false; - bool deviceSet = false; - int deviceId = 0; - aclrtStream stream = nullptr; - - ACL_CHECK(aclInit(nullptr)); - aclInited = true; - if (const char *envDevice = std::getenv("ACL_DEVICE_ID")) - deviceId = std::atoi(envDevice); - ACL_CHECK(aclrtSetDevice(deviceId)); - deviceSet = true; - ACL_CHECK(aclrtCreateStream(&stream)); - ACL_CHECK(aclrtMallocHost((void **)(&v1Host), fileSize_v1)); - ACL_CHECK(aclrtMalloc((void **)&v1Device, fileSize_v1, ACL_MEM_MALLOC_HUGE_FIRST)); - ReadFile("./v1.bin", fileSize_v1, v1Host, fileSize_v1); - ACL_CHECK(aclrtMemcpy(v1Device, fileSize_v1, v1Host, fileSize_v1, ACL_MEMCPY_HOST_TO_DEVICE)); - LaunchSimt_atomic_exch_f32_core_kernel(v1Device, stream); - ACL_CHECK(aclrtSynchronizeStream(stream)); - ACL_CHECK(aclrtMemcpy(v1Host, fileSize_v1, v1Device, fileSize_v1, ACL_MEMCPY_DEVICE_TO_HOST)); - WriteFile("./v1.bin", v1Host, fileSize_v1); - -cleanup: - aclrtFree(v1Device); - aclrtFreeHost(v1Host); - if (stream) - aclrtDestroyStream(stream); - if (deviceSet) - aclrtResetDevice(deviceId); - if (aclInited) - aclFinalize(); - return rc; -} diff --git a/test/vpto/cases/micro-op/simt/simt-atomic-cas-f32-core/compare.py b/test/vpto/cases/micro-op/simt/simt-atomic-f32-core/compare.py similarity index 100% rename from test/vpto/cases/micro-op/simt/simt-atomic-cas-f32-core/compare.py rename to test/vpto/cases/micro-op/simt/simt-atomic-f32-core/compare.py diff --git 
a/test/vpto/cases/micro-op/simt/simt-atomic-f32-core/golden.py b/test/vpto/cases/micro-op/simt/simt-atomic-f32-core/golden.py new file mode 100644 index 000000000..706ecad4b --- /dev/null +++ b/test/vpto/cases/micro-op/simt/simt-atomic-f32-core/golden.py @@ -0,0 +1,37 @@ +#!/usr/bin/env python3 +# Copyright (c) 2026 Huawei Technologies Co., Ltd. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. + +import argparse +from pathlib import Path + +import numpy as np + +ELEMS = 32 + + +def generate(output_dir: Path) -> None: + output_dir.mkdir(parents=True, exist_ok=True) + v1 = np.full(ELEMS, -1.0, dtype=np.float32) + golden_v1 = np.full(ELEMS, -1.0, dtype=np.float32) + v1[:5] = np.array([10.0, 10.0, 10.0, 10.0, 11.0], dtype=np.float32) + golden_v1[:5] = np.array([15.0, 5.0, 15.0, 15.0, 11.0], dtype=np.float32) + golden_v1[16:21] = np.array([10.0, 10.0, 10.0, 10.0, 11.0], dtype=np.float32) + v1.tofile(output_dir / "v1.bin") + golden_v1.tofile(output_dir / "golden_v1.bin") + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument("--output-dir", type=Path, default=Path(".")) + args = parser.parse_args() + generate(args.output_dir) + + +if __name__ == "__main__": + main() diff --git a/test/vpto/cases/micro-op/simt/simt-atomic-f32-core/kernel.pto b/test/vpto/cases/micro-op/simt/simt-atomic-f32-core/kernel.pto new file mode 100644 index 000000000..e68765d01 --- /dev/null +++ 
b/test/vpto/cases/micro-op/simt/simt-atomic-f32-core/kernel.pto @@ -0,0 +1,41 @@ +module attributes {pto.target_arch = "a5", pto.kernel_kind = #pto.kernel_kind} { + func.func @simt_atomic_f32_core_kernel(%arg0: !pto.ptr) attributes {pto.aicore} { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c2 = arith.constant 2 : index + %c3 = arith.constant 3 : index + %c4 = arith.constant 4 : index + %c5 = arith.constant 5 : index + %c6 = arith.constant 6 : index + %c7 = arith.constant 7 : index + %c8 = arith.constant 8 : index + %c9 = arith.constant 9 : index + %c16 = arith.constant 16 : index + %c17 = arith.constant 17 : index + %c18 = arith.constant 18 : index + %c19 = arith.constant 19 : index + %c20 = arith.constant 20 : index + %v5 = arith.constant 5.000000e+00 : f32 + %v10 = arith.constant 1.000000e+01 : f32 + %v15 = arith.constant 1.500000e+01 : f32 + + %ptr_max = pto.addptr %arg0, %c0 : !pto.ptr -> !pto.ptr + %old_max = pto.atomic_max %ptr_max, %v15 l2cache(nmfv) : !pto.ptr, f32 -> f32 + %ptr_min = pto.addptr %arg0, %c1 : !pto.ptr -> !pto.ptr + %old_min = pto.atomic_min %ptr_min, %v5 l2cache(nmfv) : !pto.ptr, f32 -> f32 + %ptr_exch = pto.addptr %arg0, %c2 : !pto.ptr -> !pto.ptr + %old_exch = pto.atomic_exch %ptr_exch, %v15 l2cache(nmfv) : !pto.ptr, f32 -> f32 + %ptr_cas_success = pto.addptr %arg0, %c3 : !pto.ptr -> !pto.ptr + %old_cas_success = pto.atomic_cas %ptr_cas_success, %v10, %v15 l2cache(nmfv) : !pto.ptr, f32 -> f32 + %ptr_cas_fail = pto.addptr %arg0, %c4 : !pto.ptr -> !pto.ptr + %old_cas_fail = pto.atomic_cas %ptr_cas_fail, %v10, %v15 l2cache(nmfv) : !pto.ptr, f32 -> f32 + + pto.store %old_max, %arg0[%c16] : !pto.ptr, f32 + pto.store %old_min, %arg0[%c17] : !pto.ptr, f32 + pto.store %old_exch, %arg0[%c18] : !pto.ptr, f32 + pto.store %old_cas_success, %arg0[%c19] : !pto.ptr, f32 + pto.store %old_cas_fail, %arg0[%c20] : !pto.ptr, f32 + pto.barrier #pto.pipe + return + } +} diff --git 
a/test/vpto/cases/micro-op/simt/simt-atomic-f32-core/launch.cpp b/test/vpto/cases/micro-op/simt/simt-atomic-f32-core/launch.cpp new file mode 100644 index 000000000..6c92ea7da --- /dev/null +++ b/test/vpto/cases/micro-op/simt/simt-atomic-f32-core/launch.cpp @@ -0,0 +1,18 @@ +// Copyright (c) 2026 Huawei Technologies Co., Ltd. +// This program is free software, you can redistribute it and/or modify it under the terms and conditions of +// CANN Open Software License Agreement Version 2.0 (the "License"). +// Please refer to the License for details. You may not use this file except in compliance with the License. +// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +// See LICENSE in the root of the software repository for the full text of the License. +#ifndef __VEC_SCOPE__ +#define __VEC_SCOPE__ +#endif +#include +#ifndef __CPU_SIM +#include "acl/acl.h" +#endif +extern "C" __global__ [aicore] void simt_atomic_f32_core_kernel(__gm__ float *v1); +void LaunchSimt_atomic_f32_core_kernel(float *v1, void *stream) { + simt_atomic_f32_core_kernel<<<1, nullptr, stream>>>((__gm__ float *)v1); +} diff --git a/test/vpto/cases/micro-op/simt/simt-atomic-f32-core/main.cpp b/test/vpto/cases/micro-op/simt/simt-atomic-f32-core/main.cpp new file mode 100644 index 000000000..70753d909 --- /dev/null +++ b/test/vpto/cases/micro-op/simt/simt-atomic-f32-core/main.cpp @@ -0,0 +1,57 @@ +// Copyright (c) 2026 Huawei Technologies Co., Ltd. +// This program is free software, you can redistribute it and/or modify it under the terms and conditions of +// CANN Open Software License Agreement Version 2.0 (the "License"). +// Please refer to the License for details. You may not use this file except in compliance with the License. 
+// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +// See LICENSE in the root of the software repository for the full text of the License. + +#include "test_common.h" +#include "acl/acl.h" +#include +#include + +using namespace PtoTestCommon; + +#define ACL_CHECK(expr) do { const aclError _ret = (expr); if (_ret != ACL_SUCCESS) { std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", #expr, (int)_ret, __FILE__, __LINE__); rc = 1; goto cleanup; } } while (0) + +void LaunchSimt_atomic_f32_core_kernel(float *v1, void *stream); + +int main() { + size_t elemCount_v1 = 32; + size_t fileSize_v1 = elemCount_v1 * sizeof(float); + float *v1Host = nullptr; + float *v1Device = nullptr; + int rc = 0; + bool aclInited = false; + bool deviceSet = false; + int deviceId = 0; + aclrtStream stream = nullptr; + + ACL_CHECK(aclInit(nullptr)); + aclInited = true; + if (const char *envDevice = std::getenv("ACL_DEVICE_ID")) + deviceId = std::atoi(envDevice); + ACL_CHECK(aclrtSetDevice(deviceId)); + deviceSet = true; + ACL_CHECK(aclrtCreateStream(&stream)); + ACL_CHECK(aclrtMallocHost((void **)(&v1Host), fileSize_v1)); + ACL_CHECK(aclrtMalloc((void **)&v1Device, fileSize_v1, ACL_MEM_MALLOC_HUGE_FIRST)); + ReadFile("./v1.bin", fileSize_v1, v1Host, fileSize_v1); + ACL_CHECK(aclrtMemcpy(v1Device, fileSize_v1, v1Host, fileSize_v1, ACL_MEMCPY_HOST_TO_DEVICE)); + LaunchSimt_atomic_f32_core_kernel(v1Device, stream); + ACL_CHECK(aclrtSynchronizeStream(stream)); + ACL_CHECK(aclrtMemcpy(v1Host, fileSize_v1, v1Device, fileSize_v1, ACL_MEMCPY_DEVICE_TO_HOST)); + WriteFile("./v1.bin", v1Host, fileSize_v1); + +cleanup: + aclrtFree(v1Device); + aclrtFreeHost(v1Host); + if (stream) + aclrtDestroyStream(stream); + if (deviceSet) + aclrtResetDevice(deviceId); + if (aclInited) + aclFinalize(); + return rc; +} diff --git 
a/test/vpto/cases/micro-op/simt/simt-atomic-f32-mode-core/golden.py b/test/vpto/cases/micro-op/simt/simt-atomic-f32-mode-core/golden.py index ace786beb..a16f987d7 100644 --- a/test/vpto/cases/micro-op/simt/simt-atomic-f32-mode-core/golden.py +++ b/test/vpto/cases/micro-op/simt/simt-atomic-f32-mode-core/golden.py @@ -12,20 +12,21 @@ import numpy as np -ELEMS = 1024 +F32_ELEMS = 32 +PACKED_ELEMS = 16 def generate(output_dir: Path) -> None: output_dir.mkdir(parents=True, exist_ok=True) - v1 = np.full(ELEMS, -1.0, dtype=np.float32) - golden_v1 = np.full(ELEMS, -1.0, dtype=np.float32) - v2 = np.full(ELEMS, 0xBC00, dtype=np.uint16) - v3 = np.full(ELEMS, 0xBF80, dtype=np.uint16) + v1 = np.full(F32_ELEMS, -1.0, dtype=np.float32) + golden_v1 = np.full(F32_ELEMS, -1.0, dtype=np.float32) + v2 = np.full(PACKED_ELEMS, 0xBC00, dtype=np.uint16) + v3 = np.full(PACKED_ELEMS, 0xBF80, dtype=np.uint16) golden_v2 = v2.copy() golden_v3 = v3.copy() - v1[:16] = np.full(16, 10.0, dtype=np.float32) - golden_v1[:16] = np.full(16, 15.0, dtype=np.float32) - golden_v1[16:32] = np.full(16, 10.0, dtype=np.float32) + v1[:4] = np.full(4, 10.0, dtype=np.float32) + golden_v1[:4] = np.full(4, 15.0, dtype=np.float32) + golden_v1[16:20] = np.full(4, 10.0, dtype=np.float32) v2[:2] = np.array([0x3C00, 0x4000], dtype=np.uint16) # f16: 1.0, 2.0 v3[:2] = np.array([0x3F80, 0x4040], dtype=np.uint16) # bf16: 1.0, 3.0 golden_v2[:2] = np.array([0x4000, 0x4200], dtype=np.uint16) # 2.0, 3.0 diff --git a/test/vpto/cases/micro-op/simt/simt-atomic-f32-mode-core/kernel.pto b/test/vpto/cases/micro-op/simt/simt-atomic-f32-mode-core/kernel.pto index 114db135e..afc35c6c4 100644 --- a/test/vpto/cases/micro-op/simt/simt-atomic-f32-mode-core/kernel.pto +++ b/test/vpto/cases/micro-op/simt/simt-atomic-f32-mode-core/kernel.pto @@ -10,30 +10,10 @@ module attributes {pto.target_arch = "a5", pto.kernel_kind = #pto.kernel_kind -> !pto.ptr %old0 = pto.atomic_add %ptr0, %v5 l2cache(nmfv) : !pto.ptr, f32 -> f32 %ptr1 = pto.addptr 
%arg0, %c1 : !pto.ptr -> !pto.ptr - %old1 = pto.atomic_add %ptr1, %v5 l2cache(nmlv) : !pto.ptr, f32 -> f32 + %old1 = pto.atomic_add %ptr1, %v5 l2cache(naci) : !pto.ptr, f32 -> f32 %ptr2 = pto.addptr %arg0, %c2 : !pto.ptr -> !pto.ptr - %old2 = pto.atomic_add %ptr2, %v5 l2cache(nmprs) : !pto.ptr, f32 -> f32 + %old2 = pto.atomic_add %ptr2, %v5 l2cache(wbhred) : !pto.ptr, f32 -> f32 %ptr3 = pto.addptr %arg0, %c3 : !pto.ptr -> !pto.ptr - %old3 = pto.atomic_add %ptr3, %v5 l2cache(nmred) : !pto.ptr, f32 -> f32 - %ptr4 = pto.addptr %arg0, %c4 : !pto.ptr -> !pto.ptr - %old4 = pto.atomic_add %ptr4, %v5 l2cache(naci) : !pto.ptr, f32 -> f32 - %ptr5 = pto.addptr %arg0, %c5 : !pto.ptr -> !pto.ptr - %old5 = pto.atomic_add %ptr5, %v5 l2cache(napw) : !pto.ptr, f32 -> f32 - %ptr6 = pto.addptr %arg0, %c6 : !pto.ptr -> !pto.ptr - %old6 = pto.atomic_add %ptr6, %v5 l2cache(napi) : !pto.ptr, f32 -> f32 - %ptr7 = pto.addptr %arg0, %c7 : !pto.ptr -> !pto.ptr - %old7 = pto.atomic_add %ptr7, %v5 l2cache(nared) : !pto.ptr, f32 -> f32 - %ptr8 = pto.addptr %arg0, %c8 : !pto.ptr -> !pto.ptr - %old8 = pto.atomic_add %ptr8, %v5 l2cache(wbhfv) : !pto.ptr, f32 -> f32 - %ptr9 = pto.addptr %arg0, %c9 : !pto.ptr -> !pto.ptr - %old9 = pto.atomic_add %ptr9, %v5 l2cache(wbhlv) : !pto.ptr, f32 -> f32 - %ptr10 = pto.addptr %arg0, %c10 : !pto.ptr -> !pto.ptr - %old10 = pto.atomic_add %ptr10, %v5 l2cache(wbhprs) : !pto.ptr, f32 -> f32 - %ptr11 = pto.addptr %arg0, %c11 : !pto.ptr -> !pto.ptr - %old11 = pto.atomic_add %ptr11, %v5 l2cache(wbhred) : !pto.ptr, f32 -> f32 - %ptr12 = pto.addptr %arg0, %c12 : !pto.ptr -> !pto.ptr - %old12 = pto.atomic_add %ptr12, %v5 l2cache(wtsfv) : !pto.ptr, f32 -> f32 - %ptr13 = pto.addptr %arg0, %c13 : !pto.ptr -> !pto.ptr - %old13 = pto.atomic_add %ptr13, %v5 l2cache(wtslv) : !pto.ptr, f32 -> f32 - %ptr14 = pto.addptr %arg0, %c14 : !pto.ptr -> !pto.ptr - %old14 = pto.atomic_add %ptr14, %v5 l2cache(wtsprs) : !pto.ptr, f32 -> f32 - %ptr15 = pto.addptr %arg0, %c15 : !pto.ptr -> 
!pto.ptr - %old15 = pto.atomic_add %ptr15, %v5 l2cache(wtsred) : !pto.ptr, f32 -> f32 + %old3 = pto.atomic_add %ptr3, %v5 l2cache(wtsred) : !pto.ptr, f32 -> f32 pto.store %old0, %arg0[%c16] : !pto.ptr, f32 pto.store %old1, %arg0[%c17] : !pto.ptr, f32 pto.store %old2, %arg0[%c18] : !pto.ptr, f32 pto.store %old3, %arg0[%c19] : !pto.ptr, f32 - pto.store %old4, %arg0[%c20] : !pto.ptr, f32 - pto.store %old5, %arg0[%c21] : !pto.ptr, f32 - pto.store %old6, %arg0[%c22] : !pto.ptr, f32 - pto.store %old7, %arg0[%c23] : !pto.ptr, f32 - pto.store %old8, %arg0[%c24] : !pto.ptr, f32 - pto.store %old9, %arg0[%c25] : !pto.ptr, f32 - pto.store %old10, %arg0[%c26] : !pto.ptr, f32 - pto.store %old11, %arg0[%c27] : !pto.ptr, f32 - pto.store %old12, %arg0[%c28] : !pto.ptr, f32 - pto.store %old13, %arg0[%c29] : !pto.ptr, f32 - pto.store %old14, %arg0[%c30] : !pto.ptr, f32 - pto.store %old15, %arg0[%c31] : !pto.ptr, f32 %h_ptr0 = pto.addptr %arg1, %c0 : !pto.ptr -> !pto.ptr %h_ptr1 = pto.addptr %arg1, %c1 : !pto.ptr -> !pto.ptr diff --git a/test/vpto/cases/micro-op/simt/simt-atomic-f32-mode-core/main.cpp b/test/vpto/cases/micro-op/simt/simt-atomic-f32-mode-core/main.cpp index 4d306f6f2..1c95acfb3 100644 --- a/test/vpto/cases/micro-op/simt/simt-atomic-f32-mode-core/main.cpp +++ b/test/vpto/cases/micro-op/simt/simt-atomic-f32-mode-core/main.cpp @@ -19,9 +19,9 @@ void LaunchSimt_atomic_f32_mode_core_kernel(float *v1, uint16_t *v2, uint16_t *v3, void *stream); int main() { - size_t elemCount_v1 = 1024; + size_t elemCount_v1 = 32; size_t fileSize_v1 = elemCount_v1 * sizeof(float); - size_t elemCount_v2 = 1024; + size_t elemCount_v2 = 16; size_t fileSize_v2 = elemCount_v2 * sizeof(uint16_t); float *v1Host = nullptr; uint16_t *v2Host = nullptr; diff --git a/test/vpto/cases/micro-op/simt/simt-atomic-max-f32-core/compare.py b/test/vpto/cases/micro-op/simt/simt-atomic-max-f32-core/compare.py deleted file mode 100644 index 4766e865b..000000000 --- 
a/test/vpto/cases/micro-op/simt/simt-atomic-max-f32-core/compare.py +++ /dev/null @@ -1,33 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) 2026 Huawei Technologies Co., Ltd. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. - -import os -import sys - -import numpy as np - - -def main(): - strict = os.getenv("COMPARE_STRICT", "1") != "0" - golden = np.fromfile("golden_v1.bin", dtype=np.float32) - out = np.fromfile("v1.bin", dtype=np.float32) - ok = golden.shape == out.shape and np.array_equal(golden, out) - if not ok: - idxs = np.nonzero(golden != out)[0] - idx = int(idxs[0]) if idxs.size else 0 - print( - f"[ERROR] mismatch at idx={idx}, golden={float(golden[idx])}, out={float(out[idx])}" - ) - if strict: - sys.exit(2) - print("[INFO] compare passed" if ok else "[WARN] compare failed (non-gating)") - - -if __name__ == "__main__": - main() diff --git a/test/vpto/cases/micro-op/simt/simt-atomic-max-f32-core/golden.py b/test/vpto/cases/micro-op/simt/simt-atomic-max-f32-core/golden.py deleted file mode 100644 index 403c392e5..000000000 --- a/test/vpto/cases/micro-op/simt/simt-atomic-max-f32-core/golden.py +++ /dev/null @@ -1,37 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) 2026 Huawei Technologies Co., Ltd. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. 
You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. - -import argparse -from pathlib import Path - -import numpy as np - -ELEMS = 1024 - - -def generate(output_dir: Path) -> None: - output_dir.mkdir(parents=True, exist_ok=True) - v1 = np.full(ELEMS, -1.0, dtype=np.float32) - golden_v1 = np.full(ELEMS, -1.0, dtype=np.float32) - v1[:16] = np.full(16, 10.0, dtype=np.float32) - golden_v1[:16] = np.full(16, 15.0, dtype=np.float32) - golden_v1[16:32] = np.full(16, 10.0, dtype=np.float32) - v1.tofile(output_dir / "v1.bin") - golden_v1.tofile(output_dir / "golden_v1.bin") - - -def main(): - parser = argparse.ArgumentParser() - parser.add_argument("--output-dir", type=Path, default=Path(".")) - args = parser.parse_args() - generate(args.output_dir) - - -if __name__ == "__main__": - main() diff --git a/test/vpto/cases/micro-op/simt/simt-atomic-max-f32-core/kernel.pto b/test/vpto/cases/micro-op/simt/simt-atomic-max-f32-core/kernel.pto deleted file mode 100644 index 442273cae..000000000 --- a/test/vpto/cases/micro-op/simt/simt-atomic-max-f32-core/kernel.pto +++ /dev/null @@ -1,89 +0,0 @@ -module attributes {pto.target_arch = "a5", pto.kernel_kind = #pto.kernel_kind} { - func.func @simt_atomic_max_f32_core_kernel(%arg0: !pto.ptr) attributes {pto.aicore} { - %c0 = arith.constant 0 : index - %c1 = arith.constant 1 : index - %c2 = arith.constant 2 : index - %c3 = arith.constant 3 : index - %c4 = arith.constant 4 : index - %c5 = arith.constant 5 : index - %c6 = arith.constant 6 : index - %c7 = arith.constant 7 : index - %c8 = arith.constant 8 : index - %c9 = arith.constant 9 : index - %c10 = arith.constant 10 : index - %c11 = arith.constant 11 : index - %c12 = 
arith.constant 12 : index - %c13 = arith.constant 13 : index - %c14 = arith.constant 14 : index - %c15 = arith.constant 15 : index - %c16 = arith.constant 16 : index - %c17 = arith.constant 17 : index - %c18 = arith.constant 18 : index - %c19 = arith.constant 19 : index - %c20 = arith.constant 20 : index - %c21 = arith.constant 21 : index - %c22 = arith.constant 22 : index - %c23 = arith.constant 23 : index - %c24 = arith.constant 24 : index - %c25 = arith.constant 25 : index - %c26 = arith.constant 26 : index - %c27 = arith.constant 27 : index - %c28 = arith.constant 28 : index - %c29 = arith.constant 29 : index - %c30 = arith.constant 30 : index - %c31 = arith.constant 31 : index - %v15 = arith.constant 1.500000e+01 : f32 - - %ptr0 = pto.addptr %arg0, %c0 : !pto.ptr -> !pto.ptr - %old0 = pto.atomic_max %ptr0, %v15 l2cache(nmfv) : !pto.ptr, f32 -> f32 - %ptr1 = pto.addptr %arg0, %c1 : !pto.ptr -> !pto.ptr - %old1 = pto.atomic_max %ptr1, %v15 l2cache(nmlv) : !pto.ptr, f32 -> f32 - %ptr2 = pto.addptr %arg0, %c2 : !pto.ptr -> !pto.ptr - %old2 = pto.atomic_max %ptr2, %v15 l2cache(nmprs) : !pto.ptr, f32 -> f32 - %ptr3 = pto.addptr %arg0, %c3 : !pto.ptr -> !pto.ptr - %old3 = pto.atomic_max %ptr3, %v15 l2cache(nmred) : !pto.ptr, f32 -> f32 - %ptr4 = pto.addptr %arg0, %c4 : !pto.ptr -> !pto.ptr - %old4 = pto.atomic_max %ptr4, %v15 l2cache(naci) : !pto.ptr, f32 -> f32 - %ptr5 = pto.addptr %arg0, %c5 : !pto.ptr -> !pto.ptr - %old5 = pto.atomic_max %ptr5, %v15 l2cache(napw) : !pto.ptr, f32 -> f32 - %ptr6 = pto.addptr %arg0, %c6 : !pto.ptr -> !pto.ptr - %old6 = pto.atomic_max %ptr6, %v15 l2cache(napi) : !pto.ptr, f32 -> f32 - %ptr7 = pto.addptr %arg0, %c7 : !pto.ptr -> !pto.ptr - %old7 = pto.atomic_max %ptr7, %v15 l2cache(nared) : !pto.ptr, f32 -> f32 - %ptr8 = pto.addptr %arg0, %c8 : !pto.ptr -> !pto.ptr - %old8 = pto.atomic_max %ptr8, %v15 l2cache(wbhfv) : !pto.ptr, f32 -> f32 - %ptr9 = pto.addptr %arg0, %c9 : !pto.ptr -> !pto.ptr - %old9 = pto.atomic_max %ptr9, %v15 
l2cache(wbhlv) : !pto.ptr, f32 -> f32 - %ptr10 = pto.addptr %arg0, %c10 : !pto.ptr -> !pto.ptr - %old10 = pto.atomic_max %ptr10, %v15 l2cache(wbhprs) : !pto.ptr, f32 -> f32 - %ptr11 = pto.addptr %arg0, %c11 : !pto.ptr -> !pto.ptr - %old11 = pto.atomic_max %ptr11, %v15 l2cache(wbhred) : !pto.ptr, f32 -> f32 - %ptr12 = pto.addptr %arg0, %c12 : !pto.ptr -> !pto.ptr - %old12 = pto.atomic_max %ptr12, %v15 l2cache(wtsfv) : !pto.ptr, f32 -> f32 - %ptr13 = pto.addptr %arg0, %c13 : !pto.ptr -> !pto.ptr - %old13 = pto.atomic_max %ptr13, %v15 l2cache(wtslv) : !pto.ptr, f32 -> f32 - %ptr14 = pto.addptr %arg0, %c14 : !pto.ptr -> !pto.ptr - %old14 = pto.atomic_max %ptr14, %v15 l2cache(wtsprs) : !pto.ptr, f32 -> f32 - %ptr15 = pto.addptr %arg0, %c15 : !pto.ptr -> !pto.ptr - %old15 = pto.atomic_max %ptr15, %v15 l2cache(wtsred) : !pto.ptr, f32 -> f32 - - pto.store %old0, %arg0[%c16] : !pto.ptr, f32 - pto.store %old1, %arg0[%c17] : !pto.ptr, f32 - pto.store %old2, %arg0[%c18] : !pto.ptr, f32 - pto.store %old3, %arg0[%c19] : !pto.ptr, f32 - pto.store %old4, %arg0[%c20] : !pto.ptr, f32 - pto.store %old5, %arg0[%c21] : !pto.ptr, f32 - pto.store %old6, %arg0[%c22] : !pto.ptr, f32 - pto.store %old7, %arg0[%c23] : !pto.ptr, f32 - pto.store %old8, %arg0[%c24] : !pto.ptr, f32 - pto.store %old9, %arg0[%c25] : !pto.ptr, f32 - pto.store %old10, %arg0[%c26] : !pto.ptr, f32 - pto.store %old11, %arg0[%c27] : !pto.ptr, f32 - pto.store %old12, %arg0[%c28] : !pto.ptr, f32 - pto.store %old13, %arg0[%c29] : !pto.ptr, f32 - pto.store %old14, %arg0[%c30] : !pto.ptr, f32 - pto.store %old15, %arg0[%c31] : !pto.ptr, f32 - pto.barrier #pto.pipe - return - } -} diff --git a/test/vpto/cases/micro-op/simt/simt-atomic-max-f32-core/launch.cpp b/test/vpto/cases/micro-op/simt/simt-atomic-max-f32-core/launch.cpp deleted file mode 100644 index a7233f194..000000000 --- a/test/vpto/cases/micro-op/simt/simt-atomic-max-f32-core/launch.cpp +++ /dev/null @@ -1,18 +0,0 @@ -// Copyright (c) 2026 Huawei Technologies Co., 
Ltd. -// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). -// Please refer to the License for details. You may not use this file except in compliance with the License. -// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -// See LICENSE in the root of the software repository for the full text of the License. -#ifndef __VEC_SCOPE__ -#define __VEC_SCOPE__ -#endif -#include -#ifndef __CPU_SIM -#include "acl/acl.h" -#endif -extern "C" __global__ [aicore] void simt_atomic_max_f32_core_kernel(__gm__ float *v1); -void LaunchSimt_atomic_max_f32_core_kernel(float *v1, void *stream) { - simt_atomic_max_f32_core_kernel<<<1, nullptr, stream>>>((__gm__ float *)v1); -} diff --git a/test/vpto/cases/micro-op/simt/simt-atomic-max-f32-core/main.cpp b/test/vpto/cases/micro-op/simt/simt-atomic-max-f32-core/main.cpp deleted file mode 100644 index a1431134d..000000000 --- a/test/vpto/cases/micro-op/simt/simt-atomic-max-f32-core/main.cpp +++ /dev/null @@ -1,57 +0,0 @@ -// Copyright (c) 2026 Huawei Technologies Co., Ltd. -// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). -// Please refer to the License for details. You may not use this file except in compliance with the License. -// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -// See LICENSE in the root of the software repository for the full text of the License. 
- -#include "test_common.h" -#include "acl/acl.h" -#include -#include - -using namespace PtoTestCommon; - -#define ACL_CHECK(expr) do { const aclError _ret = (expr); if (_ret != ACL_SUCCESS) { std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", #expr, (int)_ret, __FILE__, __LINE__); rc = 1; goto cleanup; } } while (0) - -void LaunchSimt_atomic_max_f32_core_kernel(float *v1, void *stream); - -int main() { - size_t elemCount_v1 = 1024; - size_t fileSize_v1 = elemCount_v1 * sizeof(float); - float *v1Host = nullptr; - float *v1Device = nullptr; - int rc = 0; - bool aclInited = false; - bool deviceSet = false; - int deviceId = 0; - aclrtStream stream = nullptr; - - ACL_CHECK(aclInit(nullptr)); - aclInited = true; - if (const char *envDevice = std::getenv("ACL_DEVICE_ID")) - deviceId = std::atoi(envDevice); - ACL_CHECK(aclrtSetDevice(deviceId)); - deviceSet = true; - ACL_CHECK(aclrtCreateStream(&stream)); - ACL_CHECK(aclrtMallocHost((void **)(&v1Host), fileSize_v1)); - ACL_CHECK(aclrtMalloc((void **)&v1Device, fileSize_v1, ACL_MEM_MALLOC_HUGE_FIRST)); - ReadFile("./v1.bin", fileSize_v1, v1Host, fileSize_v1); - ACL_CHECK(aclrtMemcpy(v1Device, fileSize_v1, v1Host, fileSize_v1, ACL_MEMCPY_HOST_TO_DEVICE)); - LaunchSimt_atomic_max_f32_core_kernel(v1Device, stream); - ACL_CHECK(aclrtSynchronizeStream(stream)); - ACL_CHECK(aclrtMemcpy(v1Host, fileSize_v1, v1Device, fileSize_v1, ACL_MEMCPY_DEVICE_TO_HOST)); - WriteFile("./v1.bin", v1Host, fileSize_v1); - -cleanup: - aclrtFree(v1Device); - aclrtFreeHost(v1Host); - if (stream) - aclrtDestroyStream(stream); - if (deviceSet) - aclrtResetDevice(deviceId); - if (aclInited) - aclFinalize(); - return rc; -} diff --git a/test/vpto/cases/micro-op/simt/simt-atomic-min-f32-core/compare.py b/test/vpto/cases/micro-op/simt/simt-atomic-min-f32-core/compare.py deleted file mode 100644 index 4766e865b..000000000 --- a/test/vpto/cases/micro-op/simt/simt-atomic-min-f32-core/compare.py +++ /dev/null @@ -1,33 +0,0 @@ -#!/usr/bin/env python3 -# 
Copyright (c) 2026 Huawei Technologies Co., Ltd. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. - -import os -import sys - -import numpy as np - - -def main(): - strict = os.getenv("COMPARE_STRICT", "1") != "0" - golden = np.fromfile("golden_v1.bin", dtype=np.float32) - out = np.fromfile("v1.bin", dtype=np.float32) - ok = golden.shape == out.shape and np.array_equal(golden, out) - if not ok: - idxs = np.nonzero(golden != out)[0] - idx = int(idxs[0]) if idxs.size else 0 - print( - f"[ERROR] mismatch at idx={idx}, golden={float(golden[idx])}, out={float(out[idx])}" - ) - if strict: - sys.exit(2) - print("[INFO] compare passed" if ok else "[WARN] compare failed (non-gating)") - - -if __name__ == "__main__": - main() diff --git a/test/vpto/cases/micro-op/simt/simt-atomic-min-f32-core/golden.py b/test/vpto/cases/micro-op/simt/simt-atomic-min-f32-core/golden.py deleted file mode 100644 index b69c80a90..000000000 --- a/test/vpto/cases/micro-op/simt/simt-atomic-min-f32-core/golden.py +++ /dev/null @@ -1,37 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) 2026 Huawei Technologies Co., Ltd. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. 
-# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. - -import argparse -from pathlib import Path - -import numpy as np - -ELEMS = 1024 - - -def generate(output_dir: Path) -> None: - output_dir.mkdir(parents=True, exist_ok=True) - v1 = np.full(ELEMS, -1.0, dtype=np.float32) - golden_v1 = np.full(ELEMS, -1.0, dtype=np.float32) - v1[:16] = np.full(16, 10.0, dtype=np.float32) - golden_v1[:16] = np.full(16, 5.0, dtype=np.float32) - golden_v1[16:32] = np.full(16, 10.0, dtype=np.float32) - v1.tofile(output_dir / "v1.bin") - golden_v1.tofile(output_dir / "golden_v1.bin") - - -def main(): - parser = argparse.ArgumentParser() - parser.add_argument("--output-dir", type=Path, default=Path(".")) - args = parser.parse_args() - generate(args.output_dir) - - -if __name__ == "__main__": - main() diff --git a/test/vpto/cases/micro-op/simt/simt-atomic-min-f32-core/kernel.pto b/test/vpto/cases/micro-op/simt/simt-atomic-min-f32-core/kernel.pto deleted file mode 100644 index 6af11469a..000000000 --- a/test/vpto/cases/micro-op/simt/simt-atomic-min-f32-core/kernel.pto +++ /dev/null @@ -1,89 +0,0 @@ -module attributes {pto.target_arch = "a5", pto.kernel_kind = #pto.kernel_kind} { - func.func @simt_atomic_min_f32_core_kernel(%arg0: !pto.ptr) attributes {pto.aicore} { - %c0 = arith.constant 0 : index - %c1 = arith.constant 1 : index - %c2 = arith.constant 2 : index - %c3 = arith.constant 3 : index - %c4 = arith.constant 4 : index - %c5 = arith.constant 5 : index - %c6 = arith.constant 6 : index - %c7 = arith.constant 7 : index - %c8 = arith.constant 8 : index - %c9 = arith.constant 9 : index - %c10 = arith.constant 10 : index - %c11 = arith.constant 11 : index - %c12 = arith.constant 12 : index - %c13 = arith.constant 13 : index - %c14 = 
arith.constant 14 : index - %c15 = arith.constant 15 : index - %c16 = arith.constant 16 : index - %c17 = arith.constant 17 : index - %c18 = arith.constant 18 : index - %c19 = arith.constant 19 : index - %c20 = arith.constant 20 : index - %c21 = arith.constant 21 : index - %c22 = arith.constant 22 : index - %c23 = arith.constant 23 : index - %c24 = arith.constant 24 : index - %c25 = arith.constant 25 : index - %c26 = arith.constant 26 : index - %c27 = arith.constant 27 : index - %c28 = arith.constant 28 : index - %c29 = arith.constant 29 : index - %c30 = arith.constant 30 : index - %c31 = arith.constant 31 : index - %v5 = arith.constant 5.000000e+00 : f32 - - %ptr0 = pto.addptr %arg0, %c0 : !pto.ptr -> !pto.ptr - %old0 = pto.atomic_min %ptr0, %v5 l2cache(nmfv) : !pto.ptr, f32 -> f32 - %ptr1 = pto.addptr %arg0, %c1 : !pto.ptr -> !pto.ptr - %old1 = pto.atomic_min %ptr1, %v5 l2cache(nmlv) : !pto.ptr, f32 -> f32 - %ptr2 = pto.addptr %arg0, %c2 : !pto.ptr -> !pto.ptr - %old2 = pto.atomic_min %ptr2, %v5 l2cache(nmprs) : !pto.ptr, f32 -> f32 - %ptr3 = pto.addptr %arg0, %c3 : !pto.ptr -> !pto.ptr - %old3 = pto.atomic_min %ptr3, %v5 l2cache(nmred) : !pto.ptr, f32 -> f32 - %ptr4 = pto.addptr %arg0, %c4 : !pto.ptr -> !pto.ptr - %old4 = pto.atomic_min %ptr4, %v5 l2cache(naci) : !pto.ptr, f32 -> f32 - %ptr5 = pto.addptr %arg0, %c5 : !pto.ptr -> !pto.ptr - %old5 = pto.atomic_min %ptr5, %v5 l2cache(napw) : !pto.ptr, f32 -> f32 - %ptr6 = pto.addptr %arg0, %c6 : !pto.ptr -> !pto.ptr - %old6 = pto.atomic_min %ptr6, %v5 l2cache(napi) : !pto.ptr, f32 -> f32 - %ptr7 = pto.addptr %arg0, %c7 : !pto.ptr -> !pto.ptr - %old7 = pto.atomic_min %ptr7, %v5 l2cache(nared) : !pto.ptr, f32 -> f32 - %ptr8 = pto.addptr %arg0, %c8 : !pto.ptr -> !pto.ptr - %old8 = pto.atomic_min %ptr8, %v5 l2cache(wbhfv) : !pto.ptr, f32 -> f32 - %ptr9 = pto.addptr %arg0, %c9 : !pto.ptr -> !pto.ptr - %old9 = pto.atomic_min %ptr9, %v5 l2cache(wbhlv) : !pto.ptr, f32 -> f32 - %ptr10 = pto.addptr %arg0, %c10 : !pto.ptr -> 
!pto.ptr - %old10 = pto.atomic_min %ptr10, %v5 l2cache(wbhprs) : !pto.ptr, f32 -> f32 - %ptr11 = pto.addptr %arg0, %c11 : !pto.ptr -> !pto.ptr - %old11 = pto.atomic_min %ptr11, %v5 l2cache(wbhred) : !pto.ptr, f32 -> f32 - %ptr12 = pto.addptr %arg0, %c12 : !pto.ptr -> !pto.ptr - %old12 = pto.atomic_min %ptr12, %v5 l2cache(wtsfv) : !pto.ptr, f32 -> f32 - %ptr13 = pto.addptr %arg0, %c13 : !pto.ptr -> !pto.ptr - %old13 = pto.atomic_min %ptr13, %v5 l2cache(wtslv) : !pto.ptr, f32 -> f32 - %ptr14 = pto.addptr %arg0, %c14 : !pto.ptr -> !pto.ptr - %old14 = pto.atomic_min %ptr14, %v5 l2cache(wtsprs) : !pto.ptr, f32 -> f32 - %ptr15 = pto.addptr %arg0, %c15 : !pto.ptr -> !pto.ptr - %old15 = pto.atomic_min %ptr15, %v5 l2cache(wtsred) : !pto.ptr, f32 -> f32 - - pto.store %old0, %arg0[%c16] : !pto.ptr, f32 - pto.store %old1, %arg0[%c17] : !pto.ptr, f32 - pto.store %old2, %arg0[%c18] : !pto.ptr, f32 - pto.store %old3, %arg0[%c19] : !pto.ptr, f32 - pto.store %old4, %arg0[%c20] : !pto.ptr, f32 - pto.store %old5, %arg0[%c21] : !pto.ptr, f32 - pto.store %old6, %arg0[%c22] : !pto.ptr, f32 - pto.store %old7, %arg0[%c23] : !pto.ptr, f32 - pto.store %old8, %arg0[%c24] : !pto.ptr, f32 - pto.store %old9, %arg0[%c25] : !pto.ptr, f32 - pto.store %old10, %arg0[%c26] : !pto.ptr, f32 - pto.store %old11, %arg0[%c27] : !pto.ptr, f32 - pto.store %old12, %arg0[%c28] : !pto.ptr, f32 - pto.store %old13, %arg0[%c29] : !pto.ptr, f32 - pto.store %old14, %arg0[%c30] : !pto.ptr, f32 - pto.store %old15, %arg0[%c31] : !pto.ptr, f32 - pto.barrier #pto.pipe - return - } -} diff --git a/test/vpto/cases/micro-op/simt/simt-atomic-min-f32-core/launch.cpp b/test/vpto/cases/micro-op/simt/simt-atomic-min-f32-core/launch.cpp deleted file mode 100644 index 0d129ddbf..000000000 --- a/test/vpto/cases/micro-op/simt/simt-atomic-min-f32-core/launch.cpp +++ /dev/null @@ -1,19 +0,0 @@ -// Copyright (c) 2026 Huawei Technologies Co., Ltd. 
-// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). -// Please refer to the License for details. You may not use this file except in compliance with the License. -// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -// See LICENSE in the root of the software repository for the full text of the License. - -#ifndef __VEC_SCOPE__ -#define __VEC_SCOPE__ -#endif -#include -#ifndef __CPU_SIM -#include "acl/acl.h" -#endif -extern "C" __global__ [aicore] void simt_atomic_min_f32_core_kernel(__gm__ float *v1); -void LaunchSimt_atomic_min_f32_core_kernel(float *v1, void *stream) { - simt_atomic_min_f32_core_kernel<<<1, nullptr, stream>>>((__gm__ float *)v1); -} diff --git a/test/vpto/cases/micro-op/simt/simt-atomic-min-f32-core/main.cpp b/test/vpto/cases/micro-op/simt/simt-atomic-min-f32-core/main.cpp deleted file mode 100644 index 818755921..000000000 --- a/test/vpto/cases/micro-op/simt/simt-atomic-min-f32-core/main.cpp +++ /dev/null @@ -1,57 +0,0 @@ -// Copyright (c) 2026 Huawei Technologies Co., Ltd. -// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). -// Please refer to the License for details. You may not use this file except in compliance with the License. -// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -// See LICENSE in the root of the software repository for the full text of the License. 
- -#include "test_common.h" -#include "acl/acl.h" -#include -#include - -using namespace PtoTestCommon; - -#define ACL_CHECK(expr) do { const aclError _ret = (expr); if (_ret != ACL_SUCCESS) { std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", #expr, (int)_ret, __FILE__, __LINE__); rc = 1; goto cleanup; } } while (0) - -void LaunchSimt_atomic_min_f32_core_kernel(float *v1, void *stream); - -int main() { - size_t elemCount_v1 = 1024; - size_t fileSize_v1 = elemCount_v1 * sizeof(float); - float *v1Host = nullptr; - float *v1Device = nullptr; - int rc = 0; - bool aclInited = false; - bool deviceSet = false; - int deviceId = 0; - aclrtStream stream = nullptr; - - ACL_CHECK(aclInit(nullptr)); - aclInited = true; - if (const char *envDevice = std::getenv("ACL_DEVICE_ID")) - deviceId = std::atoi(envDevice); - ACL_CHECK(aclrtSetDevice(deviceId)); - deviceSet = true; - ACL_CHECK(aclrtCreateStream(&stream)); - ACL_CHECK(aclrtMallocHost((void **)(&v1Host), fileSize_v1)); - ACL_CHECK(aclrtMalloc((void **)&v1Device, fileSize_v1, ACL_MEM_MALLOC_HUGE_FIRST)); - ReadFile("./v1.bin", fileSize_v1, v1Host, fileSize_v1); - ACL_CHECK(aclrtMemcpy(v1Device, fileSize_v1, v1Host, fileSize_v1, ACL_MEMCPY_HOST_TO_DEVICE)); - LaunchSimt_atomic_min_f32_core_kernel(v1Device, stream); - ACL_CHECK(aclrtSynchronizeStream(stream)); - ACL_CHECK(aclrtMemcpy(v1Host, fileSize_v1, v1Device, fileSize_v1, ACL_MEMCPY_DEVICE_TO_HOST)); - WriteFile("./v1.bin", v1Host, fileSize_v1); - -cleanup: - aclrtFree(v1Device); - aclrtFreeHost(v1Host); - if (stream) - aclrtDestroyStream(stream); - if (deviceSet) - aclrtResetDevice(deviceId); - if (aclInited) - aclFinalize(); - return rc; -} diff --git a/test/vpto/cases/micro-op/simt/simt-atomic-mode-core/golden.py b/test/vpto/cases/micro-op/simt/simt-atomic-mode-core/golden.py index 6418fdba0..7e69f3c6e 100644 --- a/test/vpto/cases/micro-op/simt/simt-atomic-mode-core/golden.py +++ b/test/vpto/cases/micro-op/simt/simt-atomic-mode-core/golden.py @@ -12,16 +12,16 @@ 
import numpy as np -ELEMS = 1024 +ELEMS = 32 def generate(output_dir: Path) -> None: output_dir.mkdir(parents=True, exist_ok=True) v1 = np.full(ELEMS, -1, dtype=np.int32) golden_v1 = np.full(ELEMS, -1, dtype=np.int32) - v1[:16] = np.full(16, 10, dtype=np.int32) - golden_v1[:16] = np.full(16, 15, dtype=np.int32) - golden_v1[16:32] = np.full(16, 10, dtype=np.int32) + v1[:4] = np.full(4, 10, dtype=np.int32) + golden_v1[:4] = np.full(4, 15, dtype=np.int32) + golden_v1[16:20] = np.full(4, 10, dtype=np.int32) v1.tofile(output_dir / "v1.bin") golden_v1.tofile(output_dir / "golden_v1.bin") diff --git a/test/vpto/cases/micro-op/simt/simt-atomic-mode-core/kernel.pto b/test/vpto/cases/micro-op/simt/simt-atomic-mode-core/kernel.pto index 704582695..b85a2d0b2 100644 --- a/test/vpto/cases/micro-op/simt/simt-atomic-mode-core/kernel.pto +++ b/test/vpto/cases/micro-op/simt/simt-atomic-mode-core/kernel.pto @@ -8,81 +8,25 @@ module attributes {pto.target_arch = "a5", pto.kernel_kind = #pto.kernel_kind -> !pto.ptr %old0 = pto.atomic_add %ptr0, %v5 l2cache(nmfv) signed : !pto.ptr, i32 -> i32 %ptr1 = pto.addptr %arg0, %c1 : !pto.ptr -> !pto.ptr - %old1 = pto.atomic_add %ptr1, %v5 l2cache(nmlv) signed : !pto.ptr, i32 -> i32 + %old1 = pto.atomic_add %ptr1, %v5 l2cache(naci) signed : !pto.ptr, i32 -> i32 %ptr2 = pto.addptr %arg0, %c2 : !pto.ptr -> !pto.ptr - %old2 = pto.atomic_add %ptr2, %v5 l2cache(nmprs) signed : !pto.ptr, i32 -> i32 + %old2 = pto.atomic_add %ptr2, %v5 l2cache(wbhred) signed : !pto.ptr, i32 -> i32 %ptr3 = pto.addptr %arg0, %c3 : !pto.ptr -> !pto.ptr - %old3 = pto.atomic_add %ptr3, %v5 l2cache(nmred) signed : !pto.ptr, i32 -> i32 - %ptr4 = pto.addptr %arg0, %c4 : !pto.ptr -> !pto.ptr - %old4 = pto.atomic_add %ptr4, %v5 l2cache(naci) signed : !pto.ptr, i32 -> i32 - %ptr5 = pto.addptr %arg0, %c5 : !pto.ptr -> !pto.ptr - %old5 = pto.atomic_add %ptr5, %v5 l2cache(napw) signed : !pto.ptr, i32 -> i32 - %ptr6 = pto.addptr %arg0, %c6 : !pto.ptr -> !pto.ptr - %old6 = 
pto.atomic_add %ptr6, %v5 l2cache(napi) signed : !pto.ptr, i32 -> i32 - %ptr7 = pto.addptr %arg0, %c7 : !pto.ptr -> !pto.ptr - %old7 = pto.atomic_add %ptr7, %v5 l2cache(nared) signed : !pto.ptr, i32 -> i32 - %ptr8 = pto.addptr %arg0, %c8 : !pto.ptr -> !pto.ptr - %old8 = pto.atomic_add %ptr8, %v5 l2cache(wbhfv) signed : !pto.ptr, i32 -> i32 - %ptr9 = pto.addptr %arg0, %c9 : !pto.ptr -> !pto.ptr - %old9 = pto.atomic_add %ptr9, %v5 l2cache(wbhlv) signed : !pto.ptr, i32 -> i32 - %ptr10 = pto.addptr %arg0, %c10 : !pto.ptr -> !pto.ptr - %old10 = pto.atomic_add %ptr10, %v5 l2cache(wbhprs) signed : !pto.ptr, i32 -> i32 - %ptr11 = pto.addptr %arg0, %c11 : !pto.ptr -> !pto.ptr - %old11 = pto.atomic_add %ptr11, %v5 l2cache(wbhred) signed : !pto.ptr, i32 -> i32 - %ptr12 = pto.addptr %arg0, %c12 : !pto.ptr -> !pto.ptr - %old12 = pto.atomic_add %ptr12, %v5 l2cache(wtsfv) signed : !pto.ptr, i32 -> i32 - %ptr13 = pto.addptr %arg0, %c13 : !pto.ptr -> !pto.ptr - %old13 = pto.atomic_add %ptr13, %v5 l2cache(wtslv) signed : !pto.ptr, i32 -> i32 - %ptr14 = pto.addptr %arg0, %c14 : !pto.ptr -> !pto.ptr - %old14 = pto.atomic_add %ptr14, %v5 l2cache(wtsprs) signed : !pto.ptr, i32 -> i32 - %ptr15 = pto.addptr %arg0, %c15 : !pto.ptr -> !pto.ptr - %old15 = pto.atomic_add %ptr15, %v5 l2cache(wtsred) signed : !pto.ptr, i32 -> i32 + %old3 = pto.atomic_add %ptr3, %v5 l2cache(wtsred) signed : !pto.ptr, i32 -> i32 pto.store %old0, %arg0[%c16] : !pto.ptr, i32 pto.store %old1, %arg0[%c17] : !pto.ptr, i32 pto.store %old2, %arg0[%c18] : !pto.ptr, i32 pto.store %old3, %arg0[%c19] : !pto.ptr, i32 - pto.store %old4, %arg0[%c20] : !pto.ptr, i32 - pto.store %old5, %arg0[%c21] : !pto.ptr, i32 - pto.store %old6, %arg0[%c22] : !pto.ptr, i32 - pto.store %old7, %arg0[%c23] : !pto.ptr, i32 - pto.store %old8, %arg0[%c24] : !pto.ptr, i32 - pto.store %old9, %arg0[%c25] : !pto.ptr, i32 - pto.store %old10, %arg0[%c26] : !pto.ptr, i32 - pto.store %old11, %arg0[%c27] : !pto.ptr, i32 - pto.store %old12, %arg0[%c28] : 
!pto.ptr, i32 - pto.store %old13, %arg0[%c29] : !pto.ptr, i32 - pto.store %old14, %arg0[%c30] : !pto.ptr, i32 - pto.store %old15, %arg0[%c31] : !pto.ptr, i32 pto.barrier #pto.pipe return } diff --git a/test/vpto/cases/micro-op/simt/simt-atomic-mode-core/main.cpp b/test/vpto/cases/micro-op/simt/simt-atomic-mode-core/main.cpp index fa489cb57..4c196be25 100644 --- a/test/vpto/cases/micro-op/simt/simt-atomic-mode-core/main.cpp +++ b/test/vpto/cases/micro-op/simt/simt-atomic-mode-core/main.cpp @@ -17,7 +17,7 @@ using namespace PtoTestCommon; void LaunchSimt_atomic_mode_core_kernel(int *v1, void *stream); int main() { - size_t elemCount_v1 = 1024; + size_t elemCount_v1 = 32; size_t fileSize_v1 = elemCount_v1 * sizeof(int); int *v1Host = nullptr; int *v1Device = nullptr; diff --git a/test/vpto/cases/micro-op/simt/simt-ldst-policy-core/compare.py b/test/vpto/cases/micro-op/simt/simt-ldst-policy-core/compare.py index 9be085a8e..99a157ea3 100644 --- a/test/vpto/cases/micro-op/simt/simt-ldst-policy-core/compare.py +++ b/test/vpto/cases/micro-op/simt/simt-ldst-policy-core/compare.py @@ -13,52 +13,19 @@ import numpy as np -def compare_one(name: str, dtype) -> bool: - golden = np.fromfile(f"golden_{name}.bin", dtype=dtype) - out = np.fromfile(f"{name}.bin", dtype=dtype) +def main(): + strict = os.getenv("COMPARE_STRICT", "1") != "0" + golden = np.fromfile("golden_v1.bin", dtype=np.int32) + out = np.fromfile("v1.bin", dtype=np.int32) ok = golden.shape == out.shape and np.array_equal(golden, out) if not ok: idxs = np.nonzero(golden != out)[0] idx = int(idxs[0]) if idxs.size else 0 - if dtype in (np.int8, np.int16, np.int32, np.int64): - print( - f"[ERROR] {name} mismatch at idx={idx}, " - f"golden={int(golden[idx])}, out={int(out[idx])}" - ) - elif dtype == np.uint16: - print( - f"[ERROR] {name} mismatch at idx={idx}, " - f"golden=0x{int(golden[idx]):04x}, out=0x{int(out[idx]):04x}" - ) - elif dtype == np.uint8: - print( - f"[ERROR] {name} mismatch at idx={idx}, " - 
f"golden=0x{int(golden[idx]):02x}, out=0x{int(out[idx]):02x}" - ) - else: - print( - f"[ERROR] {name} mismatch at idx={idx}, " - f"golden={golden[idx]}, out={out[idx]}" - ) - return ok - - -def main(): - strict = os.getenv("COMPARE_STRICT", "1") != "0" - ok = ( - compare_one("v1", np.int32) - and compare_one("v2", np.uint16) - and compare_one("v3", np.uint16) - and compare_one("v4", np.int8) - and compare_one("v5", np.int16) - and compare_one("v6", np.int64) - and compare_one("v7", np.float32) - and compare_one("v8", np.float64) - and compare_one("v9", np.uint8) - and compare_one("v10", np.uint8) - ) - if not ok and strict: - sys.exit(2) + print( + f"[ERROR] mismatch at idx={idx}, golden={int(golden[idx])}, out={int(out[idx])}" + ) + if strict: + sys.exit(2) print("[INFO] compare passed" if ok else "[WARN] compare failed (non-gating)") diff --git a/test/vpto/cases/micro-op/simt/simt-ldst-policy-core/golden.py b/test/vpto/cases/micro-op/simt/simt-ldst-policy-core/golden.py index 90119b81d..c23e37c98 100644 --- a/test/vpto/cases/micro-op/simt/simt-ldst-policy-core/golden.py +++ b/test/vpto/cases/micro-op/simt/simt-ldst-policy-core/golden.py @@ -12,84 +12,25 @@ import numpy as np -ELEMS = 1024 +LANES = 2 +LANE_STRIDE = 8 +ELEMS = LANES * LANE_STRIDE def generate(output_dir: Path) -> None: output_dir.mkdir(parents=True, exist_ok=True) v1 = np.full(ELEMS, -1, dtype=np.int32) golden_v1 = np.full(ELEMS, -1, dtype=np.int32) - v2 = np.full(ELEMS, 0xBC00, dtype=np.uint16) - v3 = np.full(ELEMS, 0xBF80, dtype=np.uint16) - v4 = np.full(ELEMS, -1, dtype=np.int8) - v5 = np.full(ELEMS, -1, dtype=np.int16) - v6 = np.full(ELEMS, -1, dtype=np.int64) - v7 = np.full(ELEMS, -1.0, dtype=np.float32) - v8 = np.full(ELEMS, -1.0, dtype=np.float64) - v9 = np.full(ELEMS, 0xFF, dtype=np.uint8) - v10 = np.full(ELEMS, 0xFF, dtype=np.uint8) - golden_v2 = v2.copy() - golden_v3 = v3.copy() - golden_v4 = v4.copy() - golden_v5 = v5.copy() - golden_v6 = v6.copy() - golden_v7 = v7.copy() - golden_v8 = 
v8.copy() - golden_v9 = v9.copy() - golden_v10 = v10.copy() - inputs = np.array([0x10203040, -1234567], dtype=np.int32) - v1[:2] = inputs - golden_v1[:2] = inputs - golden_v1[2] = inputs[0] - golden_v1[3] = inputs[1] - golden_v1[4] = np.int32(inputs[0] + inputs[1]) - v2[:2] = np.array([0x3E00, 0xC000], dtype=np.uint16) # f16: 1.5, -2.0 - v3[:2] = np.array([0x3FC0, 0xC000], dtype=np.uint16) # bf16: 1.5, -2.0 - golden_v2[:2] = v2[:2] - golden_v2[2:4] = v2[:2] - golden_v3[:2] = v3[:2] - golden_v3[2:4] = v3[:2] - v4[:2] = np.array([0x12, -0x34], dtype=np.int8) - v5[:2] = np.array([0x1234, -0x3456], dtype=np.int16) - v6[:2] = np.array([0x1020304050607080, -0x102030405060708], dtype=np.int64) - v7[:2] = np.array([2.5, -3.5], dtype=np.float32) - v8[:2] = np.array([4.5, -5.5], dtype=np.float64) - v9[:2] = np.array([0x38, 0xB8], dtype=np.uint8) - v10[:2] = np.array([0x10, 0x08], dtype=np.uint8) - golden_v4[:2] = v4[:2] - golden_v4[2:4] = v4[:2] - golden_v5[:2] = v5[:2] - golden_v5[2:4] = v5[:2] - golden_v6[:2] = v6[:2] - golden_v6[2:4] = v6[:2] - golden_v7[:2] = v7[:2] - golden_v7[2:4] = v7[:2] - golden_v8[:2] = v8[:2] - golden_v8[2:4] = v8[:2] - golden_v9[:2] = v9[:2] - golden_v9[2:4] = v9[:2] - golden_v10[:2] = v10[:2] - golden_v10[2:4] = v10[:2] + for lane in range(LANES): + base = lane * LANE_STRIDE + inputs = np.array([0x10203040 + lane, -1234567 - lane], dtype=np.int32) + v1[base : base + 2] = inputs + golden_v1[base : base + 2] = inputs + golden_v1[base + 2] = inputs[0] + golden_v1[base + 3] = inputs[1] + golden_v1[base + 4] = np.int32(inputs[0] + inputs[1]) v1.tofile(output_dir / "v1.bin") - v2.tofile(output_dir / "v2.bin") - v3.tofile(output_dir / "v3.bin") - v4.tofile(output_dir / "v4.bin") - v5.tofile(output_dir / "v5.bin") - v6.tofile(output_dir / "v6.bin") - v7.tofile(output_dir / "v7.bin") - v8.tofile(output_dir / "v8.bin") - v9.tofile(output_dir / "v9.bin") - v10.tofile(output_dir / "v10.bin") golden_v1.tofile(output_dir / "golden_v1.bin") - 
golden_v2.tofile(output_dir / "golden_v2.bin") - golden_v3.tofile(output_dir / "golden_v3.bin") - golden_v4.tofile(output_dir / "golden_v4.bin") - golden_v5.tofile(output_dir / "golden_v5.bin") - golden_v6.tofile(output_dir / "golden_v6.bin") - golden_v7.tofile(output_dir / "golden_v7.bin") - golden_v8.tofile(output_dir / "golden_v8.bin") - golden_v9.tofile(output_dir / "golden_v9.bin") - golden_v10.tofile(output_dir / "golden_v10.bin") def main(): diff --git a/test/vpto/cases/micro-op/simt/simt-ldst-policy-core/kernel.pto b/test/vpto/cases/micro-op/simt/simt-ldst-policy-core/kernel.pto index 870675dc8..2971773c2 100644 --- a/test/vpto/cases/micro-op/simt/simt-ldst-policy-core/kernel.pto +++ b/test/vpto/cases/micro-op/simt/simt-ldst-policy-core/kernel.pto @@ -1,85 +1,35 @@ module attributes {pto.target_arch = "a5", pto.kernel_kind = #pto.kernel_kind} { - func.func @simt_ldst_policy_core_kernel(%arg0: !pto.ptr, - %arg1: !pto.ptr, - %arg2: !pto.ptr, - %arg3: !pto.ptr, - %arg4: !pto.ptr, - %arg5: !pto.ptr, - %arg6: !pto.ptr, - %arg7: !pto.ptr, - %arg8: !pto.ptr, - %arg9: !pto.ptr) attributes {pto.aicore} { + func.func @simt_ldst_policy_core_kernel(%arg0: !pto.ptr) attributes {pto.aicore} { %dim_z = arith.constant 1 : i32 %dim_y = arith.constant 1 : i32 - %dim_x = arith.constant 32 : i32 + %dim_x = arith.constant 2 : i32 - pto.simt_launch @simt_ldst_policy_core_body<<<%dim_x, %dim_y, %dim_z>>>(%arg0, %arg1, %arg2, %arg3, %arg4, %arg5, %arg6, %arg7, %arg8, %arg9) - : (!pto.ptr, !pto.ptr, !pto.ptr, !pto.ptr, !pto.ptr, !pto.ptr, !pto.ptr, !pto.ptr, !pto.ptr, !pto.ptr) -> () + pto.simt_launch @simt_ldst_policy_core_body<<<%dim_x, %dim_y, %dim_z>>>(%arg0) + : (!pto.ptr) -> () pto.barrier #pto.pipe return } - func.func @simt_ldst_policy_core_body(%arg0: !pto.ptr, - %arg1: !pto.ptr, - %arg2: !pto.ptr, - %arg3: !pto.ptr, - %arg4: !pto.ptr, - %arg5: !pto.ptr, - %arg6: !pto.ptr, - %arg7: !pto.ptr, - %arg8: !pto.ptr, - %arg9: !pto.ptr) attributes {pto.simt_entry} { - %c0 = 
arith.constant 0 : index + func.func @simt_ldst_policy_core_body(%arg0: !pto.ptr) attributes {pto.simt_entry} { %c1 = arith.constant 1 : index %c2 = arith.constant 2 : index %c3 = arith.constant 3 : index %c4 = arith.constant 4 : index + %c8_i32 = arith.constant 8 : i32 + %laneid = pto.get_laneid : i32 + %base_i32 = arith.muli %laneid, %c8_i32 : i32 + %idx0 = arith.index_castui %base_i32 : i32 to index + %idx1 = arith.addi %idx0, %c1 : index + %idx2 = arith.addi %idx0, %c2 : index + %idx3 = arith.addi %idx0, %c3 : index + %idx4 = arith.addi %idx0, %c4 : index - %v_cache = pto.ldg %arg0[%c0] l1cache(cache) l2cache(nmfv) : !pto.ptr -> i32 - %v_uncache = pto.ldg %arg0[%c1] l1cache(uncache) l2cache(nmfv) : !pto.ptr -> i32 - + %v_cache = pto.ldg %arg0[%idx0] l1cache(cache) l2cache(nmfv) : !pto.ptr -> i32 + %v_uncache = pto.ldg %arg0[%idx1] l1cache(uncache) l2cache(nmfv) : !pto.ptr -> i32 %sum = arith.addi %v_cache, %v_uncache : i32 - pto.stg %v_cache, %arg0[%c2] l1cache(cache) l2cache(nmfv) : !pto.ptr, i32 - pto.stg %v_uncache, %arg0[%c3] l1cache(uncache) l2cache(nmfv) : !pto.ptr, i32 - pto.stg %sum, %arg0[%c4] l1cache(cache) l2cache(nmfv) : !pto.ptr, i32 - - %h_cache = pto.ldg %arg1[%c0] l1cache(cache) l2cache(nmfv) : !pto.ptr -> f16 - %h_uncache = pto.ldg %arg1[%c1] l1cache(uncache) l2cache(nmfv) : !pto.ptr -> f16 - %b_cache = pto.ldg %arg2[%c0] l1cache(cache) l2cache(nmfv) : !pto.ptr -> bf16 - %b_uncache = pto.ldg %arg2[%c1] l1cache(uncache) l2cache(nmfv) : !pto.ptr -> bf16 - pto.stg %h_cache, %arg1[%c2] l1cache(cache) l2cache(nmfv) : !pto.ptr, f16 - pto.stg %h_uncache, %arg1[%c3] l1cache(uncache) l2cache(nmfv) : !pto.ptr, f16 - pto.stg %b_cache, %arg2[%c2] l1cache(cache) l2cache(nmfv) : !pto.ptr, bf16 - pto.stg %b_uncache, %arg2[%c3] l1cache(uncache) l2cache(nmfv) : !pto.ptr, bf16 - - %i8_cache = pto.ldg %arg3[%c0] l1cache(cache) l2cache(nmfv) : !pto.ptr -> i8 - %i8_uncache = pto.ldg %arg3[%c1] l1cache(uncache) l2cache(nmfv) : !pto.ptr -> i8 - %i16_cache = pto.ldg 
%arg4[%c0] l1cache(cache) l2cache(nmfv) : !pto.ptr -> i16 - %i16_uncache = pto.ldg %arg4[%c1] l1cache(uncache) l2cache(nmfv) : !pto.ptr -> i16 - %i64_cache = pto.ldg %arg5[%c0] l1cache(cache) l2cache(nmfv) : !pto.ptr -> i64 - %i64_uncache = pto.ldg %arg5[%c1] l1cache(uncache) l2cache(nmfv) : !pto.ptr -> i64 - %f32_cache = pto.ldg %arg6[%c0] l1cache(cache) l2cache(nmfv) : !pto.ptr -> f32 - %f32_uncache = pto.ldg %arg6[%c1] l1cache(uncache) l2cache(nmfv) : !pto.ptr -> f32 - %f64_cache = pto.ldg %arg7[%c0] l1cache(cache) l2cache(nmfv) : !pto.ptr -> f64 - %f64_uncache = pto.ldg %arg7[%c1] l1cache(uncache) l2cache(nmfv) : !pto.ptr -> f64 - %f8e4_cache = pto.ldg %arg8[%c0] l1cache(cache) l2cache(nmfv) : !pto.ptr -> f8E4M3FN - %f8e4_uncache = pto.ldg %arg8[%c1] l1cache(uncache) l2cache(nmfv) : !pto.ptr -> f8E4M3FN - %hif8_cache = pto.ldg %arg9[%c0] l1cache(cache) l2cache(nmfv) : !pto.ptr -> !pto.hif8 - %hif8_uncache = pto.ldg %arg9[%c1] l1cache(uncache) l2cache(nmfv) : !pto.ptr -> !pto.hif8 - pto.stg %i8_cache, %arg3[%c2] l1cache(cache) l2cache(nmfv) : !pto.ptr, i8 - pto.stg %i8_uncache, %arg3[%c3] l1cache(uncache) l2cache(nmfv) : !pto.ptr, i8 - pto.stg %i16_cache, %arg4[%c2] l1cache(cache) l2cache(nmfv) : !pto.ptr, i16 - pto.stg %i16_uncache, %arg4[%c3] l1cache(uncache) l2cache(nmfv) : !pto.ptr, i16 - pto.stg %i64_cache, %arg5[%c2] l1cache(cache) l2cache(nmfv) : !pto.ptr, i64 - pto.stg %i64_uncache, %arg5[%c3] l1cache(uncache) l2cache(nmfv) : !pto.ptr, i64 - pto.stg %f32_cache, %arg6[%c2] l1cache(cache) l2cache(nmfv) : !pto.ptr, f32 - pto.stg %f32_uncache, %arg6[%c3] l1cache(uncache) l2cache(nmfv) : !pto.ptr, f32 - pto.stg %f64_cache, %arg7[%c2] l1cache(cache) l2cache(nmfv) : !pto.ptr, f64 - pto.stg %f64_uncache, %arg7[%c3] l1cache(uncache) l2cache(nmfv) : !pto.ptr, f64 - pto.stg %f8e4_cache, %arg8[%c2] l1cache(cache) l2cache(nmfv) : !pto.ptr, f8E4M3FN - pto.stg %f8e4_uncache, %arg8[%c3] l1cache(uncache) l2cache(nmfv) : !pto.ptr, f8E4M3FN - pto.stg %hif8_cache, 
%arg9[%c2] l1cache(cache) l2cache(nmfv) : !pto.ptr, !pto.hif8 - pto.stg %hif8_uncache, %arg9[%c3] l1cache(uncache) l2cache(nmfv) : !pto.ptr, !pto.hif8 + pto.stg %v_cache, %arg0[%idx2] l1cache(cache) l2cache(nmfv) : !pto.ptr, i32 + pto.stg %v_uncache, %arg0[%idx3] l1cache(uncache) l2cache(nmfv) : !pto.ptr, i32 + pto.stg %sum, %arg0[%idx4] l1cache(cache) l2cache(nmfv) : !pto.ptr, i32 return } } diff --git a/test/vpto/cases/micro-op/simt/simt-ldst-policy-core/launch.cpp b/test/vpto/cases/micro-op/simt/simt-ldst-policy-core/launch.cpp index 14ac3026d..8f4347c19 100644 --- a/test/vpto/cases/micro-op/simt/simt-ldst-policy-core/launch.cpp +++ b/test/vpto/cases/micro-op/simt/simt-ldst-policy-core/launch.cpp @@ -8,41 +8,11 @@ #ifndef __VEC_SCOPE__ #define __VEC_SCOPE__ #endif - -#if defined(__CCE_AICORE__) && defined(__NPU_ARCH__) && (__NPU_ARCH__ == 2201) -typedef struct { unsigned char v; } hifloat8_t; -typedef struct { unsigned char v; } float8_e4m3_t; -typedef struct { unsigned char v; } float8_e5m2_t; -typedef struct { unsigned char v; } float8_e8m0_t; -typedef struct { unsigned char v; } float4_e1m2x2_t; -typedef struct { unsigned char v; } float4_e2m1x2_t; -#endif #include - -#if !defined(__CCE_AICORE__) && !defined(TMRGSORT_HPP) -struct MrgSortExecutedNumList { - uint16_t mrgSortList0; - uint16_t mrgSortList1; - uint16_t mrgSortList2; - uint16_t mrgSortList3; -}; -#endif #ifndef __CPU_SIM #include "acl/acl.h" #endif -extern "C" __global__ [aicore] void simt_ldst_policy_core_kernel( - __gm__ int *v1, __gm__ half *v2, __gm__ bfloat16_t *v3, - __gm__ int8_t *v4, __gm__ int16_t *v5, __gm__ int64_t *v6, - __gm__ float *v7, __gm__ double *v8, __gm__ uint8_t *v9, - __gm__ uint8_t *v10); -void LaunchSimt_ldst_policy_core_kernel(int *v1, uint16_t *v2, uint16_t *v3, - int8_t *v4, int16_t *v5, int64_t *v6, - float *v7, double *v8, uint8_t *v9, - uint8_t *v10, - void *stream) { - simt_ldst_policy_core_kernel<<<1, nullptr, stream>>>( - (__gm__ int *)v1, (__gm__ half *)v2, 
(__gm__ bfloat16_t *)v3, - (__gm__ int8_t *)v4, (__gm__ int16_t *)v5, (__gm__ int64_t *)v6, - (__gm__ float *)v7, (__gm__ double *)v8, (__gm__ uint8_t *)v9, - (__gm__ uint8_t *)v10); +extern "C" __global__ [aicore] void simt_ldst_policy_core_kernel(__gm__ int *v1); +void LaunchSimt_ldst_policy_core_kernel(int *v1, void *stream) { + simt_ldst_policy_core_kernel<<<1, nullptr, stream>>>((__gm__ int *)v1); } diff --git a/test/vpto/cases/micro-op/simt/simt-ldst-policy-core/main.cpp b/test/vpto/cases/micro-op/simt/simt-ldst-policy-core/main.cpp index db16dfa11..667869438 100644 --- a/test/vpto/cases/micro-op/simt/simt-ldst-policy-core/main.cpp +++ b/test/vpto/cases/micro-op/simt/simt-ldst-policy-core/main.cpp @@ -9,49 +9,18 @@ #include "acl/acl.h" #include #include -#include using namespace PtoTestCommon; #define ACL_CHECK(expr) do { const aclError _ret = (expr); if (_ret != ACL_SUCCESS) { std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", #expr, (int)_ret, __FILE__, __LINE__); rc = 1; goto cleanup; } } while (0) -void LaunchSimt_ldst_policy_core_kernel(int *v1, uint16_t *v2, uint16_t *v3, - int8_t *v4, int16_t *v5, int64_t *v6, - float *v7, double *v8, uint8_t *v9, - uint8_t *v10, - void *stream); +void LaunchSimt_ldst_policy_core_kernel(int *v1, void *stream); int main() { - size_t elemCount_v1 = 1024; + size_t elemCount_v1 = 16; size_t fileSize_v1 = elemCount_v1 * sizeof(int); - size_t elemCount_v2 = 1024; - size_t fileSize_v2 = elemCount_v2 * sizeof(uint16_t); - size_t fileSize_v4 = elemCount_v2 * sizeof(int8_t); - size_t fileSize_v5 = elemCount_v2 * sizeof(int16_t); - size_t fileSize_v6 = elemCount_v2 * sizeof(int64_t); - size_t fileSize_v7 = elemCount_v2 * sizeof(float); - size_t fileSize_v8 = elemCount_v2 * sizeof(double); - size_t fileSize_v9 = elemCount_v2 * sizeof(uint8_t); int *v1Host = nullptr; - uint16_t *v2Host = nullptr; - uint16_t *v3Host = nullptr; - int8_t *v4Host = nullptr; - int16_t *v5Host = nullptr; - int64_t *v6Host = nullptr; - float *v7Host 
= nullptr; - double *v8Host = nullptr; - uint8_t *v9Host = nullptr; - uint8_t *v10Host = nullptr; int *v1Device = nullptr; - uint16_t *v2Device = nullptr; - uint16_t *v3Device = nullptr; - int8_t *v4Device = nullptr; - int16_t *v5Device = nullptr; - int64_t *v6Device = nullptr; - float *v7Device = nullptr; - double *v8Device = nullptr; - uint8_t *v9Device = nullptr; - uint8_t *v10Device = nullptr; int rc = 0; bool aclInited = false; bool deviceSet = false; @@ -66,91 +35,16 @@ int main() { deviceSet = true; ACL_CHECK(aclrtCreateStream(&stream)); ACL_CHECK(aclrtMallocHost((void **)(&v1Host), fileSize_v1)); - ACL_CHECK(aclrtMallocHost((void **)(&v2Host), fileSize_v2)); - ACL_CHECK(aclrtMallocHost((void **)(&v3Host), fileSize_v2)); - ACL_CHECK(aclrtMallocHost((void **)(&v4Host), fileSize_v4)); - ACL_CHECK(aclrtMallocHost((void **)(&v5Host), fileSize_v5)); - ACL_CHECK(aclrtMallocHost((void **)(&v6Host), fileSize_v6)); - ACL_CHECK(aclrtMallocHost((void **)(&v7Host), fileSize_v7)); - ACL_CHECK(aclrtMallocHost((void **)(&v8Host), fileSize_v8)); - ACL_CHECK(aclrtMallocHost((void **)(&v9Host), fileSize_v9)); - ACL_CHECK(aclrtMallocHost((void **)(&v10Host), fileSize_v9)); ACL_CHECK(aclrtMalloc((void **)&v1Device, fileSize_v1, ACL_MEM_MALLOC_HUGE_FIRST)); - ACL_CHECK(aclrtMalloc((void **)&v2Device, fileSize_v2, ACL_MEM_MALLOC_HUGE_FIRST)); - ACL_CHECK(aclrtMalloc((void **)&v3Device, fileSize_v2, ACL_MEM_MALLOC_HUGE_FIRST)); - ACL_CHECK(aclrtMalloc((void **)&v4Device, fileSize_v4, ACL_MEM_MALLOC_HUGE_FIRST)); - ACL_CHECK(aclrtMalloc((void **)&v5Device, fileSize_v5, ACL_MEM_MALLOC_HUGE_FIRST)); - ACL_CHECK(aclrtMalloc((void **)&v6Device, fileSize_v6, ACL_MEM_MALLOC_HUGE_FIRST)); - ACL_CHECK(aclrtMalloc((void **)&v7Device, fileSize_v7, ACL_MEM_MALLOC_HUGE_FIRST)); - ACL_CHECK(aclrtMalloc((void **)&v8Device, fileSize_v8, ACL_MEM_MALLOC_HUGE_FIRST)); - ACL_CHECK(aclrtMalloc((void **)&v9Device, fileSize_v9, ACL_MEM_MALLOC_HUGE_FIRST)); - ACL_CHECK(aclrtMalloc((void **)&v10Device, 
fileSize_v9, ACL_MEM_MALLOC_HUGE_FIRST)); ReadFile("./v1.bin", fileSize_v1, v1Host, fileSize_v1); - ReadFile("./v2.bin", fileSize_v2, v2Host, fileSize_v2); - ReadFile("./v3.bin", fileSize_v2, v3Host, fileSize_v2); - ReadFile("./v4.bin", fileSize_v4, v4Host, fileSize_v4); - ReadFile("./v5.bin", fileSize_v5, v5Host, fileSize_v5); - ReadFile("./v6.bin", fileSize_v6, v6Host, fileSize_v6); - ReadFile("./v7.bin", fileSize_v7, v7Host, fileSize_v7); - ReadFile("./v8.bin", fileSize_v8, v8Host, fileSize_v8); - ReadFile("./v9.bin", fileSize_v9, v9Host, fileSize_v9); - ReadFile("./v10.bin", fileSize_v9, v10Host, fileSize_v9); ACL_CHECK(aclrtMemcpy(v1Device, fileSize_v1, v1Host, fileSize_v1, ACL_MEMCPY_HOST_TO_DEVICE)); - ACL_CHECK(aclrtMemcpy(v2Device, fileSize_v2, v2Host, fileSize_v2, ACL_MEMCPY_HOST_TO_DEVICE)); - ACL_CHECK(aclrtMemcpy(v3Device, fileSize_v2, v3Host, fileSize_v2, ACL_MEMCPY_HOST_TO_DEVICE)); - ACL_CHECK(aclrtMemcpy(v4Device, fileSize_v4, v4Host, fileSize_v4, ACL_MEMCPY_HOST_TO_DEVICE)); - ACL_CHECK(aclrtMemcpy(v5Device, fileSize_v5, v5Host, fileSize_v5, ACL_MEMCPY_HOST_TO_DEVICE)); - ACL_CHECK(aclrtMemcpy(v6Device, fileSize_v6, v6Host, fileSize_v6, ACL_MEMCPY_HOST_TO_DEVICE)); - ACL_CHECK(aclrtMemcpy(v7Device, fileSize_v7, v7Host, fileSize_v7, ACL_MEMCPY_HOST_TO_DEVICE)); - ACL_CHECK(aclrtMemcpy(v8Device, fileSize_v8, v8Host, fileSize_v8, ACL_MEMCPY_HOST_TO_DEVICE)); - ACL_CHECK(aclrtMemcpy(v9Device, fileSize_v9, v9Host, fileSize_v9, ACL_MEMCPY_HOST_TO_DEVICE)); - ACL_CHECK(aclrtMemcpy(v10Device, fileSize_v9, v10Host, fileSize_v9, ACL_MEMCPY_HOST_TO_DEVICE)); - LaunchSimt_ldst_policy_core_kernel(v1Device, v2Device, v3Device, v4Device, - v5Device, v6Device, v7Device, v8Device, - v9Device, v10Device, - stream); + LaunchSimt_ldst_policy_core_kernel(v1Device, stream); ACL_CHECK(aclrtSynchronizeStream(stream)); ACL_CHECK(aclrtMemcpy(v1Host, fileSize_v1, v1Device, fileSize_v1, ACL_MEMCPY_DEVICE_TO_HOST)); - ACL_CHECK(aclrtMemcpy(v2Host, fileSize_v2, v2Device, 
fileSize_v2, ACL_MEMCPY_DEVICE_TO_HOST)); - ACL_CHECK(aclrtMemcpy(v3Host, fileSize_v2, v3Device, fileSize_v2, ACL_MEMCPY_DEVICE_TO_HOST)); - ACL_CHECK(aclrtMemcpy(v4Host, fileSize_v4, v4Device, fileSize_v4, ACL_MEMCPY_DEVICE_TO_HOST)); - ACL_CHECK(aclrtMemcpy(v5Host, fileSize_v5, v5Device, fileSize_v5, ACL_MEMCPY_DEVICE_TO_HOST)); - ACL_CHECK(aclrtMemcpy(v6Host, fileSize_v6, v6Device, fileSize_v6, ACL_MEMCPY_DEVICE_TO_HOST)); - ACL_CHECK(aclrtMemcpy(v7Host, fileSize_v7, v7Device, fileSize_v7, ACL_MEMCPY_DEVICE_TO_HOST)); - ACL_CHECK(aclrtMemcpy(v8Host, fileSize_v8, v8Device, fileSize_v8, ACL_MEMCPY_DEVICE_TO_HOST)); - ACL_CHECK(aclrtMemcpy(v9Host, fileSize_v9, v9Device, fileSize_v9, ACL_MEMCPY_DEVICE_TO_HOST)); - ACL_CHECK(aclrtMemcpy(v10Host, fileSize_v9, v10Device, fileSize_v9, ACL_MEMCPY_DEVICE_TO_HOST)); WriteFile("./v1.bin", v1Host, fileSize_v1); - WriteFile("./v2.bin", v2Host, fileSize_v2); - WriteFile("./v3.bin", v3Host, fileSize_v2); - WriteFile("./v4.bin", v4Host, fileSize_v4); - WriteFile("./v5.bin", v5Host, fileSize_v5); - WriteFile("./v6.bin", v6Host, fileSize_v6); - WriteFile("./v7.bin", v7Host, fileSize_v7); - WriteFile("./v8.bin", v8Host, fileSize_v8); - WriteFile("./v9.bin", v9Host, fileSize_v9); - WriteFile("./v10.bin", v10Host, fileSize_v9); cleanup: - aclrtFree(v10Device); - aclrtFree(v9Device); - aclrtFree(v8Device); - aclrtFree(v7Device); - aclrtFree(v6Device); - aclrtFree(v5Device); - aclrtFree(v4Device); - aclrtFree(v3Device); - aclrtFree(v2Device); aclrtFree(v1Device); - aclrtFreeHost(v8Host); - aclrtFreeHost(v10Host); - aclrtFreeHost(v9Host); - aclrtFreeHost(v7Host); - aclrtFreeHost(v6Host); - aclrtFreeHost(v5Host); - aclrtFreeHost(v4Host); - aclrtFreeHost(v3Host); - aclrtFreeHost(v2Host); aclrtFreeHost(v1Host); if (stream) aclrtDestroyStream(stream); diff --git a/test/vpto/cases/micro-op/unary-vector/vabs-f16/compare.py b/test/vpto/cases/micro-op/unary-vector/vabs-f16/compare.py deleted file mode 100755 index 77d269686..000000000 --- 
a/test/vpto/cases/micro-op/unary-vector/vabs-f16/compare.py +++ /dev/null @@ -1,209 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) 2026 Huawei Technologies Co., Ltd. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. - -# case: micro-op/unary-vector/vabs-f16 -# family: unary-vector -# target_ops: pto.vabs -# scenarios: core-f16, full-mask -# NOTE: bulk-generated coverage skeleton. -# coding=utf-8 - -import os -import sys -import numpy as np - - -REPEAT_BYTES = 256 - - -def _ceil_div(x, y): - return (x + y - 1) // y - - -def _packed_pred_storage_bytes(logical_elems, src_elem_bytes): - logical_elems = int(logical_elems) - src_elem_bytes = int(src_elem_bytes) - if logical_elems <= 0: - raise ValueError(f"logical_elems must be > 0, got {logical_elems}") - if src_elem_bytes not in (1, 2, 4): - raise ValueError(f"unsupported packed predicate source size: {src_elem_bytes}") - - repeat_elems = REPEAT_BYTES // src_elem_bytes - if src_elem_bytes == 4: - repeat_times = _ceil_div(logical_elems, repeat_elems) + 1 - loop_count = repeat_times // 2 - return loop_count * 16 - - repeat_times = _ceil_div(logical_elems, repeat_elems) - return repeat_times * (repeat_elems // 8) - - -def compare_bin(golden_path, output_path, dtype, eps): - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - dtype_np = 
np.dtype(dtype) - golden = np.fromfile(golden_path, dtype=dtype_np) - output = np.fromfile(output_path, dtype=dtype_np) - if golden.shape != output.shape: - print(f"[ERROR] Shape mismatch: {golden_path} {golden.shape} vs {output_path} {output.shape}") - return False - if not np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True): - if golden.size: - if np.issubdtype(dtype_np, np.floating): - g = golden.astype(np.float64, copy=False) - o = output.astype(np.float64, copy=False) - elif np.issubdtype(dtype_np, np.integer) or np.issubdtype(dtype_np, np.unsignedinteger): - g = golden.astype(np.int64, copy=False) - o = output.astype(np.int64, copy=False) - else: - g = golden.astype(np.float64, copy=False) - o = output.astype(np.float64, copy=False) - abs_diff = np.abs(g - o) - idx = int(np.argmax(abs_diff)) - diff = float(abs_diff[idx]) - print( - f"[ERROR] Mismatch: {golden_path} vs {output_path}, max diff={diff} at idx={idx} " - f"(golden={g[idx]}, out={o[idx]}, dtype={dtype_np})" - ) - else: - print(f"[ERROR] Mismatch: {golden_path} vs {output_path}, empty buffers, dtype={dtype_np}") - return False - return True - - -def compare_bin_prefix(golden_path, output_path, dtype, eps, count): - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - try: - count = int(count) - except Exception: - print(f"[ERROR] Invalid prefix count: {count}") - return False - if count <= 0: - print(f"[ERROR] Invalid prefix count: {count}") - return False - - dtype_np = np.dtype(dtype) - golden = np.fromfile(golden_path, dtype=dtype_np, count=count) - output = np.fromfile(output_path, dtype=dtype_np, count=count) - - if golden.size != count or output.size != count: - print( - f"[ERROR] Prefix read too small: need={count} elems, " - f"golden={golden.size}, out={output.size}" - ) - return False - - if not np.allclose(golden, output, 
atol=eps, rtol=eps, equal_nan=True): - if golden.size: - if np.issubdtype(dtype_np, np.floating): - g = golden.astype(np.float64, copy=False) - o = output.astype(np.float64, copy=False) - elif np.issubdtype(dtype_np, np.integer) or np.issubdtype(dtype_np, np.unsignedinteger): - g = golden.astype(np.int64, copy=False) - o = output.astype(np.int64, copy=False) - else: - g = golden.astype(np.float64, copy=False) - o = output.astype(np.float64, copy=False) - abs_diff = np.abs(g - o) - idx = int(np.argmax(abs_diff)) - diff = float(abs_diff[idx]) - print( - f"[ERROR] Mismatch (prefix): {golden_path} vs {output_path}, max diff={diff} at idx={idx} " - f"(golden={g[idx]}, out={o[idx]}, dtype={dtype_np}, count={count})" - ) - else: - print(f"[ERROR] Mismatch (prefix): {golden_path} vs {output_path}, empty buffers, dtype={dtype_np}") - return False - return True - - -def compare_packed_pred_mask(golden_path, output_path, logical_elems, src_elem_bytes): - """ - Compare outputs of pto.tcmp / pto.tcmps. - - PTO-ISA stores packed predicate results as a linear PK byte stream via - `psts`, with the exact written prefix length determined by the typed - TCMP/TCMPS repeat schedule. Compare only that semantic prefix. 
- """ - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - try: - logical_elems = int(logical_elems) - src_elem_bytes = int(src_elem_bytes) - except Exception: - print( - "[ERROR] Invalid packed mask compare arguments: " - f"logical_elems={logical_elems} src_elem_bytes={src_elem_bytes}" - ) - return False - if logical_elems <= 0 or src_elem_bytes <= 0: - print( - "[ERROR] Invalid packed mask compare arguments: " - f"logical_elems={logical_elems} src_elem_bytes={src_elem_bytes}" - ) - return False - - golden = np.fromfile(golden_path, dtype=np.uint8) - output = np.fromfile(output_path, dtype=np.uint8) - try: - prefix_bytes = _packed_pred_storage_bytes(logical_elems, src_elem_bytes) - except ValueError as exc: - print(f"[ERROR] {exc}") - return False - - if golden.size < prefix_bytes or output.size < prefix_bytes: - print( - f"[ERROR] Packed mask buffer too small: need={prefix_bytes} bytes, " - f"golden={golden.size}, out={output.size}" - ) - return False - - golden_sel = golden[:prefix_bytes] - output_sel = output[:prefix_bytes] - - if not np.array_equal(golden_sel, output_sel): - diff = np.nonzero(golden_sel != output_sel)[0] - idx = int(diff[0]) if diff.size else 0 - print( - f"[ERROR] Mismatch (packed mask): {golden_path} vs {output_path}, first diff at idx={idx} " - f"(golden={int(golden_sel[idx])}, out={int(output_sel[idx])}, " - f"logical_elems={logical_elems}, src_elem_bytes={src_elem_bytes}, prefix_bytes={prefix_bytes})" - ) - return False - return True - - -def main(): - strict = os.getenv("COMPARE_STRICT", "1") != "0" - ok = True - ok = compare_bin("golden_v2.bin", "v2.bin", np.float32, 0.0001) and ok - if not ok: - if strict: - print("[ERROR] compare failed") - sys.exit(2) - print("[WARN] compare failed (non-gating)") - return - print("[INFO] compare passed") - - -if __name__ == "__main__": - main() 
diff --git a/test/vpto/cases/micro-op/unary-vector/vabs-f16/golden.py b/test/vpto/cases/micro-op/unary-vector/vabs-f16/golden.py deleted file mode 100755 index b90b097ce..000000000 --- a/test/vpto/cases/micro-op/unary-vector/vabs-f16/golden.py +++ /dev/null @@ -1,51 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) 2026 Huawei Technologies Co., Ltd. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. - -# case: micro-op/unary-vector/vabs-f16 -# family: unary-vector -# target_ops: pto.vabs -# scenarios: core-f16, full-mask -# NOTE: bulk-generated coverage skeleton. -# coding=utf-8 - -import argparse -from pathlib import Path - -import numpy as np - - -ROWS = 32 -COLS = 32 -SEED = 19 - - -def generate(output_dir: Path, seed: int) -> None: - rng = np.random.default_rng(seed) - v1 = rng.uniform(-8.0, 8.0, size=(ROWS, COLS)).astype(np.float32) - v2 = np.zeros((ROWS, COLS), dtype=np.float32) - golden_v2 = np.abs(v1).astype(np.float32, copy=False) - - output_dir.mkdir(parents=True, exist_ok=True) - v1.reshape(-1).tofile(output_dir / "v1.bin") - v2.reshape(-1).tofile(output_dir / "v2.bin") - golden_v2.reshape(-1).tofile(output_dir / "golden_v2.bin") - - -def main() -> None: - parser = argparse.ArgumentParser( - description="Generate numpy-based inputs/golden for VPTO micro-op vabs validation." 
- ) - parser.add_argument("--output-dir", type=Path, default=Path(".")) - parser.add_argument("--seed", type=int, default=SEED) - args = parser.parse_args() - generate(args.output_dir, args.seed) - - -if __name__ == "__main__": - main() diff --git a/test/vpto/cases/micro-op/unary-vector/vabs-f16/kernel.pto b/test/vpto/cases/micro-op/unary-vector/vabs-f16/kernel.pto deleted file mode 100644 index ae2af0a2e..000000000 --- a/test/vpto/cases/micro-op/unary-vector/vabs-f16/kernel.pto +++ /dev/null @@ -1,70 +0,0 @@ -// ----------------------------------------------------------------------------- -// case: micro-op/unary-vector/vabs-f16 -// family: unary-vector -// target_ops: pto.vabs -// scenarios: core-f16, full-mask -// NOTE: bulk-generated coverage skeleton. Parser/verifier/lowering failure is -// still a valid test conclusion in the current coverage-first phase. -// ----------------------------------------------------------------------------- -// ============================================================================= -// abs_kernel_2d: Element-wise absolute value on a 32x32 f32 tile -// ============================================================================= -// This kernel computes abs(input) for a 32x32 float32 matrix (1024 elements). -// -// Memory Layout: -// - Input: arg0 -> GM (Global Memory) -// - Output: arg1 -> GM (Global Memory) -// - UB (Unified Buffer) at offset 0: input tile (4096 bytes = 32*32*4) -// - UB at offset 4096: output tile (4096 bytes = 32*32*4) -// -// Pipeline: -// 1. DMA: GM -> UB (MTE2 pipe) - copy input tile to UB -// 2. Sync: wait for MTE2 -> V pipe handoff -// 3. Compute: vabs on 64-element vectors (V pipe) - 16 iterations for 1024 elements -// 4. Sync: wait for V -> MTE3 pipe handoff -// 5. 
DMA: UB -> GM (MTE3 pipe) - copy result tile back to GM -// ============================================================================= - -module attributes {pto.target_arch = "a5", pto.kernel_kind = #pto.kernel_kind} { - func.func @vabs_kernel_2d(%arg0: !pto.ptr, %arg1: !pto.ptr) attributes {pto.kernel} { - %c0 = arith.constant 0 : index - %c1 = arith.constant 1 : index - %c64 = arith.constant 64 : index - %c1024 = arith.constant 1024 : index - %c0_i64 = arith.constant 0 : i64 - %c1_i64 = arith.constant 1 : i64 - %c32_i64 = arith.constant 32 : i64 - %c128_i64 = arith.constant 128 : i64 - %c4096_i64 = arith.constant 4096 : i64 - %c1024_i32 = arith.constant 1024 : i32 - - %ub_in = pto.castptr %c0_i64 : i64 -> !pto.ptr - %ub_out = pto.castptr %c4096_i64 : i64 -> !pto.ptr - - %false = arith.constant false - pto.mte_gm_ub %arg0, %ub_in, %c0_i64, %c128_i64 - nburst(%c32_i64, %c128_i64, %c128_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 - - pto.set_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] - pto.wait_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] - - pto.vecscope { - %_:1 = scf.for %offset = %c0 to %c1024 step %c64 iter_args(%remaining = %c1024_i32) -> (i32) { - %mask, %next_remaining = pto.plt_b32 %remaining : i32 -> !pto.mask, i32 - %vec = pto.vlds %ub_in[%offset] : !pto.ptr -> !pto.vreg<64xf32> - %out = pto.vabs %vec, %mask : !pto.vreg<64xf32>, !pto.mask -> !pto.vreg<64xf32> - pto.vsts %out, %ub_out[%offset], %mask : !pto.vreg<64xf32>, !pto.ptr, !pto.mask - scf.yield %next_remaining : i32 - } - } - - pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.mte_ub_gm %ub_out, %arg1, %c128_i64 - nburst(%c32_i64, %c128_i64, %c128_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64 - pto.barrier #pto.pipe - return - } -} diff --git a/test/vpto/cases/micro-op/unary-vector/vabs-f16/launch.cpp b/test/vpto/cases/micro-op/unary-vector/vabs-f16/launch.cpp deleted file mode 100644 index 58cdd948a..000000000 --- 
a/test/vpto/cases/micro-op/unary-vector/vabs-f16/launch.cpp +++ /dev/null @@ -1,70 +0,0 @@ -// Copyright (c) 2026 Huawei Technologies Co., Ltd. -// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). -// Please refer to the License for details. You may not use this file except in compliance with the License. -// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -// See LICENSE in the root of the software repository for the full text of the License. - -// ----------------------------------------------------------------------------- -// case: micro-op/unary-vector/vabs-f16 -// family: unary-vector -// target_ops: pto.vabs -// scenarios: core-f16, full-mask -// NOTE: bulk-generated coverage skeleton. Parser/verifier/lowering failure is -// still a valid test conclusion in the current coverage-first phase. -// ----------------------------------------------------------------------------- -// --------------------------------------------------------------------------- -// PTOAS compatibility layer -// -// The upstream pto-isa headers reference some FP8/FP4 types and the -// __VEC_SCOPE__ marker that are not available on every AICore arch/toolchain -// combination (e.g. __NPU_ARCH__==2201). -// -// For our PTOAS-generated kernels we don't rely on these types today, but the -// headers still mention them in templates/static_asserts. Provide minimal -// fallbacks to keep compilation working on dav-c220. 
-// --------------------------------------------------------------------------- -#ifndef __VEC_SCOPE__ -#define __VEC_SCOPE__ -#endif - -#if defined(__CCE_AICORE__) && defined(__NPU_ARCH__) && (__NPU_ARCH__ == 2201) -typedef struct { unsigned char v; } hifloat8_t; -typedef struct { unsigned char v; } float8_e4m3_t; -typedef struct { unsigned char v; } float8_e5m2_t; -typedef struct { unsigned char v; } float8_e8m0_t; -typedef struct { unsigned char v; } float4_e1m2x2_t; -typedef struct { unsigned char v; } float4_e2m1x2_t; -#endif -#include - -// AICore printf support is gated behind `--cce-enable-print` on some -// toolchains. When enabled, include the CCE print header so `cce::printf` -// resolves in device compilation. -#if defined(__CCE_AICORE__) && defined(PTOAS_ENABLE_CCE_PRINT) -#include -#endif - -// Some PTO-ISA types are only available in the __CCE_AICORE__ compilation -// path, but `bisheng -xcce` still performs a host-side parse pass. -// Provide minimal fallbacks only when the corresponding header wasn't -// pulled in by the selected arch implementation. -#if !defined(__CCE_AICORE__) && !defined(TMRGSORT_HPP) -struct MrgSortExecutedNumList { - uint16_t mrgSortList0; - uint16_t mrgSortList1; - uint16_t mrgSortList2; - uint16_t mrgSortList3; -}; -#endif -#ifndef __CPU_SIM -#include "acl/acl.h" -#endif - -extern "C" __global__ [aicore] void vabs_kernel_2d(__gm__ float *v1, - __gm__ float *v2); - -void LaunchVabs_kernel_2d(float *v1, float *v2, void *stream) { - vabs_kernel_2d<<<1, nullptr, stream>>>((__gm__ float *)v1, (__gm__ float *)v2); -} diff --git a/test/vpto/cases/micro-op/unary-vector/vabs-f16/main.cpp b/test/vpto/cases/micro-op/unary-vector/vabs-f16/main.cpp deleted file mode 100644 index 76001407d..000000000 --- a/test/vpto/cases/micro-op/unary-vector/vabs-f16/main.cpp +++ /dev/null @@ -1,130 +0,0 @@ -// Copyright (c) 2026 Huawei Technologies Co., Ltd. 
-// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). -// Please refer to the License for details. You may not use this file except in compliance with the License. -// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -// See LICENSE in the root of the software repository for the full text of the License. - -// ----------------------------------------------------------------------------- -// case: micro-op/unary-vector/vabs-f16 -// family: unary-vector -// target_ops: pto.vabs -// scenarios: core-f16, full-mask -// NOTE: bulk-generated coverage skeleton. Parser/verifier/lowering failure is -// still a valid test conclusion in the current coverage-first phase. -// ----------------------------------------------------------------------------- -/** -Copyright (c) 2025 Huawei Technologies Co., Ltd. -This program is free software, you can redistribute it and/or modify it under the terms and conditions of -CANN Open Software License Agreement Version 2.0 (the "License"). -Please refer to the License for details. You may not use this file except in compliance with the License. -THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -See LICENSE in the root of the software repository for the full text of the License. 
-*/ - -#include "test_common.h" -#include "acl/acl.h" -#include -#include -#include - -using namespace PtoTestCommon; - -#ifndef TMRGSORT_HPP -struct MrgSortExecutedNumList { - uint16_t mrgSortList0; - uint16_t mrgSortList1; - uint16_t mrgSortList2; - uint16_t mrgSortList3; -}; -#endif - -#define ACL_CHECK(expr) \ - do { \ - const aclError _ret = (expr); \ - if (_ret != ACL_SUCCESS) { \ - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", #expr, (int)_ret, __FILE__, __LINE__); \ - const char *_recent = aclGetRecentErrMsg(); \ - if (_recent != nullptr && _recent[0] != '\0') { \ - std::fprintf(stderr, "[ERROR] RecentErrMsg: %s\n", _recent); \ - } \ - rc = 1; \ - goto cleanup; \ - } \ - } while (0) - -void LaunchVabs_kernel_2d(float *v1, float *v2, void *stream); - -int main() { - size_t elemCount_v1 = 1024; - size_t fileSize_v1 = elemCount_v1 * sizeof(float); - size_t elemCount_v2 = 1024; - size_t fileSize_v2 = elemCount_v2 * sizeof(float); - float *v1Host = nullptr; - float *v1Device = nullptr; - float *v2Host = nullptr; - float *v2Device = nullptr; - - int rc = 0; - bool aclInited = false; - bool deviceSet = false; - int deviceId = 0; - aclrtStream stream = nullptr; - - ACL_CHECK(aclInit(nullptr)); - aclInited = true; - if (const char *envDevice = std::getenv("ACL_DEVICE_ID")) { - deviceId = std::atoi(envDevice); - } - ACL_CHECK(aclrtSetDevice(deviceId)); - deviceSet = true; - ACL_CHECK(aclrtCreateStream(&stream)); - - ACL_CHECK(aclrtMallocHost((void **)(&v1Host), fileSize_v1)); - ACL_CHECK(aclrtMallocHost((void **)(&v2Host), fileSize_v2)); - ACL_CHECK(aclrtMalloc((void **)&v1Device, fileSize_v1, ACL_MEM_MALLOC_HUGE_FIRST)); - ACL_CHECK(aclrtMalloc((void **)&v2Device, fileSize_v2, ACL_MEM_MALLOC_HUGE_FIRST)); - - ReadFile("./v1.bin", fileSize_v1, v1Host, fileSize_v1); - ReadFile("./v2.bin", fileSize_v2, v2Host, fileSize_v2); - ACL_CHECK(aclrtMemcpy(v1Device, fileSize_v1, v1Host, fileSize_v1, ACL_MEMCPY_HOST_TO_DEVICE)); - ACL_CHECK(aclrtMemcpy(v2Device, 
fileSize_v2, v2Host, fileSize_v2, ACL_MEMCPY_HOST_TO_DEVICE)); - LaunchVabs_kernel_2d(v1Device, v2Device, stream); - - ACL_CHECK(aclrtSynchronizeStream(stream)); - ACL_CHECK(aclrtMemcpy(v2Host, fileSize_v2, v2Device, fileSize_v2, ACL_MEMCPY_DEVICE_TO_HOST)); - - WriteFile("./v2.bin", v2Host, fileSize_v2); - -cleanup: - aclrtFree(v1Device); - aclrtFree(v2Device); - aclrtFreeHost(v1Host); - aclrtFreeHost(v2Host); - if (stream != nullptr) { - const aclError _ret = aclrtDestroyStream(stream); - if (_ret != ACL_SUCCESS) { - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", - "aclrtDestroyStream(stream)", (int)_ret, __FILE__, __LINE__); - } - stream = nullptr; - } - if (deviceSet) { - const aclError _ret = aclrtResetDevice(deviceId); - if (_ret != ACL_SUCCESS) { - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", - "aclrtResetDevice(deviceId)", (int)_ret, __FILE__, __LINE__); - } - } - if (aclInited) { - const aclError _ret = aclFinalize(); - if (_ret != ACL_SUCCESS) { - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", - "aclFinalize()", (int)_ret, __FILE__, __LINE__); - } - } - - return rc; -} diff --git a/test/vpto/cases/micro-op/unary-vector/vabs-f32-exceptional/compare.py b/test/vpto/cases/micro-op/unary-vector/vabs-f32-exceptional/compare.py deleted file mode 100644 index 962985a24..000000000 --- a/test/vpto/cases/micro-op/unary-vector/vabs-f32-exceptional/compare.py +++ /dev/null @@ -1,204 +0,0 @@ -#!/usr/bin/python3 -# Copyright (c) 2026 Huawei Technologies Co., Ltd. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. 
-# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. - -# coding=utf-8 - -import os -import sys -import numpy as np - - -REPEAT_BYTES = 256 - - -def _ceil_div(x, y): - return (x + y - 1) // y - - -def _packed_pred_storage_bytes(logical_elems, src_elem_bytes): - logical_elems = int(logical_elems) - src_elem_bytes = int(src_elem_bytes) - if logical_elems <= 0: - raise ValueError(f"logical_elems must be > 0, got {logical_elems}") - if src_elem_bytes not in (1, 2, 4): - raise ValueError(f"unsupported packed predicate source size: {src_elem_bytes}") - - repeat_elems = REPEAT_BYTES // src_elem_bytes - if src_elem_bytes == 4: - repeat_times = _ceil_div(logical_elems, repeat_elems) + 1 - loop_count = repeat_times // 2 - return loop_count * 16 - - repeat_times = _ceil_div(logical_elems, repeat_elems) - return repeat_times * (repeat_elems // 8) - - -def compare_bin(golden_path, output_path, dtype, eps): - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - dtype_np = np.dtype(dtype) - golden = np.fromfile(golden_path, dtype=dtype_np) - output = np.fromfile(output_path, dtype=dtype_np) - if golden.shape != output.shape: - print(f"[ERROR] Shape mismatch: {golden_path} {golden.shape} vs {output_path} {output.shape}") - return False - if not np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True): - if golden.size: - if np.issubdtype(dtype_np, np.floating): - g = golden.astype(np.float64, copy=False) - o = output.astype(np.float64, copy=False) - elif np.issubdtype(dtype_np, np.integer) or np.issubdtype(dtype_np, np.unsignedinteger): - g = golden.astype(np.int64, copy=False) - 
o = output.astype(np.int64, copy=False) - else: - g = golden.astype(np.float64, copy=False) - o = output.astype(np.float64, copy=False) - abs_diff = np.abs(g - o) - idx = int(np.argmax(abs_diff)) - diff = float(abs_diff[idx]) - print( - f"[ERROR] Mismatch: {golden_path} vs {output_path}, max diff={diff} at idx={idx} " - f"(golden={g[idx]}, out={o[idx]}, dtype={dtype_np})" - ) - else: - print(f"[ERROR] Mismatch: {golden_path} vs {output_path}, empty buffers, dtype={dtype_np}") - return False - return True - - -def compare_bin_prefix(golden_path, output_path, dtype, eps, count): - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - try: - count = int(count) - except Exception: - print(f"[ERROR] Invalid prefix count: {count}") - return False - if count <= 0: - print(f"[ERROR] Invalid prefix count: {count}") - return False - - dtype_np = np.dtype(dtype) - golden = np.fromfile(golden_path, dtype=dtype_np, count=count) - output = np.fromfile(output_path, dtype=dtype_np, count=count) - - if golden.size != count or output.size != count: - print( - f"[ERROR] Prefix read too small: need={count} elems, " - f"golden={golden.size}, out={output.size}" - ) - return False - - if not np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True): - if golden.size: - if np.issubdtype(dtype_np, np.floating): - g = golden.astype(np.float64, copy=False) - o = output.astype(np.float64, copy=False) - elif np.issubdtype(dtype_np, np.integer) or np.issubdtype(dtype_np, np.unsignedinteger): - g = golden.astype(np.int64, copy=False) - o = output.astype(np.int64, copy=False) - else: - g = golden.astype(np.float64, copy=False) - o = output.astype(np.float64, copy=False) - abs_diff = np.abs(g - o) - idx = int(np.argmax(abs_diff)) - diff = float(abs_diff[idx]) - print( - f"[ERROR] Mismatch (prefix): {golden_path} vs {output_path}, max 
diff={diff} at idx={idx} " - f"(golden={g[idx]}, out={o[idx]}, dtype={dtype_np}, count={count})" - ) - else: - print(f"[ERROR] Mismatch (prefix): {golden_path} vs {output_path}, empty buffers, dtype={dtype_np}") - return False - return True - - -def compare_packed_pred_mask(golden_path, output_path, logical_elems, src_elem_bytes): - """ - Compare outputs of pto.tcmp / pto.tcmps. - - PTO-ISA stores packed predicate results as a linear PK byte stream via - `psts`, with the exact written prefix length determined by the typed - TCMP/TCMPS repeat schedule. Compare only that semantic prefix. - """ - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - try: - logical_elems = int(logical_elems) - src_elem_bytes = int(src_elem_bytes) - except Exception: - print( - "[ERROR] Invalid packed mask compare arguments: " - f"logical_elems={logical_elems} src_elem_bytes={src_elem_bytes}" - ) - return False - if logical_elems <= 0 or src_elem_bytes <= 0: - print( - "[ERROR] Invalid packed mask compare arguments: " - f"logical_elems={logical_elems} src_elem_bytes={src_elem_bytes}" - ) - return False - - golden = np.fromfile(golden_path, dtype=np.uint8) - output = np.fromfile(output_path, dtype=np.uint8) - try: - prefix_bytes = _packed_pred_storage_bytes(logical_elems, src_elem_bytes) - except ValueError as exc: - print(f"[ERROR] {exc}") - return False - - if golden.size < prefix_bytes or output.size < prefix_bytes: - print( - f"[ERROR] Packed mask buffer too small: need={prefix_bytes} bytes, " - f"golden={golden.size}, out={output.size}" - ) - return False - - golden_sel = golden[:prefix_bytes] - output_sel = output[:prefix_bytes] - - if not np.array_equal(golden_sel, output_sel): - diff = np.nonzero(golden_sel != output_sel)[0] - idx = int(diff[0]) if diff.size else 0 - print( - f"[ERROR] Mismatch (packed mask): {golden_path} vs 
{output_path}, first diff at idx={idx} " - f"(golden={int(golden_sel[idx])}, out={int(output_sel[idx])}, " - f"logical_elems={logical_elems}, src_elem_bytes={src_elem_bytes}, prefix_bytes={prefix_bytes})" - ) - return False - return True - - -def main(): - strict = os.getenv("COMPARE_STRICT", "1") != "0" - ok = True - ok = compare_bin("golden_v2.bin", "v2.bin", np.float32, 0.0001) and ok - if not ok: - if strict: - print("[ERROR] compare failed") - sys.exit(2) - print("[WARN] compare failed (non-gating)") - return - print("[INFO] compare passed") - - -if __name__ == "__main__": - main() diff --git a/test/vpto/cases/micro-op/unary-vector/vabs-f32-exceptional/golden.py b/test/vpto/cases/micro-op/unary-vector/vabs-f32-exceptional/golden.py deleted file mode 100644 index 95f77e83a..000000000 --- a/test/vpto/cases/micro-op/unary-vector/vabs-f32-exceptional/golden.py +++ /dev/null @@ -1,48 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) 2026 Huawei Technologies Co., Ltd. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. 
- -# coding=utf-8 - -import argparse -from pathlib import Path - -import numpy as np - - -ROWS = 32 -COLS = 32 -SEED = 19 - - -def generate(output_dir: Path, seed: int) -> None: - del seed - specials = np.array( - [-np.inf, -7.5, -0.0, 0.0, 1.0, np.inf, np.nan, 3.5], - dtype=np.float32, - ) - v1 = np.resize(specials, ROWS * COLS).reshape(ROWS, COLS).astype(np.float32) - v2 = np.zeros((ROWS, COLS), dtype=np.float32) - golden_v2 = np.abs(v1).astype(np.float32, copy=False) - - output_dir.mkdir(parents=True, exist_ok=True) - v1.reshape(-1).tofile(output_dir / "v1.bin") - v2.reshape(-1).tofile(output_dir / "v2.bin") - golden_v2.reshape(-1).tofile(output_dir / "golden_v2.bin") - - -def main() -> None: - parser = argparse.ArgumentParser() - parser.add_argument("--output-dir", type=Path, default=Path(".")) - parser.add_argument("--seed", type=int, default=SEED) - args = parser.parse_args() - generate(args.output_dir, args.seed) - - -if __name__ == "__main__": - main() diff --git a/test/vpto/cases/micro-op/unary-vector/vabs-f32-exceptional/kernel.pto b/test/vpto/cases/micro-op/unary-vector/vabs-f32-exceptional/kernel.pto deleted file mode 100644 index e6eb1c661..000000000 --- a/test/vpto/cases/micro-op/unary-vector/vabs-f32-exceptional/kernel.pto +++ /dev/null @@ -1,42 +0,0 @@ -module attributes {pto.target_arch = "a5", pto.kernel_kind = #pto.kernel_kind} { - func.func @vabs_f32_exceptional_kernel_2d(%arg0: !pto.ptr, %arg1: !pto.ptr) attributes {pto.kernel} { - %c0 = arith.constant 0 : index - %c64 = arith.constant 64 : index - %c1024 = arith.constant 1024 : index - %c0_i64 = arith.constant 0 : i64 - %c1_i64 = arith.constant 1 : i64 - %c32_i64 = arith.constant 32 : i64 - %c128_i64 = arith.constant 128 : i64 - %c4096_i64 = arith.constant 4096 : i64 - %c1024_i32 = arith.constant 1024 : i32 - - %ub_in = pto.castptr %c0_i64 : i64 -> !pto.ptr - %ub_out = pto.castptr %c4096_i64 : i64 -> !pto.ptr - - %false = arith.constant false - pto.mte_gm_ub %arg0, %ub_in, %c0_i64, %c128_i64 - 
nburst(%c32_i64, %c128_i64, %c128_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 - - pto.set_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] - pto.wait_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] - - pto.vecscope { - %_:1 = scf.for %offset = %c0 to %c1024 step %c64 iter_args(%remaining = %c1024_i32) -> (i32) { - %mask, %next_remaining = pto.plt_b32 %remaining : i32 -> !pto.mask, i32 - %vec = pto.vlds %ub_in[%offset] : !pto.ptr -> !pto.vreg<64xf32> - %out = pto.vabs %vec, %mask : !pto.vreg<64xf32>, !pto.mask -> !pto.vreg<64xf32> - pto.vsts %out, %ub_out[%offset], %mask : !pto.vreg<64xf32>, !pto.ptr, !pto.mask - scf.yield %next_remaining : i32 - } - } - - pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.mte_ub_gm %ub_out, %arg1, %c128_i64 - nburst(%c32_i64, %c128_i64, %c128_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64 - pto.barrier #pto.pipe - return - } -} diff --git a/test/vpto/cases/micro-op/unary-vector/vabs-f32-exceptional/launch.cpp b/test/vpto/cases/micro-op/unary-vector/vabs-f32-exceptional/launch.cpp deleted file mode 100644 index 806579491..000000000 --- a/test/vpto/cases/micro-op/unary-vector/vabs-f32-exceptional/launch.cpp +++ /dev/null @@ -1,45 +0,0 @@ -// Copyright (c) 2026 Huawei Technologies Co., Ltd. -// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). -// Please refer to the License for details. You may not use this file except in compliance with the License. -// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -// See LICENSE in the root of the software repository for the full text of the License. 
- -#ifndef __VEC_SCOPE__ -#define __VEC_SCOPE__ -#endif - -#if defined(__CCE_AICORE__) && defined(__NPU_ARCH__) && (__NPU_ARCH__ == 2201) -typedef struct { unsigned char v; } hifloat8_t; -typedef struct { unsigned char v; } float8_e4m3_t; -typedef struct { unsigned char v; } float8_e5m2_t; -typedef struct { unsigned char v; } float8_e8m0_t; -typedef struct { unsigned char v; } float4_e1m2x2_t; -typedef struct { unsigned char v; } float4_e2m1x2_t; -#endif -#include - -#if defined(__CCE_AICORE__) && defined(PTOAS_ENABLE_CCE_PRINT) -#include -#endif - -#if !defined(__CCE_AICORE__) && !defined(TMRGSORT_HPP) -struct MrgSortExecutedNumList { - uint16_t mrgSortList0; - uint16_t mrgSortList1; - uint16_t mrgSortList2; - uint16_t mrgSortList3; -}; -#endif -#ifndef __CPU_SIM -#include "acl/acl.h" -#endif - -extern "C" __global__ [aicore] void vabs_f32_exceptional_kernel_2d(__gm__ float *v1, - __gm__ float *v2); - -void LaunchVabs_f32_exceptional_kernel_2d(float *v1, float *v2, void *stream) { - vabs_f32_exceptional_kernel_2d<<<1, nullptr, stream>>>((__gm__ float *)v1, - (__gm__ float *)v2); -} diff --git a/test/vpto/cases/micro-op/unary-vector/vabs-f32-exceptional/main.cpp b/test/vpto/cases/micro-op/unary-vector/vabs-f32-exceptional/main.cpp deleted file mode 100644 index b3312f7e2..000000000 --- a/test/vpto/cases/micro-op/unary-vector/vabs-f32-exceptional/main.cpp +++ /dev/null @@ -1,87 +0,0 @@ -// Copyright (c) 2026 Huawei Technologies Co., Ltd. -// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). -// Please refer to the License for details. You may not use this file except in compliance with the License. -// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. 
-// See LICENSE in the root of the software repository for the full text of the License. - -/** -Copyright (c) 2025 Huawei Technologies Co., Ltd. -*/ - -#include "test_common.h" -#include "acl/acl.h" -#include -#include - -using namespace PtoTestCommon; - -#define ACL_CHECK(expr) \ - do { \ - const aclError _ret = (expr); \ - if (_ret != ACL_SUCCESS) { \ - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", #expr, \ - (int)_ret, __FILE__, __LINE__); \ - const char *_recent = aclGetRecentErrMsg(); \ - if (_recent != nullptr && _recent[0] != '\0') \ - std::fprintf(stderr, "[ERROR] RecentErrMsg: %s\n", _recent); \ - rc = 1; \ - goto cleanup; \ - } \ - } while (0) - -void LaunchVabs_f32_exceptional_kernel_2d(float *v1, float *v2, void *stream); - -int main() { - size_t elemCount_v1 = 1024; - size_t fileSize_v1 = elemCount_v1 * sizeof(float); - size_t elemCount_v2 = 1024; - size_t fileSize_v2 = elemCount_v2 * sizeof(float); - float *v1Host = nullptr; - float *v1Device = nullptr; - float *v2Host = nullptr; - float *v2Device = nullptr; - - int rc = 0; - bool aclInited = false; - bool deviceSet = false; - int deviceId = 0; - aclrtStream stream = nullptr; - - ACL_CHECK(aclInit(nullptr)); - aclInited = true; - if (const char *envDevice = std::getenv("ACL_DEVICE_ID")) - deviceId = std::atoi(envDevice); - ACL_CHECK(aclrtSetDevice(deviceId)); - deviceSet = true; - ACL_CHECK(aclrtCreateStream(&stream)); - - ACL_CHECK(aclrtMallocHost((void **)(&v1Host), fileSize_v1)); - ACL_CHECK(aclrtMallocHost((void **)(&v2Host), fileSize_v2)); - ACL_CHECK(aclrtMalloc((void **)&v1Device, fileSize_v1, ACL_MEM_MALLOC_HUGE_FIRST)); - ACL_CHECK(aclrtMalloc((void **)&v2Device, fileSize_v2, ACL_MEM_MALLOC_HUGE_FIRST)); - - ReadFile("./v1.bin", fileSize_v1, v1Host, fileSize_v1); - ReadFile("./v2.bin", fileSize_v2, v2Host, fileSize_v2); - ACL_CHECK(aclrtMemcpy(v1Device, fileSize_v1, v1Host, fileSize_v1, ACL_MEMCPY_HOST_TO_DEVICE)); - ACL_CHECK(aclrtMemcpy(v2Device, fileSize_v2, v2Host, fileSize_v2, 
ACL_MEMCPY_HOST_TO_DEVICE)); - LaunchVabs_f32_exceptional_kernel_2d(v1Device, v2Device, stream); - - ACL_CHECK(aclrtSynchronizeStream(stream)); - ACL_CHECK(aclrtMemcpy(v2Host, fileSize_v2, v2Device, fileSize_v2, ACL_MEMCPY_DEVICE_TO_HOST)); - WriteFile("./v2.bin", v2Host, fileSize_v2); - -cleanup: - aclrtFree(v1Device); - aclrtFree(v2Device); - aclrtFreeHost(v1Host); - aclrtFreeHost(v2Host); - if (stream != nullptr) - aclrtDestroyStream(stream); - if (deviceSet) - aclrtResetDevice(deviceId); - if (aclInited) - aclFinalize(); - return rc; -} diff --git a/test/vpto/cases/micro-op/unary-vector/vabs-i16-signed-overflow-edge/compare.py b/test/vpto/cases/micro-op/unary-vector/vabs-i16-signed-overflow-edge/compare.py deleted file mode 100644 index 672b2df43..000000000 --- a/test/vpto/cases/micro-op/unary-vector/vabs-i16-signed-overflow-edge/compare.py +++ /dev/null @@ -1,42 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) 2026 Huawei Technologies Co., Ltd. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. 
- -# case: micro-op/unary-vector/vabs-i16-signed-overflow-edge -# family: unary-vector -# target_ops: pto.vabs -# scenarios: core-i16-signed, full-mask, integer-overflow - -import os -import sys - -import numpy as np - - -def compare_bin(golden_path: str, output_path: str) -> bool: - if not os.path.exists(golden_path) or not os.path.exists(output_path): - return False - golden = np.fromfile(golden_path, dtype=np.int16) - output = np.fromfile(output_path, dtype=np.int16) - return golden.shape == output.shape and np.array_equal(golden, output) - - -def main() -> None: - strict = os.getenv("COMPARE_STRICT", "1") != "0" - ok = compare_bin("golden_v2.bin", "v2.bin") - if not ok: - if strict: - print("[ERROR] compare failed") - sys.exit(2) - print("[WARN] compare failed (non-gating)") - return - print("[INFO] compare passed") - - -if __name__ == "__main__": - main() diff --git a/test/vpto/cases/micro-op/unary-vector/vabs-i16-signed-overflow-edge/golden.py b/test/vpto/cases/micro-op/unary-vector/vabs-i16-signed-overflow-edge/golden.py deleted file mode 100644 index e8562fdac..000000000 --- a/test/vpto/cases/micro-op/unary-vector/vabs-i16-signed-overflow-edge/golden.py +++ /dev/null @@ -1,51 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) 2026 Huawei Technologies Co., Ltd. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. 
- -# case: micro-op/unary-vector/vabs-i16-signed-overflow-edge -# family: unary-vector -# target_ops: pto.vabs -# scenarios: core-i16-signed, full-mask, integer-overflow - -import argparse -from pathlib import Path - -import numpy as np - - -ELEMS = 1024 -SEED = 19 - - -def generate(output_dir: Path, seed: int) -> None: - rng = np.random.default_rng(seed) - data = rng.integers(-30000, 30000, size=ELEMS, dtype=np.int16) - edge = np.array( - [-32768, -32767, -12345, -1, 0, 1, 12345, 32767, - -32768, -2, 2, -32766, 32766, -1024, 1024, -17], - dtype=np.int16, - ) - data[:edge.size] = edge - golden = np.abs(data).astype(np.int16, copy=False) - - output_dir.mkdir(parents=True, exist_ok=True) - data.tofile(output_dir / "v1.bin") - np.zeros(ELEMS, dtype=np.int16).tofile(output_dir / "v2.bin") - golden.tofile(output_dir / "golden_v2.bin") - - -def main() -> None: - parser = argparse.ArgumentParser() - parser.add_argument("--output-dir", type=Path, default=Path(".")) - parser.add_argument("--seed", type=int, default=SEED) - args = parser.parse_args() - generate(args.output_dir, args.seed) - - -if __name__ == "__main__": - main() diff --git a/test/vpto/cases/micro-op/unary-vector/vabs-i16-signed-overflow-edge/kernel.pto b/test/vpto/cases/micro-op/unary-vector/vabs-i16-signed-overflow-edge/kernel.pto deleted file mode 100644 index 199d3ebc6..000000000 --- a/test/vpto/cases/micro-op/unary-vector/vabs-i16-signed-overflow-edge/kernel.pto +++ /dev/null @@ -1,49 +0,0 @@ -// ----------------------------------------------------------------------------- -// case: micro-op/unary-vector/vabs-i16-signed-overflow-edge -// family: unary-vector -// target_ops: pto.vabs -// scenarios: core-i16-signed, full-mask, integer-overflow -// ----------------------------------------------------------------------------- -module attributes {pto.target_arch = "a5", pto.kernel_kind = #pto.kernel_kind} { - func.func @vabs_i16_signed_overflow_edge_kernel(%arg0: !pto.ptr, - %arg1: !pto.ptr) attributes 
{pto.kernel} { - %c0 = arith.constant 0 : index - %c128 = arith.constant 128 : index - %c1024 = arith.constant 1024 : index - %c0_i64 = arith.constant 0 : i64 - %c1_i64 = arith.constant 1 : i64 - %c32_i64 = arith.constant 32 : i64 - %c64_i64 = arith.constant 64 : i64 - %c2048_i64 = arith.constant 2048 : i64 - %c1024_i32 = arith.constant 1024 : i32 - - %ub_in = pto.castptr %c0_i64 : i64 -> !pto.ptr - %ub_out = pto.castptr %c2048_i64 : i64 -> !pto.ptr - - %false = arith.constant false - pto.mte_gm_ub %arg0, %ub_in, %c0_i64, %c64_i64 - nburst(%c32_i64, %c64_i64, %c64_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 - - pto.set_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] - pto.wait_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] - - pto.vecscope { - %_:1 = scf.for %offset = %c0 to %c1024 step %c128 iter_args(%remaining = %c1024_i32) -> (i32) { - %mask, %next_remaining = pto.plt_b16 %remaining : i32 -> !pto.mask, i32 - %vec = pto.vlds %ub_in[%offset] : !pto.ptr -> !pto.vreg<128xi16> - %out = pto.vabs %vec, %mask : !pto.vreg<128xi16>, !pto.mask -> !pto.vreg<128xi16> - pto.vsts %out, %ub_out[%offset], %mask : !pto.vreg<128xi16>, !pto.ptr, !pto.mask - scf.yield %next_remaining : i32 - } - } - - pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.mte_ub_gm %ub_out, %arg1, %c64_i64 - nburst(%c32_i64, %c64_i64, %c64_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64 - pto.barrier #pto.pipe - return - } -} diff --git a/test/vpto/cases/micro-op/unary-vector/vabs-i16-signed-overflow-edge/launch.cpp b/test/vpto/cases/micro-op/unary-vector/vabs-i16-signed-overflow-edge/launch.cpp deleted file mode 100644 index be3498f7e..000000000 --- a/test/vpto/cases/micro-op/unary-vector/vabs-i16-signed-overflow-edge/launch.cpp +++ /dev/null @@ -1,49 +0,0 @@ -// Copyright (c) 2026 Huawei Technologies Co., Ltd. 
-// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). -// Please refer to the License for details. You may not use this file except in compliance with the License. -// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -// See LICENSE in the root of the software repository for the full text of the License. - -// --------------------------------------------------------------------------- -// PTOAS compatibility layer -// --------------------------------------------------------------------------- -#ifndef __VEC_SCOPE__ -#define __VEC_SCOPE__ -#endif - -#if defined(__CCE_AICORE__) && defined(__NPU_ARCH__) && (__NPU_ARCH__ == 2201) -typedef struct { unsigned char v; } hifloat8_t; -typedef struct { unsigned char v; } float8_e4m3_t; -typedef struct { unsigned char v; } float8_e5m2_t; -typedef struct { unsigned char v; } float8_e8m0_t; -typedef struct { unsigned char v; } float4_e1m2x2_t; -typedef struct { unsigned char v; } float4_e2m1x2_t; -#endif -#include - -#if defined(__CCE_AICORE__) && defined(PTOAS_ENABLE_CCE_PRINT) -#include -#endif - -#if !defined(__CCE_AICORE__) && !defined(TMRGSORT_HPP) -struct MrgSortExecutedNumList { - uint16_t mrgSortList0; - uint16_t mrgSortList1; - uint16_t mrgSortList2; - uint16_t mrgSortList3; -}; -#endif -#ifndef __CPU_SIM -#include "acl/acl.h" -#endif - -extern "C" __global__ [aicore] void vabs_i16_signed_overflow_edge_kernel( - __gm__ int16_t *v1, __gm__ int16_t *v2); - -void LaunchVabs_i16_signed_overflow_edge_kernel(int16_t *v1, int16_t *v2, - void *stream) { - vabs_i16_signed_overflow_edge_kernel<<<1, nullptr, stream>>>( - (__gm__ int16_t *)v1, (__gm__ int16_t *)v2); -} diff --git 
a/test/vpto/cases/micro-op/unary-vector/vabs-i16-signed-overflow-edge/main.cpp b/test/vpto/cases/micro-op/unary-vector/vabs-i16-signed-overflow-edge/main.cpp deleted file mode 100644 index 55de29f79..000000000 --- a/test/vpto/cases/micro-op/unary-vector/vabs-i16-signed-overflow-edge/main.cpp +++ /dev/null @@ -1,103 +0,0 @@ -// Copyright (c) 2026 Huawei Technologies Co., Ltd. -// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). -// Please refer to the License for details. You may not use this file except in compliance with the License. -// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -// See LICENSE in the root of the software repository for the full text of the License. - -// ----------------------------------------------------------------------------- -// case: micro-op/unary-vector/vabs-i16-signed-overflow-edge -// family: unary-vector -// target_ops: pto.vabs -// scenarios: core-i16-signed, full-mask, integer-overflow -// ----------------------------------------------------------------------------- -/** -Copyright (c) 2025 Huawei Technologies Co., Ltd. 
-*/ - -#include "test_common.h" -#include "acl/acl.h" -#include -#include -#include - -using namespace PtoTestCommon; - -#ifndef TMRGSORT_HPP -struct MrgSortExecutedNumList { - uint16_t mrgSortList0; - uint16_t mrgSortList1; - uint16_t mrgSortList2; - uint16_t mrgSortList3; -}; -#endif - -#define ACL_CHECK(expr) \ - do { \ - const aclError _ret = (expr); \ - if (_ret != ACL_SUCCESS) { \ - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", #expr, \ - (int)_ret, __FILE__, __LINE__); \ - const char *_recent = aclGetRecentErrMsg(); \ - if (_recent != nullptr && _recent[0] != '\0') \ - std::fprintf(stderr, "[ERROR] RecentErrMsg: %s\n", _recent); \ - rc = 1; \ - goto cleanup; \ - } \ - } while (0) - -void LaunchVabs_i16_signed_overflow_edge_kernel(int16_t *v1, int16_t *v2, - void *stream); - -int main() { - size_t elemCount_v1 = 1024; - size_t fileSize_v1 = elemCount_v1 * sizeof(int16_t); - size_t elemCount_v2 = 1024; - size_t fileSize_v2 = elemCount_v2 * sizeof(int16_t); - int16_t *v1Host = nullptr; - int16_t *v1Device = nullptr; - int16_t *v2Host = nullptr; - int16_t *v2Device = nullptr; - int rc = 0; - bool aclInited = false; - bool deviceSet = false; - int deviceId = 0; - aclrtStream stream = nullptr; - - ACL_CHECK(aclInit(nullptr)); - aclInited = true; - if (const char *envDevice = std::getenv("ACL_DEVICE_ID")) - deviceId = std::atoi(envDevice); - ACL_CHECK(aclrtSetDevice(deviceId)); - deviceSet = true; - ACL_CHECK(aclrtCreateStream(&stream)); - ACL_CHECK(aclrtMallocHost((void **)(&v1Host), fileSize_v1)); - ACL_CHECK(aclrtMallocHost((void **)(&v2Host), fileSize_v2)); - ACL_CHECK(aclrtMalloc((void **)&v1Device, fileSize_v1, ACL_MEM_MALLOC_HUGE_FIRST)); - ACL_CHECK(aclrtMalloc((void **)&v2Device, fileSize_v2, ACL_MEM_MALLOC_HUGE_FIRST)); - ReadFile("./v1.bin", fileSize_v1, v1Host, fileSize_v1); - ReadFile("./v2.bin", fileSize_v2, v2Host, fileSize_v2); - ACL_CHECK(aclrtMemcpy(v1Device, fileSize_v1, v1Host, fileSize_v1, - ACL_MEMCPY_HOST_TO_DEVICE)); - 
ACL_CHECK(aclrtMemcpy(v2Device, fileSize_v2, v2Host, fileSize_v2, - ACL_MEMCPY_HOST_TO_DEVICE)); - LaunchVabs_i16_signed_overflow_edge_kernel(v1Device, v2Device, stream); - ACL_CHECK(aclrtSynchronizeStream(stream)); - ACL_CHECK(aclrtMemcpy(v2Host, fileSize_v2, v2Device, fileSize_v2, - ACL_MEMCPY_DEVICE_TO_HOST)); - WriteFile("./v2.bin", v2Host, fileSize_v2); - -cleanup: - aclrtFree(v1Device); - aclrtFree(v2Device); - aclrtFreeHost(v1Host); - aclrtFreeHost(v2Host); - if (stream != nullptr) - aclrtDestroyStream(stream); - if (deviceSet) - aclrtResetDevice(deviceId); - if (aclInited) - aclFinalize(); - return rc; -} diff --git a/test/vpto/cases/micro-op/unary-vector/vabs-i16-signed/compare.py b/test/vpto/cases/micro-op/unary-vector/vabs-i16-signed/compare.py deleted file mode 100755 index eca2ddc70..000000000 --- a/test/vpto/cases/micro-op/unary-vector/vabs-i16-signed/compare.py +++ /dev/null @@ -1,209 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) 2026 Huawei Technologies Co., Ltd. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. - -# case: micro-op/unary-vector/vabs-i16-signed -# family: unary-vector -# target_ops: pto.vabs -# scenarios: core-i16-signed, full-mask -# NOTE: bulk-generated coverage skeleton. 
-# coding=utf-8 - -import os -import sys -import numpy as np - - -REPEAT_BYTES = 256 - - -def _ceil_div(x, y): - return (x + y - 1) // y - - -def _packed_pred_storage_bytes(logical_elems, src_elem_bytes): - logical_elems = int(logical_elems) - src_elem_bytes = int(src_elem_bytes) - if logical_elems <= 0: - raise ValueError(f"logical_elems must be > 0, got {logical_elems}") - if src_elem_bytes not in (1, 2, 4): - raise ValueError(f"unsupported packed predicate source size: {src_elem_bytes}") - - repeat_elems = REPEAT_BYTES // src_elem_bytes - if src_elem_bytes == 4: - repeat_times = _ceil_div(logical_elems, repeat_elems) + 1 - loop_count = repeat_times // 2 - return loop_count * 16 - - repeat_times = _ceil_div(logical_elems, repeat_elems) - return repeat_times * (repeat_elems // 8) - - -def compare_bin(golden_path, output_path, dtype, eps): - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - dtype_np = np.dtype(dtype) - golden = np.fromfile(golden_path, dtype=dtype_np) - output = np.fromfile(output_path, dtype=dtype_np) - if golden.shape != output.shape: - print(f"[ERROR] Shape mismatch: {golden_path} {golden.shape} vs {output_path} {output.shape}") - return False - if not np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True): - if golden.size: - if np.issubdtype(dtype_np, np.floating): - g = golden.astype(np.float64, copy=False) - o = output.astype(np.float64, copy=False) - elif np.issubdtype(dtype_np, np.integer) or np.issubdtype(dtype_np, np.unsignedinteger): - g = golden.astype(np.int64, copy=False) - o = output.astype(np.int64, copy=False) - else: - g = golden.astype(np.float64, copy=False) - o = output.astype(np.float64, copy=False) - abs_diff = np.abs(g - o) - idx = int(np.argmax(abs_diff)) - diff = float(abs_diff[idx]) - print( - f"[ERROR] Mismatch: {golden_path} vs {output_path}, max 
diff={diff} at idx={idx} " - f"(golden={g[idx]}, out={o[idx]}, dtype={dtype_np})" - ) - else: - print(f"[ERROR] Mismatch: {golden_path} vs {output_path}, empty buffers, dtype={dtype_np}") - return False - return True - - -def compare_bin_prefix(golden_path, output_path, dtype, eps, count): - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - try: - count = int(count) - except Exception: - print(f"[ERROR] Invalid prefix count: {count}") - return False - if count <= 0: - print(f"[ERROR] Invalid prefix count: {count}") - return False - - dtype_np = np.dtype(dtype) - golden = np.fromfile(golden_path, dtype=dtype_np, count=count) - output = np.fromfile(output_path, dtype=dtype_np, count=count) - - if golden.size != count or output.size != count: - print( - f"[ERROR] Prefix read too small: need={count} elems, " - f"golden={golden.size}, out={output.size}" - ) - return False - - if not np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True): - if golden.size: - if np.issubdtype(dtype_np, np.floating): - g = golden.astype(np.float64, copy=False) - o = output.astype(np.float64, copy=False) - elif np.issubdtype(dtype_np, np.integer) or np.issubdtype(dtype_np, np.unsignedinteger): - g = golden.astype(np.int64, copy=False) - o = output.astype(np.int64, copy=False) - else: - g = golden.astype(np.float64, copy=False) - o = output.astype(np.float64, copy=False) - abs_diff = np.abs(g - o) - idx = int(np.argmax(abs_diff)) - diff = float(abs_diff[idx]) - print( - f"[ERROR] Mismatch (prefix): {golden_path} vs {output_path}, max diff={diff} at idx={idx} " - f"(golden={g[idx]}, out={o[idx]}, dtype={dtype_np}, count={count})" - ) - else: - print(f"[ERROR] Mismatch (prefix): {golden_path} vs {output_path}, empty buffers, dtype={dtype_np}") - return False - return True - - -def compare_packed_pred_mask(golden_path, output_path, 
logical_elems, src_elem_bytes): - """ - Compare outputs of pto.tcmp / pto.tcmps. - - PTO-ISA stores packed predicate results as a linear PK byte stream via - `psts`, with the exact written prefix length determined by the typed - TCMP/TCMPS repeat schedule. Compare only that semantic prefix. - """ - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - try: - logical_elems = int(logical_elems) - src_elem_bytes = int(src_elem_bytes) - except Exception: - print( - "[ERROR] Invalid packed mask compare arguments: " - f"logical_elems={logical_elems} src_elem_bytes={src_elem_bytes}" - ) - return False - if logical_elems <= 0 or src_elem_bytes <= 0: - print( - "[ERROR] Invalid packed mask compare arguments: " - f"logical_elems={logical_elems} src_elem_bytes={src_elem_bytes}" - ) - return False - - golden = np.fromfile(golden_path, dtype=np.uint8) - output = np.fromfile(output_path, dtype=np.uint8) - try: - prefix_bytes = _packed_pred_storage_bytes(logical_elems, src_elem_bytes) - except ValueError as exc: - print(f"[ERROR] {exc}") - return False - - if golden.size < prefix_bytes or output.size < prefix_bytes: - print( - f"[ERROR] Packed mask buffer too small: need={prefix_bytes} bytes, " - f"golden={golden.size}, out={output.size}" - ) - return False - - golden_sel = golden[:prefix_bytes] - output_sel = output[:prefix_bytes] - - if not np.array_equal(golden_sel, output_sel): - diff = np.nonzero(golden_sel != output_sel)[0] - idx = int(diff[0]) if diff.size else 0 - print( - f"[ERROR] Mismatch (packed mask): {golden_path} vs {output_path}, first diff at idx={idx} " - f"(golden={int(golden_sel[idx])}, out={int(output_sel[idx])}, " - f"logical_elems={logical_elems}, src_elem_bytes={src_elem_bytes}, prefix_bytes={prefix_bytes})" - ) - return False - return True - - -def main(): - strict = os.getenv("COMPARE_STRICT", "1") != "0" 
- ok = True - ok = compare_bin("golden_v2.bin", "v2.bin", np.float32, 0.0001) and ok - if not ok: - if strict: - print("[ERROR] compare failed") - sys.exit(2) - print("[WARN] compare failed (non-gating)") - return - print("[INFO] compare passed") - - -if __name__ == "__main__": - main() diff --git a/test/vpto/cases/micro-op/unary-vector/vabs-i16-signed/golden.py b/test/vpto/cases/micro-op/unary-vector/vabs-i16-signed/golden.py deleted file mode 100755 index ae05da408..000000000 --- a/test/vpto/cases/micro-op/unary-vector/vabs-i16-signed/golden.py +++ /dev/null @@ -1,51 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) 2026 Huawei Technologies Co., Ltd. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. - -# case: micro-op/unary-vector/vabs-i16-signed -# family: unary-vector -# target_ops: pto.vabs -# scenarios: core-i16-signed, full-mask -# NOTE: bulk-generated coverage skeleton. 
-# coding=utf-8 - -import argparse -from pathlib import Path - -import numpy as np - - -ROWS = 32 -COLS = 32 -SEED = 19 - - -def generate(output_dir: Path, seed: int) -> None: - rng = np.random.default_rng(seed) - v1 = rng.uniform(-8.0, 8.0, size=(ROWS, COLS)).astype(np.float32) - v2 = np.zeros((ROWS, COLS), dtype=np.float32) - golden_v2 = np.abs(v1).astype(np.float32, copy=False) - - output_dir.mkdir(parents=True, exist_ok=True) - v1.reshape(-1).tofile(output_dir / "v1.bin") - v2.reshape(-1).tofile(output_dir / "v2.bin") - golden_v2.reshape(-1).tofile(output_dir / "golden_v2.bin") - - -def main() -> None: - parser = argparse.ArgumentParser( - description="Generate numpy-based inputs/golden for VPTO micro-op vabs validation." - ) - parser.add_argument("--output-dir", type=Path, default=Path(".")) - parser.add_argument("--seed", type=int, default=SEED) - args = parser.parse_args() - generate(args.output_dir, args.seed) - - -if __name__ == "__main__": - main() diff --git a/test/vpto/cases/micro-op/unary-vector/vabs-i16-signed/kernel.pto b/test/vpto/cases/micro-op/unary-vector/vabs-i16-signed/kernel.pto deleted file mode 100644 index 79d01c359..000000000 --- a/test/vpto/cases/micro-op/unary-vector/vabs-i16-signed/kernel.pto +++ /dev/null @@ -1,70 +0,0 @@ -// ----------------------------------------------------------------------------- -// case: micro-op/unary-vector/vabs-i16-signed -// family: unary-vector -// target_ops: pto.vabs -// scenarios: core-i16-signed, full-mask -// NOTE: bulk-generated coverage skeleton. Parser/verifier/lowering failure is -// still a valid test conclusion in the current coverage-first phase. 
-// ----------------------------------------------------------------------------- -// ============================================================================= -// abs_kernel_2d: Element-wise absolute value on a 32x32 f32 tile -// ============================================================================= -// This kernel computes abs(input) for a 32x32 float32 matrix (1024 elements). -// -// Memory Layout: -// - Input: arg0 -> GM (Global Memory) -// - Output: arg1 -> GM (Global Memory) -// - UB (Unified Buffer) at offset 0: input tile (4096 bytes = 32*32*4) -// - UB at offset 4096: output tile (4096 bytes = 32*32*4) -// -// Pipeline: -// 1. DMA: GM -> UB (MTE2 pipe) - copy input tile to UB -// 2. Sync: wait for MTE2 -> V pipe handoff -// 3. Compute: vabs on 64-element vectors (V pipe) - 16 iterations for 1024 elements -// 4. Sync: wait for V -> MTE3 pipe handoff -// 5. DMA: UB -> GM (MTE3 pipe) - copy result tile back to GM -// ============================================================================= - -module attributes {pto.target_arch = "a5", pto.kernel_kind = #pto.kernel_kind} { - func.func @vabs_kernel_2d(%arg0: !pto.ptr, %arg1: !pto.ptr) attributes {pto.kernel} { - %c0 = arith.constant 0 : index - %c1 = arith.constant 1 : index - %c64 = arith.constant 64 : index - %c1024 = arith.constant 1024 : index - %c0_i64 = arith.constant 0 : i64 - %c1_i64 = arith.constant 1 : i64 - %c32_i64 = arith.constant 32 : i64 - %c128_i64 = arith.constant 128 : i64 - %c4096_i64 = arith.constant 4096 : i64 - %c1024_i32 = arith.constant 1024 : i32 - - %ub_in = pto.castptr %c0_i64 : i64 -> !pto.ptr - %ub_out = pto.castptr %c4096_i64 : i64 -> !pto.ptr - - %false = arith.constant false - pto.mte_gm_ub %arg0, %ub_in, %c0_i64, %c128_i64 - nburst(%c32_i64, %c128_i64, %c128_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 - - pto.set_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] - pto.wait_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] - - pto.vecscope { - %_:1 = scf.for %offset = %c0 
to %c1024 step %c64 iter_args(%remaining = %c1024_i32) -> (i32) { - %mask, %next_remaining = pto.plt_b32 %remaining : i32 -> !pto.mask, i32 - %vec = pto.vlds %ub_in[%offset] : !pto.ptr -> !pto.vreg<64xf32> - %out = pto.vabs %vec, %mask : !pto.vreg<64xf32>, !pto.mask -> !pto.vreg<64xf32> - pto.vsts %out, %ub_out[%offset], %mask : !pto.vreg<64xf32>, !pto.ptr, !pto.mask - scf.yield %next_remaining : i32 - } - } - - pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.mte_ub_gm %ub_out, %arg1, %c128_i64 - nburst(%c32_i64, %c128_i64, %c128_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64 - pto.barrier #pto.pipe - return - } -} diff --git a/test/vpto/cases/micro-op/unary-vector/vabs-i16-signed/launch.cpp b/test/vpto/cases/micro-op/unary-vector/vabs-i16-signed/launch.cpp deleted file mode 100644 index 1d4dc5556..000000000 --- a/test/vpto/cases/micro-op/unary-vector/vabs-i16-signed/launch.cpp +++ /dev/null @@ -1,70 +0,0 @@ -// Copyright (c) 2026 Huawei Technologies Co., Ltd. -// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). -// Please refer to the License for details. You may not use this file except in compliance with the License. -// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -// See LICENSE in the root of the software repository for the full text of the License. - -// ----------------------------------------------------------------------------- -// case: micro-op/unary-vector/vabs-i16-signed -// family: unary-vector -// target_ops: pto.vabs -// scenarios: core-i16-signed, full-mask -// NOTE: bulk-generated coverage skeleton. 
Parser/verifier/lowering failure is -// still a valid test conclusion in the current coverage-first phase. -// ----------------------------------------------------------------------------- -// --------------------------------------------------------------------------- -// PTOAS compatibility layer -// -// The upstream pto-isa headers reference some FP8/FP4 types and the -// __VEC_SCOPE__ marker that are not available on every AICore arch/toolchain -// combination (e.g. __NPU_ARCH__==2201). -// -// For our PTOAS-generated kernels we don't rely on these types today, but the -// headers still mention them in templates/static_asserts. Provide minimal -// fallbacks to keep compilation working on dav-c220. -// --------------------------------------------------------------------------- -#ifndef __VEC_SCOPE__ -#define __VEC_SCOPE__ -#endif - -#if defined(__CCE_AICORE__) && defined(__NPU_ARCH__) && (__NPU_ARCH__ == 2201) -typedef struct { unsigned char v; } hifloat8_t; -typedef struct { unsigned char v; } float8_e4m3_t; -typedef struct { unsigned char v; } float8_e5m2_t; -typedef struct { unsigned char v; } float8_e8m0_t; -typedef struct { unsigned char v; } float4_e1m2x2_t; -typedef struct { unsigned char v; } float4_e2m1x2_t; -#endif -#include - -// AICore printf support is gated behind `--cce-enable-print` on some -// toolchains. When enabled, include the CCE print header so `cce::printf` -// resolves in device compilation. -#if defined(__CCE_AICORE__) && defined(PTOAS_ENABLE_CCE_PRINT) -#include -#endif - -// Some PTO-ISA types are only available in the __CCE_AICORE__ compilation -// path, but `bisheng -xcce` still performs a host-side parse pass. -// Provide minimal fallbacks only when the corresponding header wasn't -// pulled in by the selected arch implementation. 
-#if !defined(__CCE_AICORE__) && !defined(TMRGSORT_HPP) -struct MrgSortExecutedNumList { - uint16_t mrgSortList0; - uint16_t mrgSortList1; - uint16_t mrgSortList2; - uint16_t mrgSortList3; -}; -#endif -#ifndef __CPU_SIM -#include "acl/acl.h" -#endif - -extern "C" __global__ [aicore] void vabs_kernel_2d(__gm__ float *v1, - __gm__ float *v2); - -void LaunchVabs_kernel_2d(float *v1, float *v2, void *stream) { - vabs_kernel_2d<<<1, nullptr, stream>>>((__gm__ float *)v1, (__gm__ float *)v2); -} diff --git a/test/vpto/cases/micro-op/unary-vector/vabs-i16-signed/main.cpp b/test/vpto/cases/micro-op/unary-vector/vabs-i16-signed/main.cpp deleted file mode 100644 index 565d8c357..000000000 --- a/test/vpto/cases/micro-op/unary-vector/vabs-i16-signed/main.cpp +++ /dev/null @@ -1,130 +0,0 @@ -// Copyright (c) 2026 Huawei Technologies Co., Ltd. -// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). -// Please refer to the License for details. You may not use this file except in compliance with the License. -// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -// See LICENSE in the root of the software repository for the full text of the License. - -// ----------------------------------------------------------------------------- -// case: micro-op/unary-vector/vabs-i16-signed -// family: unary-vector -// target_ops: pto.vabs -// scenarios: core-i16-signed, full-mask -// NOTE: bulk-generated coverage skeleton. Parser/verifier/lowering failure is -// still a valid test conclusion in the current coverage-first phase. -// ----------------------------------------------------------------------------- -/** -Copyright (c) 2025 Huawei Technologies Co., Ltd. 
-This program is free software, you can redistribute it and/or modify it under the terms and conditions of -CANN Open Software License Agreement Version 2.0 (the "License"). -Please refer to the License for details. You may not use this file except in compliance with the License. -THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -See LICENSE in the root of the software repository for the full text of the License. -*/ - -#include "test_common.h" -#include "acl/acl.h" -#include -#include -#include - -using namespace PtoTestCommon; - -#ifndef TMRGSORT_HPP -struct MrgSortExecutedNumList { - uint16_t mrgSortList0; - uint16_t mrgSortList1; - uint16_t mrgSortList2; - uint16_t mrgSortList3; -}; -#endif - -#define ACL_CHECK(expr) \ - do { \ - const aclError _ret = (expr); \ - if (_ret != ACL_SUCCESS) { \ - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", #expr, (int)_ret, __FILE__, __LINE__); \ - const char *_recent = aclGetRecentErrMsg(); \ - if (_recent != nullptr && _recent[0] != '\0') { \ - std::fprintf(stderr, "[ERROR] RecentErrMsg: %s\n", _recent); \ - } \ - rc = 1; \ - goto cleanup; \ - } \ - } while (0) - -void LaunchVabs_kernel_2d(float *v1, float *v2, void *stream); - -int main() { - size_t elemCount_v1 = 1024; - size_t fileSize_v1 = elemCount_v1 * sizeof(float); - size_t elemCount_v2 = 1024; - size_t fileSize_v2 = elemCount_v2 * sizeof(float); - float *v1Host = nullptr; - float *v1Device = nullptr; - float *v2Host = nullptr; - float *v2Device = nullptr; - - int rc = 0; - bool aclInited = false; - bool deviceSet = false; - int deviceId = 0; - aclrtStream stream = nullptr; - - ACL_CHECK(aclInit(nullptr)); - aclInited = true; - if (const char *envDevice = std::getenv("ACL_DEVICE_ID")) { - deviceId = std::atoi(envDevice); - } - ACL_CHECK(aclrtSetDevice(deviceId)); - deviceSet = true; - 
ACL_CHECK(aclrtCreateStream(&stream)); - - ACL_CHECK(aclrtMallocHost((void **)(&v1Host), fileSize_v1)); - ACL_CHECK(aclrtMallocHost((void **)(&v2Host), fileSize_v2)); - ACL_CHECK(aclrtMalloc((void **)&v1Device, fileSize_v1, ACL_MEM_MALLOC_HUGE_FIRST)); - ACL_CHECK(aclrtMalloc((void **)&v2Device, fileSize_v2, ACL_MEM_MALLOC_HUGE_FIRST)); - - ReadFile("./v1.bin", fileSize_v1, v1Host, fileSize_v1); - ReadFile("./v2.bin", fileSize_v2, v2Host, fileSize_v2); - ACL_CHECK(aclrtMemcpy(v1Device, fileSize_v1, v1Host, fileSize_v1, ACL_MEMCPY_HOST_TO_DEVICE)); - ACL_CHECK(aclrtMemcpy(v2Device, fileSize_v2, v2Host, fileSize_v2, ACL_MEMCPY_HOST_TO_DEVICE)); - LaunchVabs_kernel_2d(v1Device, v2Device, stream); - - ACL_CHECK(aclrtSynchronizeStream(stream)); - ACL_CHECK(aclrtMemcpy(v2Host, fileSize_v2, v2Device, fileSize_v2, ACL_MEMCPY_DEVICE_TO_HOST)); - - WriteFile("./v2.bin", v2Host, fileSize_v2); - -cleanup: - aclrtFree(v1Device); - aclrtFree(v2Device); - aclrtFreeHost(v1Host); - aclrtFreeHost(v2Host); - if (stream != nullptr) { - const aclError _ret = aclrtDestroyStream(stream); - if (_ret != ACL_SUCCESS) { - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", - "aclrtDestroyStream(stream)", (int)_ret, __FILE__, __LINE__); - } - stream = nullptr; - } - if (deviceSet) { - const aclError _ret = aclrtResetDevice(deviceId); - if (_ret != ACL_SUCCESS) { - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", - "aclrtResetDevice(deviceId)", (int)_ret, __FILE__, __LINE__); - } - } - if (aclInited) { - const aclError _ret = aclFinalize(); - if (_ret != ACL_SUCCESS) { - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", - "aclFinalize()", (int)_ret, __FILE__, __LINE__); - } - } - - return rc; -} diff --git a/test/vpto/cases/micro-op/unary-vector/vabs-i16-unsigned/compare.py b/test/vpto/cases/micro-op/unary-vector/vabs-i16-unsigned/compare.py deleted file mode 100755 index a6d5c46f7..000000000 --- a/test/vpto/cases/micro-op/unary-vector/vabs-i16-unsigned/compare.py +++ 
/dev/null @@ -1,209 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) 2026 Huawei Technologies Co., Ltd. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. - -# case: micro-op/unary-vector/vabs-i16-unsigned -# family: unary-vector -# target_ops: pto.vabs -# scenarios: core-i16-unsigned, full-mask -# NOTE: bulk-generated coverage skeleton. -# coding=utf-8 - -import os -import sys -import numpy as np - - -REPEAT_BYTES = 256 - - -def _ceil_div(x, y): - return (x + y - 1) // y - - -def _packed_pred_storage_bytes(logical_elems, src_elem_bytes): - logical_elems = int(logical_elems) - src_elem_bytes = int(src_elem_bytes) - if logical_elems <= 0: - raise ValueError(f"logical_elems must be > 0, got {logical_elems}") - if src_elem_bytes not in (1, 2, 4): - raise ValueError(f"unsupported packed predicate source size: {src_elem_bytes}") - - repeat_elems = REPEAT_BYTES // src_elem_bytes - if src_elem_bytes == 4: - repeat_times = _ceil_div(logical_elems, repeat_elems) + 1 - loop_count = repeat_times // 2 - return loop_count * 16 - - repeat_times = _ceil_div(logical_elems, repeat_elems) - return repeat_times * (repeat_elems // 8) - - -def compare_bin(golden_path, output_path, dtype, eps): - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - dtype_np = np.dtype(dtype) - golden = np.fromfile(golden_path, 
dtype=dtype_np) - output = np.fromfile(output_path, dtype=dtype_np) - if golden.shape != output.shape: - print(f"[ERROR] Shape mismatch: {golden_path} {golden.shape} vs {output_path} {output.shape}") - return False - if not np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True): - if golden.size: - if np.issubdtype(dtype_np, np.floating): - g = golden.astype(np.float64, copy=False) - o = output.astype(np.float64, copy=False) - elif np.issubdtype(dtype_np, np.integer) or np.issubdtype(dtype_np, np.unsignedinteger): - g = golden.astype(np.int64, copy=False) - o = output.astype(np.int64, copy=False) - else: - g = golden.astype(np.float64, copy=False) - o = output.astype(np.float64, copy=False) - abs_diff = np.abs(g - o) - idx = int(np.argmax(abs_diff)) - diff = float(abs_diff[idx]) - print( - f"[ERROR] Mismatch: {golden_path} vs {output_path}, max diff={diff} at idx={idx} " - f"(golden={g[idx]}, out={o[idx]}, dtype={dtype_np})" - ) - else: - print(f"[ERROR] Mismatch: {golden_path} vs {output_path}, empty buffers, dtype={dtype_np}") - return False - return True - - -def compare_bin_prefix(golden_path, output_path, dtype, eps, count): - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - try: - count = int(count) - except Exception: - print(f"[ERROR] Invalid prefix count: {count}") - return False - if count <= 0: - print(f"[ERROR] Invalid prefix count: {count}") - return False - - dtype_np = np.dtype(dtype) - golden = np.fromfile(golden_path, dtype=dtype_np, count=count) - output = np.fromfile(output_path, dtype=dtype_np, count=count) - - if golden.size != count or output.size != count: - print( - f"[ERROR] Prefix read too small: need={count} elems, " - f"golden={golden.size}, out={output.size}" - ) - return False - - if not np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True): - if golden.size: - 
if np.issubdtype(dtype_np, np.floating): - g = golden.astype(np.float64, copy=False) - o = output.astype(np.float64, copy=False) - elif np.issubdtype(dtype_np, np.integer) or np.issubdtype(dtype_np, np.unsignedinteger): - g = golden.astype(np.int64, copy=False) - o = output.astype(np.int64, copy=False) - else: - g = golden.astype(np.float64, copy=False) - o = output.astype(np.float64, copy=False) - abs_diff = np.abs(g - o) - idx = int(np.argmax(abs_diff)) - diff = float(abs_diff[idx]) - print( - f"[ERROR] Mismatch (prefix): {golden_path} vs {output_path}, max diff={diff} at idx={idx} " - f"(golden={g[idx]}, out={o[idx]}, dtype={dtype_np}, count={count})" - ) - else: - print(f"[ERROR] Mismatch (prefix): {golden_path} vs {output_path}, empty buffers, dtype={dtype_np}") - return False - return True - - -def compare_packed_pred_mask(golden_path, output_path, logical_elems, src_elem_bytes): - """ - Compare outputs of pto.tcmp / pto.tcmps. - - PTO-ISA stores packed predicate results as a linear PK byte stream via - `psts`, with the exact written prefix length determined by the typed - TCMP/TCMPS repeat schedule. Compare only that semantic prefix. 
- """ - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - try: - logical_elems = int(logical_elems) - src_elem_bytes = int(src_elem_bytes) - except Exception: - print( - "[ERROR] Invalid packed mask compare arguments: " - f"logical_elems={logical_elems} src_elem_bytes={src_elem_bytes}" - ) - return False - if logical_elems <= 0 or src_elem_bytes <= 0: - print( - "[ERROR] Invalid packed mask compare arguments: " - f"logical_elems={logical_elems} src_elem_bytes={src_elem_bytes}" - ) - return False - - golden = np.fromfile(golden_path, dtype=np.uint8) - output = np.fromfile(output_path, dtype=np.uint8) - try: - prefix_bytes = _packed_pred_storage_bytes(logical_elems, src_elem_bytes) - except ValueError as exc: - print(f"[ERROR] {exc}") - return False - - if golden.size < prefix_bytes or output.size < prefix_bytes: - print( - f"[ERROR] Packed mask buffer too small: need={prefix_bytes} bytes, " - f"golden={golden.size}, out={output.size}" - ) - return False - - golden_sel = golden[:prefix_bytes] - output_sel = output[:prefix_bytes] - - if not np.array_equal(golden_sel, output_sel): - diff = np.nonzero(golden_sel != output_sel)[0] - idx = int(diff[0]) if diff.size else 0 - print( - f"[ERROR] Mismatch (packed mask): {golden_path} vs {output_path}, first diff at idx={idx} " - f"(golden={int(golden_sel[idx])}, out={int(output_sel[idx])}, " - f"logical_elems={logical_elems}, src_elem_bytes={src_elem_bytes}, prefix_bytes={prefix_bytes})" - ) - return False - return True - - -def main(): - strict = os.getenv("COMPARE_STRICT", "1") != "0" - ok = True - ok = compare_bin("golden_v2.bin", "v2.bin", np.float32, 0.0001) and ok - if not ok: - if strict: - print("[ERROR] compare failed") - sys.exit(2) - print("[WARN] compare failed (non-gating)") - return - print("[INFO] compare passed") - - -if __name__ == "__main__": - main() 
diff --git a/test/vpto/cases/micro-op/unary-vector/vabs-i16-unsigned/golden.py b/test/vpto/cases/micro-op/unary-vector/vabs-i16-unsigned/golden.py deleted file mode 100755 index 6f75723de..000000000 --- a/test/vpto/cases/micro-op/unary-vector/vabs-i16-unsigned/golden.py +++ /dev/null @@ -1,51 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) 2026 Huawei Technologies Co., Ltd. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. - -# case: micro-op/unary-vector/vabs-i16-unsigned -# family: unary-vector -# target_ops: pto.vabs -# scenarios: core-i16-unsigned, full-mask -# NOTE: bulk-generated coverage skeleton. -# coding=utf-8 - -import argparse -from pathlib import Path - -import numpy as np - - -ROWS = 32 -COLS = 32 -SEED = 19 - - -def generate(output_dir: Path, seed: int) -> None: - rng = np.random.default_rng(seed) - v1 = rng.uniform(-8.0, 8.0, size=(ROWS, COLS)).astype(np.float32) - v2 = np.zeros((ROWS, COLS), dtype=np.float32) - golden_v2 = np.abs(v1).astype(np.float32, copy=False) - - output_dir.mkdir(parents=True, exist_ok=True) - v1.reshape(-1).tofile(output_dir / "v1.bin") - v2.reshape(-1).tofile(output_dir / "v2.bin") - golden_v2.reshape(-1).tofile(output_dir / "golden_v2.bin") - - -def main() -> None: - parser = argparse.ArgumentParser( - description="Generate numpy-based inputs/golden for VPTO micro-op vabs validation." 
- ) - parser.add_argument("--output-dir", type=Path, default=Path(".")) - parser.add_argument("--seed", type=int, default=SEED) - args = parser.parse_args() - generate(args.output_dir, args.seed) - - -if __name__ == "__main__": - main() diff --git a/test/vpto/cases/micro-op/unary-vector/vabs-i16-unsigned/kernel.pto b/test/vpto/cases/micro-op/unary-vector/vabs-i16-unsigned/kernel.pto deleted file mode 100644 index d68983556..000000000 --- a/test/vpto/cases/micro-op/unary-vector/vabs-i16-unsigned/kernel.pto +++ /dev/null @@ -1,70 +0,0 @@ -// ----------------------------------------------------------------------------- -// case: micro-op/unary-vector/vabs-i16-unsigned -// family: unary-vector -// target_ops: pto.vabs -// scenarios: core-i16-unsigned, full-mask -// NOTE: bulk-generated coverage skeleton. Parser/verifier/lowering failure is -// still a valid test conclusion in the current coverage-first phase. -// ----------------------------------------------------------------------------- -// ============================================================================= -// abs_kernel_2d: Element-wise absolute value on a 32x32 f32 tile -// ============================================================================= -// This kernel computes abs(input) for a 32x32 float32 matrix (1024 elements). -// -// Memory Layout: -// - Input: arg0 -> GM (Global Memory) -// - Output: arg1 -> GM (Global Memory) -// - UB (Unified Buffer) at offset 0: input tile (4096 bytes = 32*32*4) -// - UB at offset 4096: output tile (4096 bytes = 32*32*4) -// -// Pipeline: -// 1. DMA: GM -> UB (MTE2 pipe) - copy input tile to UB -// 2. Sync: wait for MTE2 -> V pipe handoff -// 3. Compute: vabs on 64-element vectors (V pipe) - 16 iterations for 1024 elements -// 4. Sync: wait for V -> MTE3 pipe handoff -// 5. 
DMA: UB -> GM (MTE3 pipe) - copy result tile back to GM -// ============================================================================= - -module attributes {pto.target_arch = "a5", pto.kernel_kind = #pto.kernel_kind} { - func.func @vabs_kernel_2d(%arg0: !pto.ptr, %arg1: !pto.ptr) attributes {pto.kernel} { - %c0 = arith.constant 0 : index - %c1 = arith.constant 1 : index - %c64 = arith.constant 64 : index - %c1024 = arith.constant 1024 : index - %c0_i64 = arith.constant 0 : i64 - %c1_i64 = arith.constant 1 : i64 - %c32_i64 = arith.constant 32 : i64 - %c128_i64 = arith.constant 128 : i64 - %c4096_i64 = arith.constant 4096 : i64 - %c1024_i32 = arith.constant 1024 : i32 - - %ub_in = pto.castptr %c0_i64 : i64 -> !pto.ptr - %ub_out = pto.castptr %c4096_i64 : i64 -> !pto.ptr - - %false = arith.constant false - pto.mte_gm_ub %arg0, %ub_in, %c0_i64, %c128_i64 - nburst(%c32_i64, %c128_i64, %c128_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 - - pto.set_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] - pto.wait_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] - - pto.vecscope { - %_:1 = scf.for %offset = %c0 to %c1024 step %c64 iter_args(%remaining = %c1024_i32) -> (i32) { - %mask, %next_remaining = pto.plt_b32 %remaining : i32 -> !pto.mask, i32 - %vec = pto.vlds %ub_in[%offset] : !pto.ptr -> !pto.vreg<64xf32> - %out = pto.vabs %vec, %mask : !pto.vreg<64xf32>, !pto.mask -> !pto.vreg<64xf32> - pto.vsts %out, %ub_out[%offset], %mask : !pto.vreg<64xf32>, !pto.ptr, !pto.mask - scf.yield %next_remaining : i32 - } - } - - pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.mte_ub_gm %ub_out, %arg1, %c128_i64 - nburst(%c32_i64, %c128_i64, %c128_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64 - pto.barrier #pto.pipe - return - } -} diff --git a/test/vpto/cases/micro-op/unary-vector/vabs-i16-unsigned/launch.cpp b/test/vpto/cases/micro-op/unary-vector/vabs-i16-unsigned/launch.cpp deleted file mode 100644 index 8ae1a4350..000000000 --- 
a/test/vpto/cases/micro-op/unary-vector/vabs-i16-unsigned/launch.cpp +++ /dev/null @@ -1,70 +0,0 @@ -// Copyright (c) 2026 Huawei Technologies Co., Ltd. -// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). -// Please refer to the License for details. You may not use this file except in compliance with the License. -// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -// See LICENSE in the root of the software repository for the full text of the License. - -// ----------------------------------------------------------------------------- -// case: micro-op/unary-vector/vabs-i16-unsigned -// family: unary-vector -// target_ops: pto.vabs -// scenarios: core-i16-unsigned, full-mask -// NOTE: bulk-generated coverage skeleton. Parser/verifier/lowering failure is -// still a valid test conclusion in the current coverage-first phase. -// ----------------------------------------------------------------------------- -// --------------------------------------------------------------------------- -// PTOAS compatibility layer -// -// The upstream pto-isa headers reference some FP8/FP4 types and the -// __VEC_SCOPE__ marker that are not available on every AICore arch/toolchain -// combination (e.g. __NPU_ARCH__==2201). -// -// For our PTOAS-generated kernels we don't rely on these types today, but the -// headers still mention them in templates/static_asserts. Provide minimal -// fallbacks to keep compilation working on dav-c220. 
-// --------------------------------------------------------------------------- -#ifndef __VEC_SCOPE__ -#define __VEC_SCOPE__ -#endif - -#if defined(__CCE_AICORE__) && defined(__NPU_ARCH__) && (__NPU_ARCH__ == 2201) -typedef struct { unsigned char v; } hifloat8_t; -typedef struct { unsigned char v; } float8_e4m3_t; -typedef struct { unsigned char v; } float8_e5m2_t; -typedef struct { unsigned char v; } float8_e8m0_t; -typedef struct { unsigned char v; } float4_e1m2x2_t; -typedef struct { unsigned char v; } float4_e2m1x2_t; -#endif -#include - -// AICore printf support is gated behind `--cce-enable-print` on some -// toolchains. When enabled, include the CCE print header so `cce::printf` -// resolves in device compilation. -#if defined(__CCE_AICORE__) && defined(PTOAS_ENABLE_CCE_PRINT) -#include -#endif - -// Some PTO-ISA types are only available in the __CCE_AICORE__ compilation -// path, but `bisheng -xcce` still performs a host-side parse pass. -// Provide minimal fallbacks only when the corresponding header wasn't -// pulled in by the selected arch implementation. -#if !defined(__CCE_AICORE__) && !defined(TMRGSORT_HPP) -struct MrgSortExecutedNumList { - uint16_t mrgSortList0; - uint16_t mrgSortList1; - uint16_t mrgSortList2; - uint16_t mrgSortList3; -}; -#endif -#ifndef __CPU_SIM -#include "acl/acl.h" -#endif - -extern "C" __global__ [aicore] void vabs_kernel_2d(__gm__ float *v1, - __gm__ float *v2); - -void LaunchVabs_kernel_2d(float *v1, float *v2, void *stream) { - vabs_kernel_2d<<<1, nullptr, stream>>>((__gm__ float *)v1, (__gm__ float *)v2); -} diff --git a/test/vpto/cases/micro-op/unary-vector/vabs-i16-unsigned/main.cpp b/test/vpto/cases/micro-op/unary-vector/vabs-i16-unsigned/main.cpp deleted file mode 100644 index d54791913..000000000 --- a/test/vpto/cases/micro-op/unary-vector/vabs-i16-unsigned/main.cpp +++ /dev/null @@ -1,130 +0,0 @@ -// Copyright (c) 2026 Huawei Technologies Co., Ltd. 
-// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). -// Please refer to the License for details. You may not use this file except in compliance with the License. -// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -// See LICENSE in the root of the software repository for the full text of the License. - -// ----------------------------------------------------------------------------- -// case: micro-op/unary-vector/vabs-i16-unsigned -// family: unary-vector -// target_ops: pto.vabs -// scenarios: core-i16-unsigned, full-mask -// NOTE: bulk-generated coverage skeleton. Parser/verifier/lowering failure is -// still a valid test conclusion in the current coverage-first phase. -// ----------------------------------------------------------------------------- -/** -Copyright (c) 2025 Huawei Technologies Co., Ltd. -This program is free software, you can redistribute it and/or modify it under the terms and conditions of -CANN Open Software License Agreement Version 2.0 (the "License"). -Please refer to the License for details. You may not use this file except in compliance with the License. -THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -See LICENSE in the root of the software repository for the full text of the License. 
-*/ - -#include "test_common.h" -#include "acl/acl.h" -#include -#include -#include - -using namespace PtoTestCommon; - -#ifndef TMRGSORT_HPP -struct MrgSortExecutedNumList { - uint16_t mrgSortList0; - uint16_t mrgSortList1; - uint16_t mrgSortList2; - uint16_t mrgSortList3; -}; -#endif - -#define ACL_CHECK(expr) \ - do { \ - const aclError _ret = (expr); \ - if (_ret != ACL_SUCCESS) { \ - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", #expr, (int)_ret, __FILE__, __LINE__); \ - const char *_recent = aclGetRecentErrMsg(); \ - if (_recent != nullptr && _recent[0] != '\0') { \ - std::fprintf(stderr, "[ERROR] RecentErrMsg: %s\n", _recent); \ - } \ - rc = 1; \ - goto cleanup; \ - } \ - } while (0) - -void LaunchVabs_kernel_2d(float *v1, float *v2, void *stream); - -int main() { - size_t elemCount_v1 = 1024; - size_t fileSize_v1 = elemCount_v1 * sizeof(float); - size_t elemCount_v2 = 1024; - size_t fileSize_v2 = elemCount_v2 * sizeof(float); - float *v1Host = nullptr; - float *v1Device = nullptr; - float *v2Host = nullptr; - float *v2Device = nullptr; - - int rc = 0; - bool aclInited = false; - bool deviceSet = false; - int deviceId = 0; - aclrtStream stream = nullptr; - - ACL_CHECK(aclInit(nullptr)); - aclInited = true; - if (const char *envDevice = std::getenv("ACL_DEVICE_ID")) { - deviceId = std::atoi(envDevice); - } - ACL_CHECK(aclrtSetDevice(deviceId)); - deviceSet = true; - ACL_CHECK(aclrtCreateStream(&stream)); - - ACL_CHECK(aclrtMallocHost((void **)(&v1Host), fileSize_v1)); - ACL_CHECK(aclrtMallocHost((void **)(&v2Host), fileSize_v2)); - ACL_CHECK(aclrtMalloc((void **)&v1Device, fileSize_v1, ACL_MEM_MALLOC_HUGE_FIRST)); - ACL_CHECK(aclrtMalloc((void **)&v2Device, fileSize_v2, ACL_MEM_MALLOC_HUGE_FIRST)); - - ReadFile("./v1.bin", fileSize_v1, v1Host, fileSize_v1); - ReadFile("./v2.bin", fileSize_v2, v2Host, fileSize_v2); - ACL_CHECK(aclrtMemcpy(v1Device, fileSize_v1, v1Host, fileSize_v1, ACL_MEMCPY_HOST_TO_DEVICE)); - ACL_CHECK(aclrtMemcpy(v2Device, 
fileSize_v2, v2Host, fileSize_v2, ACL_MEMCPY_HOST_TO_DEVICE)); - LaunchVabs_kernel_2d(v1Device, v2Device, stream); - - ACL_CHECK(aclrtSynchronizeStream(stream)); - ACL_CHECK(aclrtMemcpy(v2Host, fileSize_v2, v2Device, fileSize_v2, ACL_MEMCPY_DEVICE_TO_HOST)); - - WriteFile("./v2.bin", v2Host, fileSize_v2); - -cleanup: - aclrtFree(v1Device); - aclrtFree(v2Device); - aclrtFreeHost(v1Host); - aclrtFreeHost(v2Host); - if (stream != nullptr) { - const aclError _ret = aclrtDestroyStream(stream); - if (_ret != ACL_SUCCESS) { - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", - "aclrtDestroyStream(stream)", (int)_ret, __FILE__, __LINE__); - } - stream = nullptr; - } - if (deviceSet) { - const aclError _ret = aclrtResetDevice(deviceId); - if (_ret != ACL_SUCCESS) { - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", - "aclrtResetDevice(deviceId)", (int)_ret, __FILE__, __LINE__); - } - } - if (aclInited) { - const aclError _ret = aclFinalize(); - if (_ret != ACL_SUCCESS) { - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", - "aclFinalize()", (int)_ret, __FILE__, __LINE__); - } - } - - return rc; -} diff --git a/test/vpto/cases/micro-op/unary-vector/vabs-loop-carried-vreg/compare.py b/test/vpto/cases/micro-op/unary-vector/vabs-loop-carried-vreg/compare.py deleted file mode 100644 index 6098dd82c..000000000 --- a/test/vpto/cases/micro-op/unary-vector/vabs-loop-carried-vreg/compare.py +++ /dev/null @@ -1,198 +0,0 @@ -#!/usr/bin/python3 -# Copyright (c) 2026 Huawei Technologies Co., Ltd. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. 
-# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. - -# coding=utf-8 - -import os -import sys -import numpy as np - - -REPEAT_BYTES = 256 -ACTIVE_ELEMS = 1000 - - -def _ceil_div(x, y): - return (x + y - 1) // y - - -def _packed_pred_storage_bytes(logical_elems, src_elem_bytes): - logical_elems = int(logical_elems) - src_elem_bytes = int(src_elem_bytes) - if logical_elems <= 0: - raise ValueError(f"logical_elems must be > 0, got {logical_elems}") - if src_elem_bytes not in (1, 2, 4): - raise ValueError(f"unsupported packed predicate source size: {src_elem_bytes}") - - repeat_elems = REPEAT_BYTES // src_elem_bytes - if src_elem_bytes == 4: - repeat_times = _ceil_div(logical_elems, repeat_elems) + 1 - loop_count = repeat_times // 2 - return loop_count * 16 - - repeat_times = _ceil_div(logical_elems, repeat_elems) - return repeat_times * (repeat_elems // 8) - - -def compare_bin(golden_path, output_path, dtype, eps): - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - dtype_np = np.dtype(dtype) - golden = np.fromfile(golden_path, dtype=dtype_np) - output = np.fromfile(output_path, dtype=dtype_np) - if golden.shape != output.shape: - print(f"[ERROR] Shape mismatch: {golden_path} {golden.shape} vs {output_path} {output.shape}") - return False - if not np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True): - if golden.size: - if np.issubdtype(dtype_np, np.floating): - g = golden.astype(np.float64, copy=False) - o = output.astype(np.float64, copy=False) - elif np.issubdtype(dtype_np, np.integer) or np.issubdtype(dtype_np, np.unsignedinteger): - g = 
golden.astype(np.int64, copy=False) - o = output.astype(np.int64, copy=False) - else: - g = golden.astype(np.float64, copy=False) - o = output.astype(np.float64, copy=False) - abs_diff = np.abs(g - o) - idx = int(np.argmax(abs_diff)) - diff = float(abs_diff[idx]) - print( - f"[ERROR] Mismatch: {golden_path} vs {output_path}, max diff={diff} at idx={idx} " - f"(golden={g[idx]}, out={o[idx]}, dtype={dtype_np})" - ) - else: - print(f"[ERROR] Mismatch: {golden_path} vs {output_path}, empty buffers, dtype={dtype_np}") - return False - return True - - -def compare_bin_prefix(golden_path, output_path, dtype, eps, count): - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - try: - count = int(count) - except Exception: - print(f"[ERROR] Invalid prefix count: {count}") - return False - if count <= 0: - print(f"[ERROR] Invalid prefix count: {count}") - return False - - dtype_np = np.dtype(dtype) - golden = np.fromfile(golden_path, dtype=dtype_np, count=count) - output = np.fromfile(output_path, dtype=dtype_np, count=count) - - if golden.size != count or output.size != count: - print( - f"[ERROR] Prefix read too small: need={count} elems, " - f"golden={golden.size}, out={output.size}" - ) - return False - - if not np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True): - if golden.size: - if np.issubdtype(dtype_np, np.floating): - g = golden.astype(np.float64, copy=False) - o = output.astype(np.float64, copy=False) - elif np.issubdtype(dtype_np, np.integer) or np.issubdtype(dtype_np, np.unsignedinteger): - g = golden.astype(np.int64, copy=False) - o = output.astype(np.int64, copy=False) - else: - g = golden.astype(np.float64, copy=False) - o = output.astype(np.float64, copy=False) - abs_diff = np.abs(g - o) - idx = int(np.argmax(abs_diff)) - diff = float(abs_diff[idx]) - print( - f"[ERROR] Mismatch (prefix): 
{golden_path} vs {output_path}, max diff={diff} at idx={idx} " - f"(golden={g[idx]}, out={o[idx]}, dtype={dtype_np}, count={count})" - ) - else: - print(f"[ERROR] Mismatch (prefix): {golden_path} vs {output_path}, empty buffers, dtype={dtype_np}") - return False - return True - - -def compare_packed_pred_mask(golden_path, output_path, logical_elems, src_elem_bytes): - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - try: - logical_elems = int(logical_elems) - src_elem_bytes = int(src_elem_bytes) - except Exception: - print( - "[ERROR] Invalid packed mask compare arguments: " - f"logical_elems={logical_elems} src_elem_bytes={src_elem_bytes}" - ) - return False - if logical_elems <= 0 or src_elem_bytes <= 0: - print( - "[ERROR] Invalid packed mask compare arguments: " - f"logical_elems={logical_elems} src_elem_bytes={src_elem_bytes}" - ) - return False - - golden = np.fromfile(golden_path, dtype=np.uint8) - output = np.fromfile(output_path, dtype=np.uint8) - try: - prefix_bytes = _packed_pred_storage_bytes(logical_elems, src_elem_bytes) - except ValueError as exc: - print(f"[ERROR] {exc}") - return False - - if golden.size < prefix_bytes or output.size < prefix_bytes: - print( - f"[ERROR] Packed mask buffer too small: need={prefix_bytes} bytes, " - f"golden={golden.size}, out={output.size}" - ) - return False - - golden_sel = golden[:prefix_bytes] - output_sel = output[:prefix_bytes] - - if not np.array_equal(golden_sel, output_sel): - diff = np.nonzero(golden_sel != output_sel)[0] - idx = int(diff[0]) if diff.size else 0 - print( - f"[ERROR] Mismatch (packed mask): {golden_path} vs {output_path}, first diff at idx={idx} " - f"(golden={int(golden_sel[idx])}, out={int(output_sel[idx])}, " - f"logical_elems={logical_elems}, src_elem_bytes={src_elem_bytes}, prefix_bytes={prefix_bytes})" - ) - return False - return 
True - - -def main(): - strict = os.getenv("COMPARE_STRICT", "1") != "0" - ok = True - ok = compare_bin_prefix("golden_v2.bin", "v2.bin", np.float32, 0.0001, ACTIVE_ELEMS) and ok - if not ok: - if strict: - print("[ERROR] compare failed") - sys.exit(2) - print("[WARN] compare failed (non-gating)") - return - print("[INFO] compare passed") - - -if __name__ == "__main__": - main() diff --git a/test/vpto/cases/micro-op/unary-vector/vabs-loop-carried-vreg/golden.py b/test/vpto/cases/micro-op/unary-vector/vabs-loop-carried-vreg/golden.py deleted file mode 100644 index 7448a6a1c..000000000 --- a/test/vpto/cases/micro-op/unary-vector/vabs-loop-carried-vreg/golden.py +++ /dev/null @@ -1,74 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) 2026 Huawei Technologies Co., Ltd. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. 
- -# coding=utf-8 - -import argparse -from pathlib import Path - -import numpy as np - - -ROWS = 32 -COLS = 32 -ACTIVE_ELEMS = 1000 -SEED = 19 - - -def generate(output_dir: Path, seed: int) -> None: - rng = np.random.default_rng(seed) - v1 = rng.uniform(-8.0, 8.0, size=(ROWS, COLS)).astype(np.float32) - edge_values = np.array( - [ - -0.0, - 0.0, - -1.0, - 1.0, - -8.0, - 8.0, - -1.0e-30, - 1.0e-30, - -1.0e10, - 1.0e10, - -3.5, - 3.5, - -7.25, - 7.25, - -2.0, - 2.0, - ], - dtype=np.float32, - ) - v1.reshape(-1)[: edge_values.size] = edge_values - v2 = np.zeros((ROWS, COLS), dtype=np.float32) - golden_v2 = np.zeros((ROWS, COLS), dtype=np.float32) - flat_v1 = v1.reshape(-1) - flat_golden_v2 = golden_v2.reshape(-1) - flat_golden_v2[:ACTIVE_ELEMS] = np.abs(flat_v1[:ACTIVE_ELEMS]).astype( - np.float32, copy=False - ) - - output_dir.mkdir(parents=True, exist_ok=True) - v1.reshape(-1).tofile(output_dir / "v1.bin") - v2.reshape(-1).tofile(output_dir / "v2.bin") - golden_v2.reshape(-1).tofile(output_dir / "golden_v2.bin") - - -def main() -> None: - parser = argparse.ArgumentParser( - description="Generate numpy-based inputs/golden for VPTO micro-op vabs loop-carried vreg validation." 
- ) - parser.add_argument("--output-dir", type=Path, default=Path(".")) - parser.add_argument("--seed", type=int, default=SEED) - args = parser.parse_args() - generate(args.output_dir, args.seed) - - -if __name__ == "__main__": - main() diff --git a/test/vpto/cases/micro-op/unary-vector/vabs-loop-carried-vreg/kernel.pto b/test/vpto/cases/micro-op/unary-vector/vabs-loop-carried-vreg/kernel.pto deleted file mode 100644 index 476939d7a..000000000 --- a/test/vpto/cases/micro-op/unary-vector/vabs-loop-carried-vreg/kernel.pto +++ /dev/null @@ -1,50 +0,0 @@ -module attributes {pto.target_arch = "a5", pto.kernel_kind = #pto.kernel_kind} { - func.func @vabs_loop_carried_vreg_kernel_2d(%arg0: !pto.ptr, - %arg1: !pto.ptr) attributes {pto.kernel} { - %c0 = arith.constant 0 : index - %c1 = arith.constant 1 : index - %c2 = arith.constant 2 : index - %c64 = arith.constant 64 : index - %c1000 = arith.constant 1000 : index - %c1024 = arith.constant 1024 : index - %c0_i64 = arith.constant 0 : i64 - %c1_i64 = arith.constant 1 : i64 - %c32_i64 = arith.constant 32 : i64 - %c128_i64 = arith.constant 128 : i64 - %c4096_i64 = arith.constant 4096 : i64 - %c1000_i32 = arith.constant 1000 : i32 - - %ub_in = pto.castptr %c0_i64 : i64 -> !pto.ptr - %ub_out = pto.castptr %c4096_i64 : i64 -> !pto.ptr - - %false = arith.constant false - pto.mte_gm_ub %arg0, %ub_in, %c0_i64, %c128_i64 - nburst(%c32_i64, %c128_i64, %c128_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 - - pto.set_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] - pto.wait_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] - - pto.vecscope { - %_:1 = scf.for %offset = %c0 to %c1000 step %c64 iter_args(%remaining = %c1000_i32) -> (i32) { - %mask, %next_remaining = pto.plt_b32 %remaining : i32 -> !pto.mask, i32 - %vec = pto.vlds %ub_in[%offset] : !pto.ptr -> !pto.vreg<64xf32> - %out = scf.for %iter = %c0 to %c2 step %c1 - iter_args(%carry = %vec) -> (!pto.vreg<64xf32>) { - %abs = pto.vabs %carry, %mask : !pto.vreg<64xf32>, !pto.mask -> 
!pto.vreg<64xf32> - scf.yield %abs : !pto.vreg<64xf32> - } - pto.vsts %out, %ub_out[%offset], %mask : !pto.vreg<64xf32>, !pto.ptr, !pto.mask - scf.yield %next_remaining : i32 - } - } - - pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.mte_ub_gm %ub_out, %arg1, %c128_i64 - nburst(%c32_i64, %c128_i64, %c128_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64 - pto.barrier #pto.pipe - return - } -} diff --git a/test/vpto/cases/micro-op/unary-vector/vabs-loop-carried-vreg/launch.cpp b/test/vpto/cases/micro-op/unary-vector/vabs-loop-carried-vreg/launch.cpp deleted file mode 100644 index 2663b7625..000000000 --- a/test/vpto/cases/micro-op/unary-vector/vabs-loop-carried-vreg/launch.cpp +++ /dev/null @@ -1,48 +0,0 @@ -// Copyright (c) 2026 Huawei Technologies Co., Ltd. -// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). -// Please refer to the License for details. You may not use this file except in compliance with the License. -// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -// See LICENSE in the root of the software repository for the full text of the License. 
- -// --------------------------------------------------------------------------- -// PTOAS compatibility layer -// --------------------------------------------------------------------------- -#ifndef __VEC_SCOPE__ -#define __VEC_SCOPE__ -#endif - -#if defined(__CCE_AICORE__) && defined(__NPU_ARCH__) && (__NPU_ARCH__ == 2201) -typedef struct { unsigned char v; } hifloat8_t; -typedef struct { unsigned char v; } float8_e4m3_t; -typedef struct { unsigned char v; } float8_e5m2_t; -typedef struct { unsigned char v; } float8_e8m0_t; -typedef struct { unsigned char v; } float4_e1m2x2_t; -typedef struct { unsigned char v; } float4_e2m1x2_t; -#endif -#include - -#if defined(__CCE_AICORE__) && defined(PTOAS_ENABLE_CCE_PRINT) -#include -#endif - -#if !defined(__CCE_AICORE__) && !defined(TMRGSORT_HPP) -struct MrgSortExecutedNumList { - uint16_t mrgSortList0; - uint16_t mrgSortList1; - uint16_t mrgSortList2; - uint16_t mrgSortList3; -}; -#endif -#ifndef __CPU_SIM -#include "acl/acl.h" -#endif - -extern "C" __global__ [aicore] void -vabs_loop_carried_vreg_kernel_2d(__gm__ float *v1, __gm__ float *v2); - -void LaunchVabs_loop_carried_vreg_kernel_2d(float *v1, float *v2, void *stream) { - vabs_loop_carried_vreg_kernel_2d<<<1, nullptr, stream>>>( - (__gm__ float *)v1, (__gm__ float *)v2); -} diff --git a/test/vpto/cases/micro-op/unary-vector/vabs-loop-carried-vreg/main.cpp b/test/vpto/cases/micro-op/unary-vector/vabs-loop-carried-vreg/main.cpp deleted file mode 100644 index 4d4bd221b..000000000 --- a/test/vpto/cases/micro-op/unary-vector/vabs-loop-carried-vreg/main.cpp +++ /dev/null @@ -1,122 +0,0 @@ -// Copyright (c) 2026 Huawei Technologies Co., Ltd. -// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). -// Please refer to the License for details. You may not use this file except in compliance with the License. 
-// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -// See LICENSE in the root of the software repository for the full text of the License. - -/** -Copyright (c) 2025 Huawei Technologies Co., Ltd. -This program is free software, you can redistribute it and/or modify it under the terms and conditions of -CANN Open Software License Agreement Version 2.0 (the "License"). -Please refer to the License for details. You may not use this file except in compliance with the License. -THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -See LICENSE in the root of the software repository for the full text of the License. -*/ - -#include "test_common.h" -#include "acl/acl.h" -#include -#include -#include - -using namespace PtoTestCommon; - -#ifndef TMRGSORT_HPP -struct MrgSortExecutedNumList { - uint16_t mrgSortList0; - uint16_t mrgSortList1; - uint16_t mrgSortList2; - uint16_t mrgSortList3; -}; -#endif - -#define ACL_CHECK(expr) \ - do { \ - const aclError _ret = (expr); \ - if (_ret != ACL_SUCCESS) { \ - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", #expr, (int)_ret, __FILE__, __LINE__); \ - const char *_recent = aclGetRecentErrMsg(); \ - if (_recent != nullptr && _recent[0] != '\0') { \ - std::fprintf(stderr, "[ERROR] RecentErrMsg: %s\n", _recent); \ - } \ - rc = 1; \ - goto cleanup; \ - } \ - } while (0) - -void LaunchVabs_loop_carried_vreg_kernel_2d(float *v1, float *v2, void *stream); - -int main() { - size_t elemCount_v1 = 1024; - size_t fileSize_v1 = elemCount_v1 * sizeof(float); - size_t elemCount_v2 = 1024; - size_t fileSize_v2 = elemCount_v2 * sizeof(float); - float *v1Host = nullptr; - float *v1Device = nullptr; - float *v2Host = nullptr; - 
float *v2Device = nullptr; - - int rc = 0; - bool aclInited = false; - bool deviceSet = false; - int deviceId = 0; - aclrtStream stream = nullptr; - - ACL_CHECK(aclInit(nullptr)); - aclInited = true; - if (const char *envDevice = std::getenv("ACL_DEVICE_ID")) { - deviceId = std::atoi(envDevice); - } - ACL_CHECK(aclrtSetDevice(deviceId)); - deviceSet = true; - ACL_CHECK(aclrtCreateStream(&stream)); - - ACL_CHECK(aclrtMallocHost((void **)(&v1Host), fileSize_v1)); - ACL_CHECK(aclrtMallocHost((void **)(&v2Host), fileSize_v2)); - ACL_CHECK(aclrtMalloc((void **)&v1Device, fileSize_v1, ACL_MEM_MALLOC_HUGE_FIRST)); - ACL_CHECK(aclrtMalloc((void **)&v2Device, fileSize_v2, ACL_MEM_MALLOC_HUGE_FIRST)); - - ReadFile("./v1.bin", fileSize_v1, v1Host, fileSize_v1); - ReadFile("./v2.bin", fileSize_v2, v2Host, fileSize_v2); - ACL_CHECK(aclrtMemcpy(v1Device, fileSize_v1, v1Host, fileSize_v1, ACL_MEMCPY_HOST_TO_DEVICE)); - ACL_CHECK(aclrtMemcpy(v2Device, fileSize_v2, v2Host, fileSize_v2, ACL_MEMCPY_HOST_TO_DEVICE)); - LaunchVabs_loop_carried_vreg_kernel_2d(v1Device, v2Device, stream); - - ACL_CHECK(aclrtSynchronizeStream(stream)); - ACL_CHECK(aclrtMemcpy(v2Host, fileSize_v2, v2Device, fileSize_v2, ACL_MEMCPY_DEVICE_TO_HOST)); - - WriteFile("./v2.bin", v2Host, fileSize_v2); - -cleanup: - aclrtFree(v1Device); - aclrtFree(v2Device); - aclrtFreeHost(v1Host); - aclrtFreeHost(v2Host); - if (stream != nullptr) { - const aclError _ret = aclrtDestroyStream(stream); - if (_ret != ACL_SUCCESS) { - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", - "aclrtDestroyStream(stream)", (int)_ret, __FILE__, __LINE__); - } - stream = nullptr; - } - if (deviceSet) { - const aclError _ret = aclrtResetDevice(deviceId); - if (_ret != ACL_SUCCESS) { - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", - "aclrtResetDevice(deviceId)", (int)_ret, __FILE__, __LINE__); - } - } - if (aclInited) { - const aclError _ret = aclFinalize(); - if (_ret != ACL_SUCCESS) { - std::fprintf(stderr, "[ERROR] %s 
failed: %d (%s:%d)\n", - "aclFinalize()", (int)_ret, __FILE__, __LINE__); - } - } - - return rc; -} diff --git a/test/vpto/cases/micro-op/unary-vector/vabs-tail/compare.py b/test/vpto/cases/micro-op/unary-vector/vabs-tail/compare.py deleted file mode 100644 index c13d79273..000000000 --- a/test/vpto/cases/micro-op/unary-vector/vabs-tail/compare.py +++ /dev/null @@ -1,39 +0,0 @@ -#!/usr/bin/python3 -# Copyright (c) 2026 Huawei Technologies Co., Ltd. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. 
- - -import os -import sys -import numpy as np - - -def compare_bin_prefix(golden_path, output_path, dtype, eps, count): - if not os.path.exists(golden_path) or not os.path.exists(output_path): - return False - golden = np.fromfile(golden_path, dtype=dtype, count=count) - output = np.fromfile(output_path, dtype=dtype, count=count) - return golden.shape == output.shape and np.allclose( - golden, output, atol=eps, rtol=eps, equal_nan=True - ) - - -def main(): - strict = os.getenv("COMPARE_STRICT", "1") != "0" - ok = compare_bin_prefix("golden_v2.bin", "v2.bin", np.float32, 1e-4, 1000) - if not ok: - if strict: - print("[ERROR] compare failed") - sys.exit(2) - print("[WARN] compare failed (non-gating)") - return - print("[INFO] compare passed") - - -if __name__ == "__main__": - main() diff --git a/test/vpto/cases/micro-op/unary-vector/vabs-tail/golden.py b/test/vpto/cases/micro-op/unary-vector/vabs-tail/golden.py deleted file mode 100644 index 03fd9a768..000000000 --- a/test/vpto/cases/micro-op/unary-vector/vabs-tail/golden.py +++ /dev/null @@ -1,49 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) 2026 Huawei Technologies Co., Ltd. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. 
- -# coding=utf-8 - -import argparse -from pathlib import Path - -import numpy as np - - -ROWS = 32 -COLS = 32 -SEED = 19 -LOGICAL_ELEMS = 1000 -OUT_SENTINEL = np.float32(-123.25) - - -def generate(output_dir: Path, seed: int) -> None: - rng = np.random.default_rng(seed) - v1 = rng.uniform(-8.0, 8.0, size=(ROWS, COLS)).astype(np.float32) - v2 = np.full((ROWS, COLS), OUT_SENTINEL, dtype=np.float32) - golden_v2 = np.full((ROWS, COLS), OUT_SENTINEL, dtype=np.float32) - golden_v2.reshape(-1)[:LOGICAL_ELEMS] = np.abs( - v1.reshape(-1)[:LOGICAL_ELEMS] - ).astype(np.float32, copy=False) - - output_dir.mkdir(parents=True, exist_ok=True) - v1.reshape(-1).tofile(output_dir / "v1.bin") - v2.reshape(-1).tofile(output_dir / "v2.bin") - golden_v2.reshape(-1).tofile(output_dir / "golden_v2.bin") - - -def main() -> None: - parser = argparse.ArgumentParser() - parser.add_argument("--output-dir", type=Path, default=Path(".")) - parser.add_argument("--seed", type=int, default=SEED) - args = parser.parse_args() - generate(args.output_dir, args.seed) - - -if __name__ == "__main__": - main() diff --git a/test/vpto/cases/micro-op/unary-vector/vabs-tail/kernel.pto b/test/vpto/cases/micro-op/unary-vector/vabs-tail/kernel.pto deleted file mode 100644 index 00f93abc2..000000000 --- a/test/vpto/cases/micro-op/unary-vector/vabs-tail/kernel.pto +++ /dev/null @@ -1,42 +0,0 @@ -module attributes {pto.target_arch = "a5", pto.kernel_kind = #pto.kernel_kind} { - func.func @vabs_tail_kernel_2d(%arg0: !pto.ptr, %arg1: !pto.ptr) attributes {pto.kernel} { - %c0 = arith.constant 0 : index - %c64 = arith.constant 64 : index - %c1024 = arith.constant 1024 : index - %c0_i64 = arith.constant 0 : i64 - %c1_i64 = arith.constant 1 : i64 - %c32_i64 = arith.constant 32 : i64 - %c128_i64 = arith.constant 128 : i64 - %c4096_i64 = arith.constant 4096 : i64 - %c1000_i32 = arith.constant 1000 : i32 - - %ub_in = pto.castptr %c0_i64 : i64 -> !pto.ptr - %ub_out = pto.castptr %c4096_i64 : i64 -> !pto.ptr - - %false = 
arith.constant false - pto.mte_gm_ub %arg0, %ub_in, %c0_i64, %c128_i64 - nburst(%c32_i64, %c128_i64, %c128_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 - - pto.set_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] - pto.wait_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] - - pto.vecscope { - %_:1 = scf.for %offset = %c0 to %c1024 step %c64 iter_args(%remaining = %c1000_i32) -> (i32) { - %mask, %next_remaining = pto.plt_b32 %remaining : i32 -> !pto.mask, i32 - %vec = pto.vlds %ub_in[%offset] : !pto.ptr -> !pto.vreg<64xf32> - %out = pto.vabs %vec, %mask : !pto.vreg<64xf32>, !pto.mask -> !pto.vreg<64xf32> - pto.vsts %out, %ub_out[%offset], %mask : !pto.vreg<64xf32>, !pto.ptr, !pto.mask - scf.yield %next_remaining : i32 - } - } - - pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.mte_ub_gm %ub_out, %arg1, %c128_i64 - nburst(%c32_i64, %c128_i64, %c128_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64 - pto.barrier #pto.pipe - return - } -} diff --git a/test/vpto/cases/micro-op/unary-vector/vabs-tail/launch.cpp b/test/vpto/cases/micro-op/unary-vector/vabs-tail/launch.cpp deleted file mode 100644 index 494bc5bf3..000000000 --- a/test/vpto/cases/micro-op/unary-vector/vabs-tail/launch.cpp +++ /dev/null @@ -1,44 +0,0 @@ -// Copyright (c) 2026 Huawei Technologies Co., Ltd. -// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). -// Please refer to the License for details. You may not use this file except in compliance with the License. -// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -// See LICENSE in the root of the software repository for the full text of the License. 
- -#ifndef __VEC_SCOPE__ -#define __VEC_SCOPE__ -#endif - -#if defined(__CCE_AICORE__) && defined(__NPU_ARCH__) && (__NPU_ARCH__ == 2201) -typedef struct { unsigned char v; } hifloat8_t; -typedef struct { unsigned char v; } float8_e4m3_t; -typedef struct { unsigned char v; } float8_e5m2_t; -typedef struct { unsigned char v; } float8_e8m0_t; -typedef struct { unsigned char v; } float4_e1m2x2_t; -typedef struct { unsigned char v; } float4_e2m1x2_t; -#endif -#include - -#if defined(__CCE_AICORE__) && defined(PTOAS_ENABLE_CCE_PRINT) -#include -#endif - -#if !defined(__CCE_AICORE__) && !defined(TMRGSORT_HPP) -struct MrgSortExecutedNumList { - uint16_t mrgSortList0; - uint16_t mrgSortList1; - uint16_t mrgSortList2; - uint16_t mrgSortList3; -}; -#endif -#ifndef __CPU_SIM -#include "acl/acl.h" -#endif - -extern "C" __global__ [aicore] void vabs_tail_kernel_2d(__gm__ float *v1, - __gm__ float *v2); - -void LaunchVabs_tail_kernel_2d(float *v1, float *v2, void *stream) { - vabs_tail_kernel_2d<<<1, nullptr, stream>>>((__gm__ float *)v1, (__gm__ float *)v2); -} diff --git a/test/vpto/cases/micro-op/unary-vector/vabs-tail/main.cpp b/test/vpto/cases/micro-op/unary-vector/vabs-tail/main.cpp deleted file mode 100644 index cf25e5dff..000000000 --- a/test/vpto/cases/micro-op/unary-vector/vabs-tail/main.cpp +++ /dev/null @@ -1,87 +0,0 @@ -// Copyright (c) 2026 Huawei Technologies Co., Ltd. -// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). -// Please refer to the License for details. You may not use this file except in compliance with the License. -// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -// See LICENSE in the root of the software repository for the full text of the License. 
- -/** -Copyright (c) 2025 Huawei Technologies Co., Ltd. -*/ - -#include "test_common.h" -#include "acl/acl.h" -#include -#include - -using namespace PtoTestCommon; - -#define ACL_CHECK(expr) \ - do { \ - const aclError _ret = (expr); \ - if (_ret != ACL_SUCCESS) { \ - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", #expr, \ - (int)_ret, __FILE__, __LINE__); \ - const char *_recent = aclGetRecentErrMsg(); \ - if (_recent != nullptr && _recent[0] != '\0') \ - std::fprintf(stderr, "[ERROR] RecentErrMsg: %s\n", _recent); \ - rc = 1; \ - goto cleanup; \ - } \ - } while (0) - -void LaunchVabs_tail_kernel_2d(float *v1, float *v2, void *stream); - -int main() { - size_t elemCount_v1 = 1024; - size_t fileSize_v1 = elemCount_v1 * sizeof(float); - size_t elemCount_v2 = 1024; - size_t fileSize_v2 = elemCount_v2 * sizeof(float); - float *v1Host = nullptr; - float *v1Device = nullptr; - float *v2Host = nullptr; - float *v2Device = nullptr; - - int rc = 0; - bool aclInited = false; - bool deviceSet = false; - int deviceId = 0; - aclrtStream stream = nullptr; - - ACL_CHECK(aclInit(nullptr)); - aclInited = true; - if (const char *envDevice = std::getenv("ACL_DEVICE_ID")) - deviceId = std::atoi(envDevice); - ACL_CHECK(aclrtSetDevice(deviceId)); - deviceSet = true; - ACL_CHECK(aclrtCreateStream(&stream)); - - ACL_CHECK(aclrtMallocHost((void **)(&v1Host), fileSize_v1)); - ACL_CHECK(aclrtMallocHost((void **)(&v2Host), fileSize_v2)); - ACL_CHECK(aclrtMalloc((void **)&v1Device, fileSize_v1, ACL_MEM_MALLOC_HUGE_FIRST)); - ACL_CHECK(aclrtMalloc((void **)&v2Device, fileSize_v2, ACL_MEM_MALLOC_HUGE_FIRST)); - - ReadFile("./v1.bin", fileSize_v1, v1Host, fileSize_v1); - ReadFile("./v2.bin", fileSize_v2, v2Host, fileSize_v2); - ACL_CHECK(aclrtMemcpy(v1Device, fileSize_v1, v1Host, fileSize_v1, ACL_MEMCPY_HOST_TO_DEVICE)); - ACL_CHECK(aclrtMemcpy(v2Device, fileSize_v2, v2Host, fileSize_v2, ACL_MEMCPY_HOST_TO_DEVICE)); - LaunchVabs_tail_kernel_2d(v1Device, v2Device, stream); - - 
ACL_CHECK(aclrtSynchronizeStream(stream)); - ACL_CHECK(aclrtMemcpy(v2Host, fileSize_v2, v2Device, fileSize_v2, ACL_MEMCPY_DEVICE_TO_HOST)); - WriteFile("./v2.bin", v2Host, fileSize_v2); - -cleanup: - aclrtFree(v1Device); - aclrtFree(v2Device); - aclrtFreeHost(v1Host); - aclrtFreeHost(v2Host); - if (stream != nullptr) - aclrtDestroyStream(stream); - if (deviceSet) - aclrtResetDevice(deviceId); - if (aclInited) - aclFinalize(); - return rc; -} diff --git a/test/vpto/cases/micro-op/unary-vector/vabs/kernel.pto b/test/vpto/cases/micro-op/unary-vector/vabs/kernel.pto index 75486b4fe..68b161dc6 100644 --- a/test/vpto/cases/micro-op/unary-vector/vabs/kernel.pto +++ b/test/vpto/cases/micro-op/unary-vector/vabs/kernel.pto @@ -1,62 +1,351 @@ -// ============================================================================= -// abs_kernel_2d: Element-wise absolute value on a 32x32 f32 tile -// ============================================================================= -// This kernel computes abs(input) for a 32x32 float32 matrix (1024 elements). -// -// Memory Layout: -// - Input: arg0 -> GM (Global Memory) -// - Output: arg1 -> GM (Global Memory) -// - UB (Unified Buffer) at offset 0: input tile (4096 bytes = 32*32*4) -// - UB at offset 4096: output tile (4096 bytes = 32*32*4) -// -// Pipeline: -// 1. DMA: GM -> UB (MTE2 pipe) - copy input tile to UB -// 2. Sync: wait for MTE2 -> V pipe handoff -// 3. Compute: vabs on 64-element vectors (V pipe) - 16 iterations for 1024 elements -// 4. Sync: wait for V -> MTE3 pipe handoff -// 5. 
DMA: UB -> GM (MTE3 pipe) - copy result tile back to GM -// ============================================================================= - module attributes {pto.target_arch = "a5", pto.kernel_kind = #pto.kernel_kind} { - func.func @vabs_kernel_2d(%arg0: !pto.ptr, %arg1: !pto.ptr) attributes {pto.kernel} { - %c0 = arith.constant 0 : index - %c1 = arith.constant 1 : index - %c64 = arith.constant 64 : index - %c1024 = arith.constant 1024 : index - %c0_i64 = arith.constant 0 : i64 - %c1_i64 = arith.constant 1 : i64 - %c32_i64 = arith.constant 32 : i64 - %c128_i64 = arith.constant 128 : i64 - %c4096_i64 = arith.constant 4096 : i64 - %c1024_i32 = arith.constant 1024 : i32 - - %ub_in = pto.castptr %c0_i64 : i64 -> !pto.ptr - %ub_out = pto.castptr %c4096_i64 : i64 -> !pto.ptr - - %false = arith.constant false - pto.mte_gm_ub %arg0, %ub_in, %c0_i64, %c128_i64 - nburst(%c32_i64, %c128_i64, %c128_i64) + func.func @vabs_deep_merged_kernel(%arg0: !pto.ptr, %arg1: !pto.ptr, %arg2: !pto.ptr, %arg3: !pto.ptr, %arg4: !pto.ptr, %arg5: !pto.ptr, %arg6: !pto.ptr, %arg7: !pto.ptr, %arg8: !pto.ptr, %arg9: !pto.ptr, %arg10: !pto.ptr, %arg11: !pto.ptr, %arg12: !pto.ptr, %arg13: !pto.ptr, %arg14: !pto.ptr, %arg15: !pto.ptr) attributes {pto.kernel} { + %__deep_merge_guard = arith.constant false + // inactive merged from vabs_kernel_2d_vabs_f16 + scf.if %__deep_merge_guard { + + %c0_m0 = arith.constant 0 : index + %c1_m0 = arith.constant 1 : index + %c64_m0 = arith.constant 64 : index + %c1024_m0 = arith.constant 1024 : index + %c0_i64_m0 = arith.constant 0 : i64 + %c1_i64_m0 = arith.constant 1 : i64 + %c32_i64_m0 = arith.constant 32 : i64 + %c128_i64_m0 = arith.constant 128 : i64 + %c4096_i64_m0 = arith.constant 4096 : i64 + %c1024_i32_m0 = arith.constant 1024 : i32 + + %ub_in_m0 = pto.castptr %c0_i64_m0 : i64 -> !pto.ptr + %ub_out_m0 = pto.castptr %c4096_i64_m0 : i64 -> !pto.ptr + + %false_m0 = arith.constant false + pto.mte_gm_ub %arg0, %ub_in_m0, %c0_i64_m0, %c128_i64_m0 + 
nburst(%c32_i64_m0, %c128_i64_m0, %c128_i64_m0) : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 pto.set_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] pto.wait_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] pto.vecscope { - %_:1 = scf.for %offset = %c0 to %c1024 step %c64 iter_args(%remaining = %c1024_i32) -> (i32) { - %mask, %next_remaining = pto.plt_b32 %remaining : i32 -> !pto.mask, i32 - %vec = pto.vlds %ub_in[%offset] : !pto.ptr -> !pto.vreg<64xf32> - %out = pto.vabs %vec, %mask : !pto.vreg<64xf32>, !pto.mask -> !pto.vreg<64xf32> - pto.vsts %out, %ub_out[%offset], %mask : !pto.vreg<64xf32>, !pto.ptr, !pto.mask - scf.yield %next_remaining : i32 + %__m0:1 = scf.for %offset_m0 = %c0_m0 to %c1024_m0 step %c64_m0 iter_args(%remaining_m0 = %c1024_i32_m0) -> (i32) { + %mask_m0, %next_remaining_m0 = pto.plt_b32 %remaining_m0 : i32 -> !pto.mask, i32 + %vec_m0 = pto.vlds %ub_in_m0[%offset_m0] : !pto.ptr -> !pto.vreg<64xf32> + %out_m0 = pto.vabs %vec_m0, %mask_m0 : !pto.vreg<64xf32>, !pto.mask -> !pto.vreg<64xf32> + pto.vsts %out_m0, %ub_out_m0[%offset_m0], %mask_m0 : !pto.vreg<64xf32>, !pto.ptr, !pto.mask + scf.yield %next_remaining_m0 : i32 } } pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.mte_ub_gm %ub_out, %arg1, %c128_i64 - nburst(%c32_i64, %c128_i64, %c128_i64) + pto.mte_ub_gm %ub_out_m0, %arg1, %c128_i64_m0 + nburst(%c32_i64_m0, %c128_i64_m0, %c128_i64_m0) : !pto.ptr, !pto.ptr, i64, i64, i64, i64 pto.barrier #pto.pipe + + } + // inactive merged from vabs_f32_exceptional_kernel_2d + scf.if %__deep_merge_guard { + + %c0_m1 = arith.constant 0 : index + %c64_m1 = arith.constant 64 : index + %c1024_m1 = arith.constant 1024 : index + %c0_i64_m1 = arith.constant 0 : i64 + %c1_i64_m1 = arith.constant 1 : i64 + %c32_i64_m1 = arith.constant 32 : i64 + %c128_i64_m1 = arith.constant 128 : i64 + %c4096_i64_m1 = arith.constant 4096 : i64 + %c1024_i32_m1 = arith.constant 1024 : i32 + + %ub_in_m1 = pto.castptr %c0_i64_m1 : i64 -> 
!pto.ptr + %ub_out_m1 = pto.castptr %c4096_i64_m1 : i64 -> !pto.ptr + + %false_m1 = arith.constant false + pto.mte_gm_ub %arg2, %ub_in_m1, %c0_i64_m1, %c128_i64_m1 + nburst(%c32_i64_m1, %c128_i64_m1, %c128_i64_m1) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 + + pto.set_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] + pto.wait_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] + + pto.vecscope { + %__m1:1 = scf.for %offset_m1 = %c0_m1 to %c1024_m1 step %c64_m1 iter_args(%remaining_m1 = %c1024_i32_m1) -> (i32) { + %mask_m1, %next_remaining_m1 = pto.plt_b32 %remaining_m1 : i32 -> !pto.mask, i32 + %vec_m1 = pto.vlds %ub_in_m1[%offset_m1] : !pto.ptr -> !pto.vreg<64xf32> + %out_m1 = pto.vabs %vec_m1, %mask_m1 : !pto.vreg<64xf32>, !pto.mask -> !pto.vreg<64xf32> + pto.vsts %out_m1, %ub_out_m1[%offset_m1], %mask_m1 : !pto.vreg<64xf32>, !pto.ptr, !pto.mask + scf.yield %next_remaining_m1 : i32 + } + } + + pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] + pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] + pto.mte_ub_gm %ub_out_m1, %arg3, %c128_i64_m1 + nburst(%c32_i64_m1, %c128_i64_m1, %c128_i64_m1) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64 + pto.barrier #pto.pipe + + } + // inactive merged from vabs_kernel_2d_vabs_i16_signed + scf.if %__deep_merge_guard { + + %c0_m2 = arith.constant 0 : index + %c1_m2 = arith.constant 1 : index + %c64_m2 = arith.constant 64 : index + %c1024_m2 = arith.constant 1024 : index + %c0_i64_m2 = arith.constant 0 : i64 + %c1_i64_m2 = arith.constant 1 : i64 + %c32_i64_m2 = arith.constant 32 : i64 + %c128_i64_m2 = arith.constant 128 : i64 + %c4096_i64_m2 = arith.constant 4096 : i64 + %c1024_i32_m2 = arith.constant 1024 : i32 + + %ub_in_m2 = pto.castptr %c0_i64_m2 : i64 -> !pto.ptr + %ub_out_m2 = pto.castptr %c4096_i64_m2 : i64 -> !pto.ptr + + %false_m2 = arith.constant false + pto.mte_gm_ub %arg4, %ub_in_m2, %c0_i64_m2, %c128_i64_m2 + nburst(%c32_i64_m2, %c128_i64_m2, %c128_i64_m2) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 + + pto.set_flag["PIPE_MTE2", 
"PIPE_V", "EVENT_ID0"] + pto.wait_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] + + pto.vecscope { + %__m2:1 = scf.for %offset_m2 = %c0_m2 to %c1024_m2 step %c64_m2 iter_args(%remaining_m2 = %c1024_i32_m2) -> (i32) { + %mask_m2, %next_remaining_m2 = pto.plt_b32 %remaining_m2 : i32 -> !pto.mask, i32 + %vec_m2 = pto.vlds %ub_in_m2[%offset_m2] : !pto.ptr -> !pto.vreg<64xf32> + %out_m2 = pto.vabs %vec_m2, %mask_m2 : !pto.vreg<64xf32>, !pto.mask -> !pto.vreg<64xf32> + pto.vsts %out_m2, %ub_out_m2[%offset_m2], %mask_m2 : !pto.vreg<64xf32>, !pto.ptr, !pto.mask + scf.yield %next_remaining_m2 : i32 + } + } + + pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] + pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] + pto.mte_ub_gm %ub_out_m2, %arg5, %c128_i64_m2 + nburst(%c32_i64_m2, %c128_i64_m2, %c128_i64_m2) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64 + pto.barrier #pto.pipe + + } + // inactive merged from vabs_i16_signed_overflow_edge_kernel + scf.if %__deep_merge_guard { + + %c0_m3 = arith.constant 0 : index + %c128_m3 = arith.constant 128 : index + %c1024_m3 = arith.constant 1024 : index + %c0_i64_m3 = arith.constant 0 : i64 + %c1_i64_m3 = arith.constant 1 : i64 + %c32_i64_m3 = arith.constant 32 : i64 + %c64_i64_m3 = arith.constant 64 : i64 + %c2048_i64_m3 = arith.constant 2048 : i64 + %c1024_i32_m3 = arith.constant 1024 : i32 + + %ub_in_m3 = pto.castptr %c0_i64_m3 : i64 -> !pto.ptr + %ub_out_m3 = pto.castptr %c2048_i64_m3 : i64 -> !pto.ptr + + %false_m3 = arith.constant false + pto.mte_gm_ub %arg6, %ub_in_m3, %c0_i64_m3, %c64_i64_m3 + nburst(%c32_i64_m3, %c64_i64_m3, %c64_i64_m3) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 + + pto.set_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] + pto.wait_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] + + pto.vecscope { + %__m3:1 = scf.for %offset_m3 = %c0_m3 to %c1024_m3 step %c128_m3 iter_args(%remaining_m3 = %c1024_i32_m3) -> (i32) { + %mask_m3, %next_remaining_m3 = pto.plt_b16 %remaining_m3 : i32 -> !pto.mask, i32 + %vec_m3 = pto.vlds 
%ub_in_m3[%offset_m3] : !pto.ptr -> !pto.vreg<128xi16> + %out_m3 = pto.vabs %vec_m3, %mask_m3 : !pto.vreg<128xi16>, !pto.mask -> !pto.vreg<128xi16> + pto.vsts %out_m3, %ub_out_m3[%offset_m3], %mask_m3 : !pto.vreg<128xi16>, !pto.ptr, !pto.mask + scf.yield %next_remaining_m3 : i32 + } + } + + pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] + pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] + pto.mte_ub_gm %ub_out_m3, %arg7, %c64_i64_m3 + nburst(%c32_i64_m3, %c64_i64_m3, %c64_i64_m3) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64 + pto.barrier #pto.pipe + + } + // inactive merged from vabs_kernel_2d_vabs_i16_unsigned + scf.if %__deep_merge_guard { + + %c0_m4 = arith.constant 0 : index + %c1_m4 = arith.constant 1 : index + %c64_m4 = arith.constant 64 : index + %c1024_m4 = arith.constant 1024 : index + %c0_i64_m4 = arith.constant 0 : i64 + %c1_i64_m4 = arith.constant 1 : i64 + %c32_i64_m4 = arith.constant 32 : i64 + %c128_i64_m4 = arith.constant 128 : i64 + %c4096_i64_m4 = arith.constant 4096 : i64 + %c1024_i32_m4 = arith.constant 1024 : i32 + + %ub_in_m4 = pto.castptr %c0_i64_m4 : i64 -> !pto.ptr + %ub_out_m4 = pto.castptr %c4096_i64_m4 : i64 -> !pto.ptr + + %false_m4 = arith.constant false + pto.mte_gm_ub %arg8, %ub_in_m4, %c0_i64_m4, %c128_i64_m4 + nburst(%c32_i64_m4, %c128_i64_m4, %c128_i64_m4) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 + + pto.set_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] + pto.wait_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] + + pto.vecscope { + %__m4:1 = scf.for %offset_m4 = %c0_m4 to %c1024_m4 step %c64_m4 iter_args(%remaining_m4 = %c1024_i32_m4) -> (i32) { + %mask_m4, %next_remaining_m4 = pto.plt_b32 %remaining_m4 : i32 -> !pto.mask, i32 + %vec_m4 = pto.vlds %ub_in_m4[%offset_m4] : !pto.ptr -> !pto.vreg<64xf32> + %out_m4 = pto.vabs %vec_m4, %mask_m4 : !pto.vreg<64xf32>, !pto.mask -> !pto.vreg<64xf32> + pto.vsts %out_m4, %ub_out_m4[%offset_m4], %mask_m4 : !pto.vreg<64xf32>, !pto.ptr, !pto.mask + scf.yield %next_remaining_m4 : i32 + } + } + + 
pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] + pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] + pto.mte_ub_gm %ub_out_m4, %arg9, %c128_i64_m4 + nburst(%c32_i64_m4, %c128_i64_m4, %c128_i64_m4) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64 + pto.barrier #pto.pipe + + } + // inactive merged from vabs_loop_carried_vreg_kernel_2d + scf.if %__deep_merge_guard { + + %c0_m5 = arith.constant 0 : index + %c1_m5 = arith.constant 1 : index + %c2_m5 = arith.constant 2 : index + %c64_m5 = arith.constant 64 : index + %c1000_m5 = arith.constant 1000 : index + %c1024_m5 = arith.constant 1024 : index + %c0_i64_m5 = arith.constant 0 : i64 + %c1_i64_m5 = arith.constant 1 : i64 + %c32_i64_m5 = arith.constant 32 : i64 + %c128_i64_m5 = arith.constant 128 : i64 + %c4096_i64_m5 = arith.constant 4096 : i64 + %c1000_i32_m5 = arith.constant 1000 : i32 + + %ub_in_m5 = pto.castptr %c0_i64_m5 : i64 -> !pto.ptr + %ub_out_m5 = pto.castptr %c4096_i64_m5 : i64 -> !pto.ptr + + %false_m5 = arith.constant false + pto.mte_gm_ub %arg10, %ub_in_m5, %c0_i64_m5, %c128_i64_m5 + nburst(%c32_i64_m5, %c128_i64_m5, %c128_i64_m5) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 + + pto.set_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] + pto.wait_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] + + pto.vecscope { + %__m5:1 = scf.for %offset_m5 = %c0_m5 to %c1000_m5 step %c64_m5 iter_args(%remaining_m5 = %c1000_i32_m5) -> (i32) { + %mask_m5, %next_remaining_m5 = pto.plt_b32 %remaining_m5 : i32 -> !pto.mask, i32 + %vec_m5 = pto.vlds %ub_in_m5[%offset_m5] : !pto.ptr -> !pto.vreg<64xf32> + %out_m5 = scf.for %iter_m5 = %c0_m5 to %c2_m5 step %c1_m5 + iter_args(%carry_m5 = %vec_m5) -> (!pto.vreg<64xf32>) { + %abs_m5 = pto.vabs %carry_m5, %mask_m5 : !pto.vreg<64xf32>, !pto.mask -> !pto.vreg<64xf32> + scf.yield %abs_m5 : !pto.vreg<64xf32> + } + pto.vsts %out_m5, %ub_out_m5[%offset_m5], %mask_m5 : !pto.vreg<64xf32>, !pto.ptr, !pto.mask + scf.yield %next_remaining_m5 : i32 + } + } + + pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] + 
pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] + pto.mte_ub_gm %ub_out_m5, %arg11, %c128_i64_m5 + nburst(%c32_i64_m5, %c128_i64_m5, %c128_i64_m5) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64 + pto.barrier #pto.pipe + + } + // inactive merged from vabs_tail_kernel_2d + scf.if %__deep_merge_guard { + + %c0_m6 = arith.constant 0 : index + %c64_m6 = arith.constant 64 : index + %c1024_m6 = arith.constant 1024 : index + %c0_i64_m6 = arith.constant 0 : i64 + %c1_i64_m6 = arith.constant 1 : i64 + %c32_i64_m6 = arith.constant 32 : i64 + %c128_i64_m6 = arith.constant 128 : i64 + %c4096_i64_m6 = arith.constant 4096 : i64 + %c1000_i32_m6 = arith.constant 1000 : i32 + + %ub_in_m6 = pto.castptr %c0_i64_m6 : i64 -> !pto.ptr + %ub_out_m6 = pto.castptr %c4096_i64_m6 : i64 -> !pto.ptr + + %false_m6 = arith.constant false + pto.mte_gm_ub %arg12, %ub_in_m6, %c0_i64_m6, %c128_i64_m6 + nburst(%c32_i64_m6, %c128_i64_m6, %c128_i64_m6) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 + + pto.set_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] + pto.wait_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] + + pto.vecscope { + %__m6:1 = scf.for %offset_m6 = %c0_m6 to %c1024_m6 step %c64_m6 iter_args(%remaining_m6 = %c1000_i32_m6) -> (i32) { + %mask_m6, %next_remaining_m6 = pto.plt_b32 %remaining_m6 : i32 -> !pto.mask, i32 + %vec_m6 = pto.vlds %ub_in_m6[%offset_m6] : !pto.ptr -> !pto.vreg<64xf32> + %out_m6 = pto.vabs %vec_m6, %mask_m6 : !pto.vreg<64xf32>, !pto.mask -> !pto.vreg<64xf32> + pto.vsts %out_m6, %ub_out_m6[%offset_m6], %mask_m6 : !pto.vreg<64xf32>, !pto.ptr, !pto.mask + scf.yield %next_remaining_m6 : i32 + } + } + + pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] + pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] + pto.mte_ub_gm %ub_out_m6, %arg13, %c128_i64_m6 + nburst(%c32_i64_m6, %c128_i64_m6, %c128_i64_m6) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64 + pto.barrier #pto.pipe + + } + // active merged from vabs_kernel_2d + + %c0_m7 = arith.constant 0 : index + %c1_m7 = arith.constant 1 : index + 
%c64_m7 = arith.constant 64 : index + %c1024_m7 = arith.constant 1024 : index + %c0_i64_m7 = arith.constant 0 : i64 + %c1_i64_m7 = arith.constant 1 : i64 + %c32_i64_m7 = arith.constant 32 : i64 + %c128_i64_m7 = arith.constant 128 : i64 + %c4096_i64_m7 = arith.constant 4096 : i64 + %c1024_i32_m7 = arith.constant 1024 : i32 + + %ub_in_m7 = pto.castptr %c0_i64_m7 : i64 -> !pto.ptr + %ub_out_m7 = pto.castptr %c4096_i64_m7 : i64 -> !pto.ptr + + %false_m7 = arith.constant false + pto.mte_gm_ub %arg14, %ub_in_m7, %c0_i64_m7, %c128_i64_m7 + nburst(%c32_i64_m7, %c128_i64_m7, %c128_i64_m7) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 + + pto.set_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] + pto.wait_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] + + pto.vecscope { + %__m7:1 = scf.for %offset_m7 = %c0_m7 to %c1024_m7 step %c64_m7 iter_args(%remaining_m7 = %c1024_i32_m7) -> (i32) { + %mask_m7, %next_remaining_m7 = pto.plt_b32 %remaining_m7 : i32 -> !pto.mask, i32 + %vec_m7 = pto.vlds %ub_in_m7[%offset_m7] : !pto.ptr -> !pto.vreg<64xf32> + %out_m7 = pto.vabs %vec_m7, %mask_m7 : !pto.vreg<64xf32>, !pto.mask -> !pto.vreg<64xf32> + pto.vsts %out_m7, %ub_out_m7[%offset_m7], %mask_m7 : !pto.vreg<64xf32>, !pto.ptr, !pto.mask + scf.yield %next_remaining_m7 : i32 + } + } + + pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] + pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] + pto.mte_ub_gm %ub_out_m7, %arg15, %c128_i64_m7 + nburst(%c32_i64_m7, %c128_i64_m7, %c128_i64_m7) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64 + pto.barrier #pto.pipe + return } } diff --git a/test/vpto/cases/micro-op/unary-vector/vabs/launch.cpp b/test/vpto/cases/micro-op/unary-vector/vabs/launch.cpp index 9002bcd67..9a5cb828f 100644 --- a/test/vpto/cases/micro-op/unary-vector/vabs/launch.cpp +++ b/test/vpto/cases/micro-op/unary-vector/vabs/launch.cpp @@ -5,22 +5,9 @@ // THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, // INCLUDING BUT NOT LIMITED TO 
NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. // See LICENSE in the root of the software repository for the full text of the License. - -// --------------------------------------------------------------------------- -// PTOAS compatibility layer -// -// The upstream pto-isa headers reference some FP8/FP4 types and the -// __VEC_SCOPE__ marker that are not available on every AICore arch/toolchain -// combination (e.g. __NPU_ARCH__==2201). -// -// For our PTOAS-generated kernels we don't rely on these types today, but the -// headers still mention them in templates/static_asserts. Provide minimal -// fallbacks to keep compilation working on dav-c220. -// --------------------------------------------------------------------------- #ifndef __VEC_SCOPE__ #define __VEC_SCOPE__ #endif - #if defined(__CCE_AICORE__) && defined(__NPU_ARCH__) && (__NPU_ARCH__ == 2201) typedef struct { unsigned char v; } hifloat8_t; typedef struct { unsigned char v; } float8_e4m3_t; @@ -30,33 +17,50 @@ typedef struct { unsigned char v; } float4_e1m2x2_t; typedef struct { unsigned char v; } float4_e2m1x2_t; #endif #include - -// AICore printf support is gated behind `--cce-enable-print` on some -// toolchains. When enabled, include the CCE print header so `cce::printf` -// resolves in device compilation. #if defined(__CCE_AICORE__) && defined(PTOAS_ENABLE_CCE_PRINT) #include #endif - -// Some PTO-ISA types are only available in the __CCE_AICORE__ compilation -// path, but `bisheng -xcce` still performs a host-side parse pass. -// Provide minimal fallbacks only when the corresponding header wasn't -// pulled in by the selected arch implementation. 
#if !defined(__CCE_AICORE__) && !defined(TMRGSORT_HPP) -struct MrgSortExecutedNumList { - uint16_t mrgSortList0; - uint16_t mrgSortList1; - uint16_t mrgSortList2; - uint16_t mrgSortList3; -}; +struct MrgSortExecutedNumList { uint16_t mrgSortList0,mrgSortList1,mrgSortList2,mrgSortList3; }; #endif #ifndef __CPU_SIM #include "acl/acl.h" #endif -extern "C" __global__ [aicore] void vabs_kernel_2d(__gm__ float *v1, - __gm__ float *v2); +extern "C" __global__ [aicore] void vabs_deep_merged_kernel( + __gm__ float * arg0, + __gm__ float * arg1, + __gm__ float * arg2, + __gm__ float * arg3, + __gm__ float * arg4, + __gm__ float * arg5, + __gm__ int16_t * arg6, + __gm__ int16_t * arg7, + __gm__ float * arg8, + __gm__ float * arg9, + __gm__ float * arg10, + __gm__ float * arg11, + __gm__ float * arg12, + __gm__ float * arg13, + __gm__ float * arg14, + __gm__ float * arg15); -void LaunchVabs_kernel_2d(float *v1, float *v2, void *stream) { - vabs_kernel_2d<<<1, nullptr, stream>>>((__gm__ float *)v1, (__gm__ float *)v2); +void LaunchVabsDeepMerged(float * p0, float * p1, void *stream) { + vabs_deep_merged_kernel<<<1, nullptr, stream>>>( + (__gm__ float *)p0, + (__gm__ float *)p0, + (__gm__ float *)p0, + (__gm__ float *)p0, + (__gm__ float *)p0, + (__gm__ float *)p0, + (__gm__ int16_t *)p0, + (__gm__ int16_t *)p0, + (__gm__ float *)p0, + (__gm__ float *)p0, + (__gm__ float *)p0, + (__gm__ float *)p0, + (__gm__ float *)p0, + (__gm__ float *)p0, + (__gm__ float *)p0, + (__gm__ float *)p1); } diff --git a/test/vpto/cases/micro-op/unary-vector/vabs/main.cpp b/test/vpto/cases/micro-op/unary-vector/vabs/main.cpp index 29454461f..04bb724f2 100644 --- a/test/vpto/cases/micro-op/unary-vector/vabs/main.cpp +++ b/test/vpto/cases/micro-op/unary-vector/vabs/main.cpp @@ -47,8 +47,8 @@ struct MrgSortExecutedNumList { } \ } while (0) -void LaunchVabs_kernel_2d(float *v1, float *v2, void *stream); +void LaunchVabsDeepMerged(float * p0, float * p1, void *stream); int main() { size_t elemCount_v1 = 
1024; size_t fileSize_v1 = elemCount_v1 * sizeof(float); @@ -83,7 +83,7 @@ int main() { ReadFile("./v2.bin", fileSize_v2, v2Host, fileSize_v2); ACL_CHECK(aclrtMemcpy(v1Device, fileSize_v1, v1Host, fileSize_v1, ACL_MEMCPY_HOST_TO_DEVICE)); ACL_CHECK(aclrtMemcpy(v2Device, fileSize_v2, v2Host, fileSize_v2, ACL_MEMCPY_HOST_TO_DEVICE)); - LaunchVabs_kernel_2d(v1Device, v2Device, stream); + LaunchVabsDeepMerged(v1Device, v2Device, stream); ACL_CHECK(aclrtSynchronizeStream(stream)); ACL_CHECK(aclrtMemcpy(v2Host, fileSize_v2, v2Device, fileSize_v2, ACL_MEMCPY_DEVICE_TO_HOST)); diff --git a/test/vpto/cases/micro-op/unary-vector/vexp-f16/compare.py b/test/vpto/cases/micro-op/unary-vector/vexp-f16/compare.py deleted file mode 100755 index 1971de729..000000000 --- a/test/vpto/cases/micro-op/unary-vector/vexp-f16/compare.py +++ /dev/null @@ -1,209 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) 2026 Huawei Technologies Co., Ltd. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. - -# case: micro-op/unary-vector/vexp-f16 -# family: unary-vector -# target_ops: pto.vexp -# scenarios: core-f16, full-mask -# NOTE: f16 vector exp baseline. 
-# coding=utf-8 - -import os -import sys -import numpy as np - - -REPEAT_BYTES = 256 - - -def _ceil_div(x, y): - return (x + y - 1) // y - - -def _packed_pred_storage_bytes(logical_elems, src_elem_bytes): - logical_elems = int(logical_elems) - src_elem_bytes = int(src_elem_bytes) - if logical_elems <= 0: - raise ValueError(f"logical_elems must be > 0, got {logical_elems}") - if src_elem_bytes not in (1, 2, 4): - raise ValueError(f"unsupported packed predicate source size: {src_elem_bytes}") - - repeat_elems = REPEAT_BYTES // src_elem_bytes - if src_elem_bytes == 4: - repeat_times = _ceil_div(logical_elems, repeat_elems) + 1 - loop_count = repeat_times // 2 - return loop_count * 16 - - repeat_times = _ceil_div(logical_elems, repeat_elems) - return repeat_times * (repeat_elems // 8) - - -def compare_bin(golden_path, output_path, dtype, eps): - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - dtype_np = np.dtype(dtype) - golden = np.fromfile(golden_path, dtype=dtype_np) - output = np.fromfile(output_path, dtype=dtype_np) - if golden.shape != output.shape: - print(f"[ERROR] Shape mismatch: {golden_path} {golden.shape} vs {output_path} {output.shape}") - return False - if not np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True): - if golden.size: - if np.issubdtype(dtype_np, np.floating): - g = golden.astype(np.float64, copy=False) - o = output.astype(np.float64, copy=False) - elif np.issubdtype(dtype_np, np.integer) or np.issubdtype(dtype_np, np.unsignedinteger): - g = golden.astype(np.int64, copy=False) - o = output.astype(np.int64, copy=False) - else: - g = golden.astype(np.float64, copy=False) - o = output.astype(np.float64, copy=False) - abs_diff = np.abs(g - o) - idx = int(np.argmax(abs_diff)) - diff = float(abs_diff[idx]) - print( - f"[ERROR] Mismatch: {golden_path} vs {output_path}, max 
diff={diff} at idx={idx} " - f"(golden={g[idx]}, out={o[idx]}, dtype={dtype_np})" - ) - else: - print(f"[ERROR] Mismatch: {golden_path} vs {output_path}, empty buffers, dtype={dtype_np}") - return False - return True - - -def compare_bin_prefix(golden_path, output_path, dtype, eps, count): - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - try: - count = int(count) - except Exception: - print(f"[ERROR] Invalid prefix count: {count}") - return False - if count <= 0: - print(f"[ERROR] Invalid prefix count: {count}") - return False - - dtype_np = np.dtype(dtype) - golden = np.fromfile(golden_path, dtype=dtype_np, count=count) - output = np.fromfile(output_path, dtype=dtype_np, count=count) - - if golden.size != count or output.size != count: - print( - f"[ERROR] Prefix read too small: need={count} elems, " - f"golden={golden.size}, out={output.size}" - ) - return False - - if not np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True): - if golden.size: - if np.issubdtype(dtype_np, np.floating): - g = golden.astype(np.float64, copy=False) - o = output.astype(np.float64, copy=False) - elif np.issubdtype(dtype_np, np.integer) or np.issubdtype(dtype_np, np.unsignedinteger): - g = golden.astype(np.int64, copy=False) - o = output.astype(np.int64, copy=False) - else: - g = golden.astype(np.float64, copy=False) - o = output.astype(np.float64, copy=False) - abs_diff = np.abs(g - o) - idx = int(np.argmax(abs_diff)) - diff = float(abs_diff[idx]) - print( - f"[ERROR] Mismatch (prefix): {golden_path} vs {output_path}, max diff={diff} at idx={idx} " - f"(golden={g[idx]}, out={o[idx]}, dtype={dtype_np}, count={count})" - ) - else: - print(f"[ERROR] Mismatch (prefix): {golden_path} vs {output_path}, empty buffers, dtype={dtype_np}") - return False - return True - - -def compare_packed_pred_mask(golden_path, output_path, 
logical_elems, src_elem_bytes): - """ - Compare outputs of pto.tcmp / pto.tcmps. - - PTO-ISA stores packed predicate results as a linear PK byte stream via - `psts`, with the exact written prefix length determined by the typed - TCMP/TCMPS repeat schedule. Compare only that semantic prefix. - """ - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - try: - logical_elems = int(logical_elems) - src_elem_bytes = int(src_elem_bytes) - except Exception: - print( - "[ERROR] Invalid packed mask compare arguments: " - f"logical_elems={logical_elems} src_elem_bytes={src_elem_bytes}" - ) - return False - if logical_elems <= 0 or src_elem_bytes <= 0: - print( - "[ERROR] Invalid packed mask compare arguments: " - f"logical_elems={logical_elems} src_elem_bytes={src_elem_bytes}" - ) - return False - - golden = np.fromfile(golden_path, dtype=np.uint8) - output = np.fromfile(output_path, dtype=np.uint8) - try: - prefix_bytes = _packed_pred_storage_bytes(logical_elems, src_elem_bytes) - except ValueError as exc: - print(f"[ERROR] {exc}") - return False - - if golden.size < prefix_bytes or output.size < prefix_bytes: - print( - f"[ERROR] Packed mask buffer too small: need={prefix_bytes} bytes, " - f"golden={golden.size}, out={output.size}" - ) - return False - - golden_sel = golden[:prefix_bytes] - output_sel = output[:prefix_bytes] - - if not np.array_equal(golden_sel, output_sel): - diff = np.nonzero(golden_sel != output_sel)[0] - idx = int(diff[0]) if diff.size else 0 - print( - f"[ERROR] Mismatch (packed mask): {golden_path} vs {output_path}, first diff at idx={idx} " - f"(golden={int(golden_sel[idx])}, out={int(output_sel[idx])}, " - f"logical_elems={logical_elems}, src_elem_bytes={src_elem_bytes}, prefix_bytes={prefix_bytes})" - ) - return False - return True - - -def main(): - strict = os.getenv("COMPARE_STRICT", "1") != "0" 
- ok = True - ok = compare_bin("golden_v2.bin", "v2.bin", np.float16, 0.01) and ok - if not ok: - if strict: - print("[ERROR] compare failed") - sys.exit(2) - print("[WARN] compare failed (non-gating)") - return - print("[INFO] compare passed") - - -if __name__ == "__main__": - main() diff --git a/test/vpto/cases/micro-op/unary-vector/vexp-f16/golden.py b/test/vpto/cases/micro-op/unary-vector/vexp-f16/golden.py deleted file mode 100755 index aa2de48ca..000000000 --- a/test/vpto/cases/micro-op/unary-vector/vexp-f16/golden.py +++ /dev/null @@ -1,51 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) 2026 Huawei Technologies Co., Ltd. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. - -# case: micro-op/unary-vector/vexp-f16 -# family: unary-vector -# target_ops: pto.vexp -# scenarios: core-f16, full-mask -# NOTE: f16 vector exp baseline. 
-# coding=utf-8 - -import argparse -from pathlib import Path - -import numpy as np - - -ROWS = 32 -COLS = 32 -SEED = 19 - - -def generate(output_dir: Path, seed: int) -> None: - rng = np.random.default_rng(seed) - v1 = rng.uniform(-4.0, 4.0, size=(ROWS, COLS)).astype(np.float16) - v2 = np.zeros((ROWS, COLS), dtype=np.float16) - golden_v2 = np.exp(v1.astype(np.float32)).astype(np.float16) - - output_dir.mkdir(parents=True, exist_ok=True) - v1.reshape(-1).tofile(output_dir / "v1.bin") - v2.reshape(-1).tofile(output_dir / "v2.bin") - golden_v2.reshape(-1).tofile(output_dir / "golden_v2.bin") - - -def main() -> None: - parser = argparse.ArgumentParser( - description="Generate numpy-based inputs/golden for VPTO micro-op vexp f16 validation." - ) - parser.add_argument("--output-dir", type=Path, default=Path(".")) - parser.add_argument("--seed", type=int, default=SEED) - args = parser.parse_args() - generate(args.output_dir, args.seed) - - -if __name__ == "__main__": - main() diff --git a/test/vpto/cases/micro-op/unary-vector/vexp-f16/kernel.pto b/test/vpto/cases/micro-op/unary-vector/vexp-f16/kernel.pto deleted file mode 100644 index 7bd07d6e9..000000000 --- a/test/vpto/cases/micro-op/unary-vector/vexp-f16/kernel.pto +++ /dev/null @@ -1,50 +0,0 @@ -// ----------------------------------------------------------------------------- -// case: micro-op/unary-vector/vexp-f16 -// family: unary-vector -// target_ops: pto.vexp -// scenarios: core-f16, full-mask -// NOTE: f16 vector exp baseline. 
-// ----------------------------------------------------------------------------- -module attributes {pto.target_arch = "a5", pto.kernel_kind = #pto.kernel_kind} { - func.func @vexp_f16_kernel_2d(%arg0: !pto.ptr, %arg1: !pto.ptr) attributes {pto.kernel} { - %c0 = arith.constant 0 : index - %c1 = arith.constant 1 : index - %c128 = arith.constant 128 : index - %c1024 = arith.constant 1024 : index - %c0_i64 = arith.constant 0 : i64 - %c1_i64 = arith.constant 1 : i64 - %c32_i64 = arith.constant 32 : i64 - %c128_i64 = arith.constant 128 : i64 - %c2048_i64 = arith.constant 2048 : i64 - %c1024_i32 = arith.constant 1024 : i32 - - %ub_in = pto.castptr %c0_i64 : i64 -> !pto.ptr - %ub_out = pto.castptr %c2048_i64 : i64 -> !pto.ptr - - %false = arith.constant false - pto.mte_gm_ub %arg0, %ub_in, %c0_i64, %c128_i64 - nburst(%c32_i64, %c128_i64, %c128_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 - - pto.set_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] - pto.wait_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] - - pto.vecscope { - %_:1 = scf.for %offset = %c0 to %c1024 step %c128 iter_args(%remaining = %c1024_i32) -> (i32) { - %mask, %next_remaining = pto.plt_b16 %remaining : i32 -> !pto.mask, i32 - %vec = pto.vlds %ub_in[%offset] : !pto.ptr -> !pto.vreg<128xf16> - %out = pto.vexp %vec, %mask : !pto.vreg<128xf16>, !pto.mask -> !pto.vreg<128xf16> - pto.vsts %out, %ub_out[%offset], %mask : !pto.vreg<128xf16>, !pto.ptr, !pto.mask - scf.yield %next_remaining : i32 - } - } - - pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.mte_ub_gm %ub_out, %arg1, %c128_i64 - nburst(%c32_i64, %c128_i64, %c128_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64 - pto.barrier #pto.pipe - return - } -} diff --git a/test/vpto/cases/micro-op/unary-vector/vexp-f16/launch.cpp b/test/vpto/cases/micro-op/unary-vector/vexp-f16/launch.cpp deleted file mode 100644 index 4530d8cea..000000000 --- a/test/vpto/cases/micro-op/unary-vector/vexp-f16/launch.cpp +++ 
/dev/null @@ -1,71 +0,0 @@ -// Copyright (c) 2026 Huawei Technologies Co., Ltd. -// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). -// Please refer to the License for details. You may not use this file except in compliance with the License. -// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -// See LICENSE in the root of the software repository for the full text of the License. - -// ----------------------------------------------------------------------------- -// case: micro-op/unary-vector/vexp-f16 -// family: unary-vector -// target_ops: pto.vexp -// scenarios: core-f16, full-mask -// NOTE: bulk-generated coverage skeleton. Parser/verifier/lowering failure is -// still a valid test conclusion in the current coverage-first phase. -// ----------------------------------------------------------------------------- -// --------------------------------------------------------------------------- -// PTOAS compatibility layer -// -// The upstream pto-isa headers reference some FP8/FP4 types and the -// __VEC_SCOPE__ marker that are not available on every AICore arch/toolchain -// combination (e.g. __NPU_ARCH__==2201). -// -// For our PTOAS-generated kernels we don't rely on these types today, but the -// headers still mention them in templates/static_asserts. Provide minimal -// fallbacks to keep compilation working on dav-c220. 
-// --------------------------------------------------------------------------- -#ifndef __VEC_SCOPE__ -#define __VEC_SCOPE__ -#endif - -#if defined(__CCE_AICORE__) && defined(__NPU_ARCH__) && (__NPU_ARCH__ == 2201) -typedef struct { unsigned char v; } hifloat8_t; -typedef struct { unsigned char v; } float8_e4m3_t; -typedef struct { unsigned char v; } float8_e5m2_t; -typedef struct { unsigned char v; } float8_e8m0_t; -typedef struct { unsigned char v; } float4_e1m2x2_t; -typedef struct { unsigned char v; } float4_e2m1x2_t; -#endif -#include - -// AICore printf support is gated behind `--cce-enable-print` on some -// toolchains. When enabled, include the CCE print header so `cce::printf` -// resolves in device compilation. -#if defined(__CCE_AICORE__) && defined(PTOAS_ENABLE_CCE_PRINT) -#include -#endif - -// Some PTO-ISA types are only available in the __CCE_AICORE__ compilation -// path, but `bisheng -xcce` still performs a host-side parse pass. -// Provide minimal fallbacks only when the corresponding header wasn't -// pulled in by the selected arch implementation. -#if !defined(__CCE_AICORE__) && !defined(TMRGSORT_HPP) -struct MrgSortExecutedNumList { - uint16_t mrgSortList0; - uint16_t mrgSortList1; - uint16_t mrgSortList2; - uint16_t mrgSortList3; -}; -#endif -#ifndef __CPU_SIM -#include "acl/acl.h" -#endif - -extern "C" __global__ [aicore] void vexp_f16_kernel_2d(__gm__ float *v1, - __gm__ float *v2); - -void LaunchVexp_f16_kernel_2d(float *v1, float *v2, void *stream) { - vexp_f16_kernel_2d<<<1, nullptr, stream>>>((__gm__ float *)v1, - (__gm__ float *)v2); -} diff --git a/test/vpto/cases/micro-op/unary-vector/vexp-f16/main.cpp b/test/vpto/cases/micro-op/unary-vector/vexp-f16/main.cpp deleted file mode 100644 index c41afb75e..000000000 --- a/test/vpto/cases/micro-op/unary-vector/vexp-f16/main.cpp +++ /dev/null @@ -1,130 +0,0 @@ -// Copyright (c) 2026 Huawei Technologies Co., Ltd. 
-// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). -// Please refer to the License for details. You may not use this file except in compliance with the License. -// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -// See LICENSE in the root of the software repository for the full text of the License. - -// ----------------------------------------------------------------------------- -// case: micro-op/unary-vector/vexp-f16 -// family: unary-vector -// target_ops: pto.vexp -// scenarios: core-f16, full-mask -// NOTE: bulk-generated coverage skeleton. Parser/verifier/lowering failure is -// still a valid test conclusion in the current coverage-first phase. -// ----------------------------------------------------------------------------- -/** -Copyright (c) 2025 Huawei Technologies Co., Ltd. -This program is free software, you can redistribute it and/or modify it under the terms and conditions of -CANN Open Software License Agreement Version 2.0 (the "License"). -Please refer to the License for details. You may not use this file except in compliance with the License. -THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -See LICENSE in the root of the software repository for the full text of the License. 
-*/ - -#include "test_common.h" -#include "acl/acl.h" -#include -#include -#include - -using namespace PtoTestCommon; - -#ifndef TMRGSORT_HPP -struct MrgSortExecutedNumList { - uint16_t mrgSortList0; - uint16_t mrgSortList1; - uint16_t mrgSortList2; - uint16_t mrgSortList3; -}; -#endif - -#define ACL_CHECK(expr) \ - do { \ - const aclError _ret = (expr); \ - if (_ret != ACL_SUCCESS) { \ - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", #expr, (int)_ret, __FILE__, __LINE__); \ - const char *_recent = aclGetRecentErrMsg(); \ - if (_recent != nullptr && _recent[0] != '\0') { \ - std::fprintf(stderr, "[ERROR] RecentErrMsg: %s\n", _recent); \ - } \ - rc = 1; \ - goto cleanup; \ - } \ - } while (0) - -void LaunchVexp_f16_kernel_2d(float *v1, float *v2, void *stream); - -int main() { - size_t elemCount_v1 = 1024; - size_t fileSize_v1 = elemCount_v1 * sizeof(float); - size_t elemCount_v2 = 1024; - size_t fileSize_v2 = elemCount_v2 * sizeof(float); - float *v1Host = nullptr; - float *v1Device = nullptr; - float *v2Host = nullptr; - float *v2Device = nullptr; - - int rc = 0; - bool aclInited = false; - bool deviceSet = false; - int deviceId = 0; - aclrtStream stream = nullptr; - - ACL_CHECK(aclInit(nullptr)); - aclInited = true; - if (const char *envDevice = std::getenv("ACL_DEVICE_ID")) { - deviceId = std::atoi(envDevice); - } - ACL_CHECK(aclrtSetDevice(deviceId)); - deviceSet = true; - ACL_CHECK(aclrtCreateStream(&stream)); - - ACL_CHECK(aclrtMallocHost((void **)(&v1Host), fileSize_v1)); - ACL_CHECK(aclrtMallocHost((void **)(&v2Host), fileSize_v2)); - ACL_CHECK(aclrtMalloc((void **)&v1Device, fileSize_v1, ACL_MEM_MALLOC_HUGE_FIRST)); - ACL_CHECK(aclrtMalloc((void **)&v2Device, fileSize_v2, ACL_MEM_MALLOC_HUGE_FIRST)); - - ReadFile("./v1.bin", fileSize_v1, v1Host, fileSize_v1); - ReadFile("./v2.bin", fileSize_v2, v2Host, fileSize_v2); - ACL_CHECK(aclrtMemcpy(v1Device, fileSize_v1, v1Host, fileSize_v1, ACL_MEMCPY_HOST_TO_DEVICE)); - ACL_CHECK(aclrtMemcpy(v2Device, 
fileSize_v2, v2Host, fileSize_v2, ACL_MEMCPY_HOST_TO_DEVICE)); - LaunchVexp_f16_kernel_2d(v1Device, v2Device, stream); - - ACL_CHECK(aclrtSynchronizeStream(stream)); - ACL_CHECK(aclrtMemcpy(v2Host, fileSize_v2, v2Device, fileSize_v2, ACL_MEMCPY_DEVICE_TO_HOST)); - - WriteFile("./v2.bin", v2Host, fileSize_v2); - -cleanup: - aclrtFree(v1Device); - aclrtFree(v2Device); - aclrtFreeHost(v1Host); - aclrtFreeHost(v2Host); - if (stream != nullptr) { - const aclError _ret = aclrtDestroyStream(stream); - if (_ret != ACL_SUCCESS) { - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", - "aclrtDestroyStream(stream)", (int)_ret, __FILE__, __LINE__); - } - stream = nullptr; - } - if (deviceSet) { - const aclError _ret = aclrtResetDevice(deviceId); - if (_ret != ACL_SUCCESS) { - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", - "aclrtResetDevice(deviceId)", (int)_ret, __FILE__, __LINE__); - } - } - if (aclInited) { - const aclError _ret = aclFinalize(); - if (_ret != ACL_SUCCESS) { - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", - "aclFinalize()", (int)_ret, __FILE__, __LINE__); - } - } - - return rc; -} diff --git a/test/vpto/cases/micro-op/unary-vector/vexp-f32-exceptional/compare.py b/test/vpto/cases/micro-op/unary-vector/vexp-f32-exceptional/compare.py deleted file mode 100644 index 962985a24..000000000 --- a/test/vpto/cases/micro-op/unary-vector/vexp-f32-exceptional/compare.py +++ /dev/null @@ -1,204 +0,0 @@ -#!/usr/bin/python3 -# Copyright (c) 2026 Huawei Technologies Co., Ltd. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. 
-# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. - -# coding=utf-8 - -import os -import sys -import numpy as np - - -REPEAT_BYTES = 256 - - -def _ceil_div(x, y): - return (x + y - 1) // y - - -def _packed_pred_storage_bytes(logical_elems, src_elem_bytes): - logical_elems = int(logical_elems) - src_elem_bytes = int(src_elem_bytes) - if logical_elems <= 0: - raise ValueError(f"logical_elems must be > 0, got {logical_elems}") - if src_elem_bytes not in (1, 2, 4): - raise ValueError(f"unsupported packed predicate source size: {src_elem_bytes}") - - repeat_elems = REPEAT_BYTES // src_elem_bytes - if src_elem_bytes == 4: - repeat_times = _ceil_div(logical_elems, repeat_elems) + 1 - loop_count = repeat_times // 2 - return loop_count * 16 - - repeat_times = _ceil_div(logical_elems, repeat_elems) - return repeat_times * (repeat_elems // 8) - - -def compare_bin(golden_path, output_path, dtype, eps): - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - dtype_np = np.dtype(dtype) - golden = np.fromfile(golden_path, dtype=dtype_np) - output = np.fromfile(output_path, dtype=dtype_np) - if golden.shape != output.shape: - print(f"[ERROR] Shape mismatch: {golden_path} {golden.shape} vs {output_path} {output.shape}") - return False - if not np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True): - if golden.size: - if np.issubdtype(dtype_np, np.floating): - g = golden.astype(np.float64, copy=False) - o = output.astype(np.float64, copy=False) - elif np.issubdtype(dtype_np, np.integer) or np.issubdtype(dtype_np, np.unsignedinteger): - g = golden.astype(np.int64, copy=False) - 
o = output.astype(np.int64, copy=False) - else: - g = golden.astype(np.float64, copy=False) - o = output.astype(np.float64, copy=False) - abs_diff = np.abs(g - o) - idx = int(np.argmax(abs_diff)) - diff = float(abs_diff[idx]) - print( - f"[ERROR] Mismatch: {golden_path} vs {output_path}, max diff={diff} at idx={idx} " - f"(golden={g[idx]}, out={o[idx]}, dtype={dtype_np})" - ) - else: - print(f"[ERROR] Mismatch: {golden_path} vs {output_path}, empty buffers, dtype={dtype_np}") - return False - return True - - -def compare_bin_prefix(golden_path, output_path, dtype, eps, count): - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - try: - count = int(count) - except Exception: - print(f"[ERROR] Invalid prefix count: {count}") - return False - if count <= 0: - print(f"[ERROR] Invalid prefix count: {count}") - return False - - dtype_np = np.dtype(dtype) - golden = np.fromfile(golden_path, dtype=dtype_np, count=count) - output = np.fromfile(output_path, dtype=dtype_np, count=count) - - if golden.size != count or output.size != count: - print( - f"[ERROR] Prefix read too small: need={count} elems, " - f"golden={golden.size}, out={output.size}" - ) - return False - - if not np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True): - if golden.size: - if np.issubdtype(dtype_np, np.floating): - g = golden.astype(np.float64, copy=False) - o = output.astype(np.float64, copy=False) - elif np.issubdtype(dtype_np, np.integer) or np.issubdtype(dtype_np, np.unsignedinteger): - g = golden.astype(np.int64, copy=False) - o = output.astype(np.int64, copy=False) - else: - g = golden.astype(np.float64, copy=False) - o = output.astype(np.float64, copy=False) - abs_diff = np.abs(g - o) - idx = int(np.argmax(abs_diff)) - diff = float(abs_diff[idx]) - print( - f"[ERROR] Mismatch (prefix): {golden_path} vs {output_path}, max 
diff={diff} at idx={idx} " - f"(golden={g[idx]}, out={o[idx]}, dtype={dtype_np}, count={count})" - ) - else: - print(f"[ERROR] Mismatch (prefix): {golden_path} vs {output_path}, empty buffers, dtype={dtype_np}") - return False - return True - - -def compare_packed_pred_mask(golden_path, output_path, logical_elems, src_elem_bytes): - """ - Compare outputs of pto.tcmp / pto.tcmps. - - PTO-ISA stores packed predicate results as a linear PK byte stream via - `psts`, with the exact written prefix length determined by the typed - TCMP/TCMPS repeat schedule. Compare only that semantic prefix. - """ - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - try: - logical_elems = int(logical_elems) - src_elem_bytes = int(src_elem_bytes) - except Exception: - print( - "[ERROR] Invalid packed mask compare arguments: " - f"logical_elems={logical_elems} src_elem_bytes={src_elem_bytes}" - ) - return False - if logical_elems <= 0 or src_elem_bytes <= 0: - print( - "[ERROR] Invalid packed mask compare arguments: " - f"logical_elems={logical_elems} src_elem_bytes={src_elem_bytes}" - ) - return False - - golden = np.fromfile(golden_path, dtype=np.uint8) - output = np.fromfile(output_path, dtype=np.uint8) - try: - prefix_bytes = _packed_pred_storage_bytes(logical_elems, src_elem_bytes) - except ValueError as exc: - print(f"[ERROR] {exc}") - return False - - if golden.size < prefix_bytes or output.size < prefix_bytes: - print( - f"[ERROR] Packed mask buffer too small: need={prefix_bytes} bytes, " - f"golden={golden.size}, out={output.size}" - ) - return False - - golden_sel = golden[:prefix_bytes] - output_sel = output[:prefix_bytes] - - if not np.array_equal(golden_sel, output_sel): - diff = np.nonzero(golden_sel != output_sel)[0] - idx = int(diff[0]) if diff.size else 0 - print( - f"[ERROR] Mismatch (packed mask): {golden_path} vs 
{output_path}, first diff at idx={idx} " - f"(golden={int(golden_sel[idx])}, out={int(output_sel[idx])}, " - f"logical_elems={logical_elems}, src_elem_bytes={src_elem_bytes}, prefix_bytes={prefix_bytes})" - ) - return False - return True - - -def main(): - strict = os.getenv("COMPARE_STRICT", "1") != "0" - ok = True - ok = compare_bin("golden_v2.bin", "v2.bin", np.float32, 0.0001) and ok - if not ok: - if strict: - print("[ERROR] compare failed") - sys.exit(2) - print("[WARN] compare failed (non-gating)") - return - print("[INFO] compare passed") - - -if __name__ == "__main__": - main() diff --git a/test/vpto/cases/micro-op/unary-vector/vexp-f32-exceptional/golden.py b/test/vpto/cases/micro-op/unary-vector/vexp-f32-exceptional/golden.py deleted file mode 100644 index fd76b39a9..000000000 --- a/test/vpto/cases/micro-op/unary-vector/vexp-f32-exceptional/golden.py +++ /dev/null @@ -1,48 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) 2026 Huawei Technologies Co., Ltd. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. 
- -# coding=utf-8 - -import argparse -from pathlib import Path - -import numpy as np - - -ROWS = 32 -COLS = 32 -SEED = 19 - - -def generate(output_dir: Path, seed: int) -> None: - del seed - specials = np.array( - [-np.inf, -1.0, -0.0, 0.0, 1.0, np.inf, np.nan, 3.5], - dtype=np.float32, - ) - v1 = np.resize(specials, ROWS * COLS).reshape(ROWS, COLS).astype(np.float32) - v2 = np.zeros((ROWS, COLS), dtype=np.float32) - golden_v2 = np.exp(v1).astype(np.float32, copy=False) - - output_dir.mkdir(parents=True, exist_ok=True) - v1.reshape(-1).tofile(output_dir / "v1.bin") - v2.reshape(-1).tofile(output_dir / "v2.bin") - golden_v2.reshape(-1).tofile(output_dir / "golden_v2.bin") - - -def main() -> None: - parser = argparse.ArgumentParser() - parser.add_argument("--output-dir", type=Path, default=Path(".")) - parser.add_argument("--seed", type=int, default=SEED) - args = parser.parse_args() - generate(args.output_dir, args.seed) - - -if __name__ == "__main__": - main() diff --git a/test/vpto/cases/micro-op/unary-vector/vexp-f32-exceptional/kernel.pto b/test/vpto/cases/micro-op/unary-vector/vexp-f32-exceptional/kernel.pto deleted file mode 100644 index d98e71ba8..000000000 --- a/test/vpto/cases/micro-op/unary-vector/vexp-f32-exceptional/kernel.pto +++ /dev/null @@ -1,42 +0,0 @@ -module attributes {pto.target_arch = "a5", pto.kernel_kind = #pto.kernel_kind} { - func.func @vexp_f32_exceptional_kernel_2d(%arg0: !pto.ptr, %arg1: !pto.ptr) attributes {pto.kernel} { - %c0 = arith.constant 0 : index - %c64 = arith.constant 64 : index - %c1024 = arith.constant 1024 : index - %c0_i64 = arith.constant 0 : i64 - %c1_i64 = arith.constant 1 : i64 - %c32_i64 = arith.constant 32 : i64 - %c128_i64 = arith.constant 128 : i64 - %c4096_i64 = arith.constant 4096 : i64 - %c1024_i32 = arith.constant 1024 : i32 - - %ub_in = pto.castptr %c0_i64 : i64 -> !pto.ptr - %ub_out = pto.castptr %c4096_i64 : i64 -> !pto.ptr - - %false = arith.constant false - pto.mte_gm_ub %arg0, %ub_in, %c0_i64, %c128_i64 - 
nburst(%c32_i64, %c128_i64, %c128_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 - - pto.set_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] - pto.wait_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] - - pto.vecscope { - %_:1 = scf.for %offset = %c0 to %c1024 step %c64 iter_args(%remaining = %c1024_i32) -> (i32) { - %mask, %next_remaining = pto.plt_b32 %remaining : i32 -> !pto.mask, i32 - %vec = pto.vlds %ub_in[%offset] : !pto.ptr -> !pto.vreg<64xf32> - %out = pto.vexp %vec, %mask : !pto.vreg<64xf32>, !pto.mask -> !pto.vreg<64xf32> - pto.vsts %out, %ub_out[%offset], %mask : !pto.vreg<64xf32>, !pto.ptr, !pto.mask - scf.yield %next_remaining : i32 - } - } - - pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.mte_ub_gm %ub_out, %arg1, %c128_i64 - nburst(%c32_i64, %c128_i64, %c128_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64 - pto.barrier #pto.pipe - return - } -} diff --git a/test/vpto/cases/micro-op/unary-vector/vexp-f32-exceptional/launch.cpp b/test/vpto/cases/micro-op/unary-vector/vexp-f32-exceptional/launch.cpp deleted file mode 100644 index f96f1fc2e..000000000 --- a/test/vpto/cases/micro-op/unary-vector/vexp-f32-exceptional/launch.cpp +++ /dev/null @@ -1,44 +0,0 @@ -// Copyright (c) 2026 Huawei Technologies Co., Ltd. -// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). -// Please refer to the License for details. You may not use this file except in compliance with the License. -// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -// See LICENSE in the root of the software repository for the full text of the License. 
- -#ifndef __VEC_SCOPE__ -#define __VEC_SCOPE__ -#endif - -#if defined(__CCE_AICORE__) && defined(__NPU_ARCH__) && (__NPU_ARCH__ == 2201) -typedef struct { unsigned char v; } hifloat8_t; -typedef struct { unsigned char v; } float8_e4m3_t; -typedef struct { unsigned char v; } float8_e5m2_t; -typedef struct { unsigned char v; } float8_e8m0_t; -typedef struct { unsigned char v; } float4_e1m2x2_t; -typedef struct { unsigned char v; } float4_e2m1x2_t; -#endif -#include -#if defined(__CCE_AICORE__) && defined(PTOAS_ENABLE_CCE_PRINT) -#include -#endif - -#if !defined(__CCE_AICORE__) && !defined(TMRGSORT_HPP) -struct MrgSortExecutedNumList { - uint16_t mrgSortList0; - uint16_t mrgSortList1; - uint16_t mrgSortList2; - uint16_t mrgSortList3; -}; -#endif -#ifndef __CPU_SIM -#include "acl/acl.h" -#endif - -extern "C" __global__ [aicore] void vexp_f32_exceptional_kernel_2d(__gm__ float *v1, - __gm__ float *v2); - -void LaunchVexp_f32_exceptional_kernel_2d(float *v1, float *v2, void *stream) { - vexp_f32_exceptional_kernel_2d<<<1, nullptr, stream>>>((__gm__ float *)v1, - (__gm__ float *)v2); -} diff --git a/test/vpto/cases/micro-op/unary-vector/vexp-f32-exceptional/main.cpp b/test/vpto/cases/micro-op/unary-vector/vexp-f32-exceptional/main.cpp deleted file mode 100644 index 2a6824d9f..000000000 --- a/test/vpto/cases/micro-op/unary-vector/vexp-f32-exceptional/main.cpp +++ /dev/null @@ -1,83 +0,0 @@ -// Copyright (c) 2026 Huawei Technologies Co., Ltd. -// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). -// Please refer to the License for details. You may not use this file except in compliance with the License. -// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. 
-// See LICENSE in the root of the software repository for the full text of the License. - -/** -Copyright (c) 2025 Huawei Technologies Co., Ltd. -*/ - -#include "test_common.h" -#include "acl/acl.h" -#include -#include - -using namespace PtoTestCommon; - -#define ACL_CHECK(expr) \ - do { \ - const aclError _ret = (expr); \ - if (_ret != ACL_SUCCESS) { \ - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", #expr, \ - (int)_ret, __FILE__, __LINE__); \ - rc = 1; \ - goto cleanup; \ - } \ - } while (0) - -void LaunchVexp_f32_exceptional_kernel_2d(float *v1, float *v2, void *stream); - -int main() { - size_t elemCount_v1 = 1024; - size_t fileSize_v1 = elemCount_v1 * sizeof(float); - size_t elemCount_v2 = 1024; - size_t fileSize_v2 = elemCount_v2 * sizeof(float); - float *v1Host = nullptr; - float *v1Device = nullptr; - float *v2Host = nullptr; - float *v2Device = nullptr; - int rc = 0; - bool aclInited = false; - bool deviceSet = false; - int deviceId = 0; - aclrtStream stream = nullptr; - - ACL_CHECK(aclInit(nullptr)); - aclInited = true; - if (const char *envDevice = std::getenv("ACL_DEVICE_ID")) - deviceId = std::atoi(envDevice); - ACL_CHECK(aclrtSetDevice(deviceId)); - deviceSet = true; - ACL_CHECK(aclrtCreateStream(&stream)); - ACL_CHECK(aclrtMallocHost((void **)(&v1Host), fileSize_v1)); - ACL_CHECK(aclrtMallocHost((void **)(&v2Host), fileSize_v2)); - ACL_CHECK(aclrtMalloc((void **)&v1Device, fileSize_v1, ACL_MEM_MALLOC_HUGE_FIRST)); - ACL_CHECK(aclrtMalloc((void **)&v2Device, fileSize_v2, ACL_MEM_MALLOC_HUGE_FIRST)); - ReadFile("./v1.bin", fileSize_v1, v1Host, fileSize_v1); - ReadFile("./v2.bin", fileSize_v2, v2Host, fileSize_v2); - ACL_CHECK(aclrtMemcpy(v1Device, fileSize_v1, v1Host, fileSize_v1, - ACL_MEMCPY_HOST_TO_DEVICE)); - ACL_CHECK(aclrtMemcpy(v2Device, fileSize_v2, v2Host, fileSize_v2, - ACL_MEMCPY_HOST_TO_DEVICE)); - LaunchVexp_f32_exceptional_kernel_2d(v1Device, v2Device, stream); - ACL_CHECK(aclrtSynchronizeStream(stream)); - 
ACL_CHECK(aclrtMemcpy(v2Host, fileSize_v2, v2Device, fileSize_v2, - ACL_MEMCPY_DEVICE_TO_HOST)); - WriteFile("./v2.bin", v2Host, fileSize_v2); - -cleanup: - aclrtFree(v1Device); - aclrtFree(v2Device); - aclrtFreeHost(v1Host); - aclrtFreeHost(v2Host); - if (stream != nullptr) - aclrtDestroyStream(stream); - if (deviceSet) - aclrtResetDevice(deviceId); - if (aclInited) - aclFinalize(); - return rc; -} diff --git a/test/vpto/cases/micro-op/unary-vector/vexp-f32-over-underflow/compare.py b/test/vpto/cases/micro-op/unary-vector/vexp-f32-over-underflow/compare.py deleted file mode 100644 index 962985a24..000000000 --- a/test/vpto/cases/micro-op/unary-vector/vexp-f32-over-underflow/compare.py +++ /dev/null @@ -1,204 +0,0 @@ -#!/usr/bin/python3 -# Copyright (c) 2026 Huawei Technologies Co., Ltd. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. 
- -# coding=utf-8 - -import os -import sys -import numpy as np - - -REPEAT_BYTES = 256 - - -def _ceil_div(x, y): - return (x + y - 1) // y - - -def _packed_pred_storage_bytes(logical_elems, src_elem_bytes): - logical_elems = int(logical_elems) - src_elem_bytes = int(src_elem_bytes) - if logical_elems <= 0: - raise ValueError(f"logical_elems must be > 0, got {logical_elems}") - if src_elem_bytes not in (1, 2, 4): - raise ValueError(f"unsupported packed predicate source size: {src_elem_bytes}") - - repeat_elems = REPEAT_BYTES // src_elem_bytes - if src_elem_bytes == 4: - repeat_times = _ceil_div(logical_elems, repeat_elems) + 1 - loop_count = repeat_times // 2 - return loop_count * 16 - - repeat_times = _ceil_div(logical_elems, repeat_elems) - return repeat_times * (repeat_elems // 8) - - -def compare_bin(golden_path, output_path, dtype, eps): - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - dtype_np = np.dtype(dtype) - golden = np.fromfile(golden_path, dtype=dtype_np) - output = np.fromfile(output_path, dtype=dtype_np) - if golden.shape != output.shape: - print(f"[ERROR] Shape mismatch: {golden_path} {golden.shape} vs {output_path} {output.shape}") - return False - if not np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True): - if golden.size: - if np.issubdtype(dtype_np, np.floating): - g = golden.astype(np.float64, copy=False) - o = output.astype(np.float64, copy=False) - elif np.issubdtype(dtype_np, np.integer) or np.issubdtype(dtype_np, np.unsignedinteger): - g = golden.astype(np.int64, copy=False) - o = output.astype(np.int64, copy=False) - else: - g = golden.astype(np.float64, copy=False) - o = output.astype(np.float64, copy=False) - abs_diff = np.abs(g - o) - idx = int(np.argmax(abs_diff)) - diff = float(abs_diff[idx]) - print( - f"[ERROR] Mismatch: {golden_path} vs {output_path}, max 
diff={diff} at idx={idx} " - f"(golden={g[idx]}, out={o[idx]}, dtype={dtype_np})" - ) - else: - print(f"[ERROR] Mismatch: {golden_path} vs {output_path}, empty buffers, dtype={dtype_np}") - return False - return True - - -def compare_bin_prefix(golden_path, output_path, dtype, eps, count): - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - try: - count = int(count) - except Exception: - print(f"[ERROR] Invalid prefix count: {count}") - return False - if count <= 0: - print(f"[ERROR] Invalid prefix count: {count}") - return False - - dtype_np = np.dtype(dtype) - golden = np.fromfile(golden_path, dtype=dtype_np, count=count) - output = np.fromfile(output_path, dtype=dtype_np, count=count) - - if golden.size != count or output.size != count: - print( - f"[ERROR] Prefix read too small: need={count} elems, " - f"golden={golden.size}, out={output.size}" - ) - return False - - if not np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True): - if golden.size: - if np.issubdtype(dtype_np, np.floating): - g = golden.astype(np.float64, copy=False) - o = output.astype(np.float64, copy=False) - elif np.issubdtype(dtype_np, np.integer) or np.issubdtype(dtype_np, np.unsignedinteger): - g = golden.astype(np.int64, copy=False) - o = output.astype(np.int64, copy=False) - else: - g = golden.astype(np.float64, copy=False) - o = output.astype(np.float64, copy=False) - abs_diff = np.abs(g - o) - idx = int(np.argmax(abs_diff)) - diff = float(abs_diff[idx]) - print( - f"[ERROR] Mismatch (prefix): {golden_path} vs {output_path}, max diff={diff} at idx={idx} " - f"(golden={g[idx]}, out={o[idx]}, dtype={dtype_np}, count={count})" - ) - else: - print(f"[ERROR] Mismatch (prefix): {golden_path} vs {output_path}, empty buffers, dtype={dtype_np}") - return False - return True - - -def compare_packed_pred_mask(golden_path, output_path, 
logical_elems, src_elem_bytes): - """ - Compare outputs of pto.tcmp / pto.tcmps. - - PTO-ISA stores packed predicate results as a linear PK byte stream via - `psts`, with the exact written prefix length determined by the typed - TCMP/TCMPS repeat schedule. Compare only that semantic prefix. - """ - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - try: - logical_elems = int(logical_elems) - src_elem_bytes = int(src_elem_bytes) - except Exception: - print( - "[ERROR] Invalid packed mask compare arguments: " - f"logical_elems={logical_elems} src_elem_bytes={src_elem_bytes}" - ) - return False - if logical_elems <= 0 or src_elem_bytes <= 0: - print( - "[ERROR] Invalid packed mask compare arguments: " - f"logical_elems={logical_elems} src_elem_bytes={src_elem_bytes}" - ) - return False - - golden = np.fromfile(golden_path, dtype=np.uint8) - output = np.fromfile(output_path, dtype=np.uint8) - try: - prefix_bytes = _packed_pred_storage_bytes(logical_elems, src_elem_bytes) - except ValueError as exc: - print(f"[ERROR] {exc}") - return False - - if golden.size < prefix_bytes or output.size < prefix_bytes: - print( - f"[ERROR] Packed mask buffer too small: need={prefix_bytes} bytes, " - f"golden={golden.size}, out={output.size}" - ) - return False - - golden_sel = golden[:prefix_bytes] - output_sel = output[:prefix_bytes] - - if not np.array_equal(golden_sel, output_sel): - diff = np.nonzero(golden_sel != output_sel)[0] - idx = int(diff[0]) if diff.size else 0 - print( - f"[ERROR] Mismatch (packed mask): {golden_path} vs {output_path}, first diff at idx={idx} " - f"(golden={int(golden_sel[idx])}, out={int(output_sel[idx])}, " - f"logical_elems={logical_elems}, src_elem_bytes={src_elem_bytes}, prefix_bytes={prefix_bytes})" - ) - return False - return True - - -def main(): - strict = os.getenv("COMPARE_STRICT", "1") != "0" 
- ok = True - ok = compare_bin("golden_v2.bin", "v2.bin", np.float32, 0.0001) and ok - if not ok: - if strict: - print("[ERROR] compare failed") - sys.exit(2) - print("[WARN] compare failed (non-gating)") - return - print("[INFO] compare passed") - - -if __name__ == "__main__": - main() diff --git a/test/vpto/cases/micro-op/unary-vector/vexp-f32-over-underflow/golden.py b/test/vpto/cases/micro-op/unary-vector/vexp-f32-over-underflow/golden.py deleted file mode 100644 index 11cde41fe..000000000 --- a/test/vpto/cases/micro-op/unary-vector/vexp-f32-over-underflow/golden.py +++ /dev/null @@ -1,48 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) 2026 Huawei Technologies Co., Ltd. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. 
- -# coding=utf-8 - -import argparse -from pathlib import Path - -import numpy as np - - -ROWS = 32 -COLS = 32 -SEED = 19 - - -def generate(output_dir: Path, seed: int) -> None: - del seed - specials = np.array( - [-120.0, -104.0, -88.0, 0.0, 40.0, 88.0, 90.0, 104.0], - dtype=np.float32, - ) - v1 = np.resize(specials, ROWS * COLS).reshape(ROWS, COLS).astype(np.float32) - v2 = np.zeros((ROWS, COLS), dtype=np.float32) - golden_v2 = np.exp(v1).astype(np.float32, copy=False) - - output_dir.mkdir(parents=True, exist_ok=True) - v1.reshape(-1).tofile(output_dir / "v1.bin") - v2.reshape(-1).tofile(output_dir / "v2.bin") - golden_v2.reshape(-1).tofile(output_dir / "golden_v2.bin") - - -def main() -> None: - parser = argparse.ArgumentParser() - parser.add_argument("--output-dir", type=Path, default=Path(".")) - parser.add_argument("--seed", type=int, default=SEED) - args = parser.parse_args() - generate(args.output_dir, args.seed) - - -if __name__ == "__main__": - main() diff --git a/test/vpto/cases/micro-op/unary-vector/vexp-f32-over-underflow/kernel.pto b/test/vpto/cases/micro-op/unary-vector/vexp-f32-over-underflow/kernel.pto deleted file mode 100644 index 8d3e0f31f..000000000 --- a/test/vpto/cases/micro-op/unary-vector/vexp-f32-over-underflow/kernel.pto +++ /dev/null @@ -1,42 +0,0 @@ -module attributes {pto.target_arch = "a5", pto.kernel_kind = #pto.kernel_kind} { - func.func @vexp_f32_over_underflow_kernel_2d(%arg0: !pto.ptr, %arg1: !pto.ptr) attributes {pto.kernel} { - %c0 = arith.constant 0 : index - %c64 = arith.constant 64 : index - %c1024 = arith.constant 1024 : index - %c0_i64 = arith.constant 0 : i64 - %c1_i64 = arith.constant 1 : i64 - %c32_i64 = arith.constant 32 : i64 - %c128_i64 = arith.constant 128 : i64 - %c4096_i64 = arith.constant 4096 : i64 - %c1024_i32 = arith.constant 1024 : i32 - - %ub_in = pto.castptr %c0_i64 : i64 -> !pto.ptr - %ub_out = pto.castptr %c4096_i64 : i64 -> !pto.ptr - - %false = arith.constant false - pto.mte_gm_ub %arg0, %ub_in, 
%c0_i64, %c128_i64 - nburst(%c32_i64, %c128_i64, %c128_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 - - pto.set_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] - pto.wait_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] - - pto.vecscope { - %_:1 = scf.for %offset = %c0 to %c1024 step %c64 iter_args(%remaining = %c1024_i32) -> (i32) { - %mask, %next_remaining = pto.plt_b32 %remaining : i32 -> !pto.mask, i32 - %vec = pto.vlds %ub_in[%offset] : !pto.ptr -> !pto.vreg<64xf32> - %out = pto.vexp %vec, %mask : !pto.vreg<64xf32>, !pto.mask -> !pto.vreg<64xf32> - pto.vsts %out, %ub_out[%offset], %mask : !pto.vreg<64xf32>, !pto.ptr, !pto.mask - scf.yield %next_remaining : i32 - } - } - - pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.mte_ub_gm %ub_out, %arg1, %c128_i64 - nburst(%c32_i64, %c128_i64, %c128_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64 - pto.barrier #pto.pipe - return - } -} diff --git a/test/vpto/cases/micro-op/unary-vector/vexp-f32-over-underflow/launch.cpp b/test/vpto/cases/micro-op/unary-vector/vexp-f32-over-underflow/launch.cpp deleted file mode 100644 index 219a407d0..000000000 --- a/test/vpto/cases/micro-op/unary-vector/vexp-f32-over-underflow/launch.cpp +++ /dev/null @@ -1,44 +0,0 @@ -// Copyright (c) 2026 Huawei Technologies Co., Ltd. -// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). -// Please refer to the License for details. You may not use this file except in compliance with the License. -// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -// See LICENSE in the root of the software repository for the full text of the License. 
- -#ifndef __VEC_SCOPE__ -#define __VEC_SCOPE__ -#endif - -#if defined(__CCE_AICORE__) && defined(__NPU_ARCH__) && (__NPU_ARCH__ == 2201) -typedef struct { unsigned char v; } hifloat8_t; -typedef struct { unsigned char v; } float8_e4m3_t; -typedef struct { unsigned char v; } float8_e5m2_t; -typedef struct { unsigned char v; } float8_e8m0_t; -typedef struct { unsigned char v; } float4_e1m2x2_t; -typedef struct { unsigned char v; } float4_e2m1x2_t; -#endif -#include -#if defined(__CCE_AICORE__) && defined(PTOAS_ENABLE_CCE_PRINT) -#include -#endif - -#if !defined(__CCE_AICORE__) && !defined(TMRGSORT_HPP) -struct MrgSortExecutedNumList { - uint16_t mrgSortList0; - uint16_t mrgSortList1; - uint16_t mrgSortList2; - uint16_t mrgSortList3; -}; -#endif -#ifndef __CPU_SIM -#include "acl/acl.h" -#endif - -extern "C" __global__ [aicore] void vexp_f32_over_underflow_kernel_2d(__gm__ float *v1, - __gm__ float *v2); - -void LaunchVexp_f32_exceptional_kernel_2d(float *v1, float *v2, void *stream) { - vexp_f32_over_underflow_kernel_2d<<<1, nullptr, stream>>>((__gm__ float *)v1, - (__gm__ float *)v2); -} diff --git a/test/vpto/cases/micro-op/unary-vector/vexp-f32-over-underflow/main.cpp b/test/vpto/cases/micro-op/unary-vector/vexp-f32-over-underflow/main.cpp deleted file mode 100644 index 2a6824d9f..000000000 --- a/test/vpto/cases/micro-op/unary-vector/vexp-f32-over-underflow/main.cpp +++ /dev/null @@ -1,83 +0,0 @@ -// Copyright (c) 2026 Huawei Technologies Co., Ltd. -// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). -// Please refer to the License for details. You may not use this file except in compliance with the License. -// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. 
-// See LICENSE in the root of the software repository for the full text of the License. - -/** -Copyright (c) 2025 Huawei Technologies Co., Ltd. -*/ - -#include "test_common.h" -#include "acl/acl.h" -#include -#include - -using namespace PtoTestCommon; - -#define ACL_CHECK(expr) \ - do { \ - const aclError _ret = (expr); \ - if (_ret != ACL_SUCCESS) { \ - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", #expr, \ - (int)_ret, __FILE__, __LINE__); \ - rc = 1; \ - goto cleanup; \ - } \ - } while (0) - -void LaunchVexp_f32_exceptional_kernel_2d(float *v1, float *v2, void *stream); - -int main() { - size_t elemCount_v1 = 1024; - size_t fileSize_v1 = elemCount_v1 * sizeof(float); - size_t elemCount_v2 = 1024; - size_t fileSize_v2 = elemCount_v2 * sizeof(float); - float *v1Host = nullptr; - float *v1Device = nullptr; - float *v2Host = nullptr; - float *v2Device = nullptr; - int rc = 0; - bool aclInited = false; - bool deviceSet = false; - int deviceId = 0; - aclrtStream stream = nullptr; - - ACL_CHECK(aclInit(nullptr)); - aclInited = true; - if (const char *envDevice = std::getenv("ACL_DEVICE_ID")) - deviceId = std::atoi(envDevice); - ACL_CHECK(aclrtSetDevice(deviceId)); - deviceSet = true; - ACL_CHECK(aclrtCreateStream(&stream)); - ACL_CHECK(aclrtMallocHost((void **)(&v1Host), fileSize_v1)); - ACL_CHECK(aclrtMallocHost((void **)(&v2Host), fileSize_v2)); - ACL_CHECK(aclrtMalloc((void **)&v1Device, fileSize_v1, ACL_MEM_MALLOC_HUGE_FIRST)); - ACL_CHECK(aclrtMalloc((void **)&v2Device, fileSize_v2, ACL_MEM_MALLOC_HUGE_FIRST)); - ReadFile("./v1.bin", fileSize_v1, v1Host, fileSize_v1); - ReadFile("./v2.bin", fileSize_v2, v2Host, fileSize_v2); - ACL_CHECK(aclrtMemcpy(v1Device, fileSize_v1, v1Host, fileSize_v1, - ACL_MEMCPY_HOST_TO_DEVICE)); - ACL_CHECK(aclrtMemcpy(v2Device, fileSize_v2, v2Host, fileSize_v2, - ACL_MEMCPY_HOST_TO_DEVICE)); - LaunchVexp_f32_exceptional_kernel_2d(v1Device, v2Device, stream); - ACL_CHECK(aclrtSynchronizeStream(stream)); - 
ACL_CHECK(aclrtMemcpy(v2Host, fileSize_v2, v2Device, fileSize_v2, - ACL_MEMCPY_DEVICE_TO_HOST)); - WriteFile("./v2.bin", v2Host, fileSize_v2); - -cleanup: - aclrtFree(v1Device); - aclrtFree(v2Device); - aclrtFreeHost(v1Host); - aclrtFreeHost(v2Host); - if (stream != nullptr) - aclrtDestroyStream(stream); - if (deviceSet) - aclrtResetDevice(deviceId); - if (aclInited) - aclFinalize(); - return rc; -} diff --git a/test/vpto/cases/micro-op/unary-vector/vexp-tail/compare.py b/test/vpto/cases/micro-op/unary-vector/vexp-tail/compare.py deleted file mode 100644 index c13d79273..000000000 --- a/test/vpto/cases/micro-op/unary-vector/vexp-tail/compare.py +++ /dev/null @@ -1,39 +0,0 @@ -#!/usr/bin/python3 -# Copyright (c) 2026 Huawei Technologies Co., Ltd. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. 
- - -import os -import sys -import numpy as np - - -def compare_bin_prefix(golden_path, output_path, dtype, eps, count): - if not os.path.exists(golden_path) or not os.path.exists(output_path): - return False - golden = np.fromfile(golden_path, dtype=dtype, count=count) - output = np.fromfile(output_path, dtype=dtype, count=count) - return golden.shape == output.shape and np.allclose( - golden, output, atol=eps, rtol=eps, equal_nan=True - ) - - -def main(): - strict = os.getenv("COMPARE_STRICT", "1") != "0" - ok = compare_bin_prefix("golden_v2.bin", "v2.bin", np.float32, 1e-4, 1000) - if not ok: - if strict: - print("[ERROR] compare failed") - sys.exit(2) - print("[WARN] compare failed (non-gating)") - return - print("[INFO] compare passed") - - -if __name__ == "__main__": - main() diff --git a/test/vpto/cases/micro-op/unary-vector/vexp-tail/golden.py b/test/vpto/cases/micro-op/unary-vector/vexp-tail/golden.py deleted file mode 100644 index b77b49528..000000000 --- a/test/vpto/cases/micro-op/unary-vector/vexp-tail/golden.py +++ /dev/null @@ -1,49 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) 2026 Huawei Technologies Co., Ltd. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. 
- -# coding=utf-8 - -import argparse -from pathlib import Path - -import numpy as np - - -ROWS = 32 -COLS = 32 -SEED = 19 -LOGICAL_ELEMS = 1000 -OUT_SENTINEL = np.float32(-123.25) - - -def generate(output_dir: Path, seed: int) -> None: - rng = np.random.default_rng(seed) - v1 = rng.uniform(-4.0, 4.0, size=(ROWS, COLS)).astype(np.float32) - v2 = np.full((ROWS, COLS), OUT_SENTINEL, dtype=np.float32) - golden_v2 = np.full((ROWS, COLS), OUT_SENTINEL, dtype=np.float32) - golden_v2.reshape(-1)[:LOGICAL_ELEMS] = np.exp( - v1.reshape(-1)[:LOGICAL_ELEMS] - ).astype(np.float32, copy=False) - - output_dir.mkdir(parents=True, exist_ok=True) - v1.reshape(-1).tofile(output_dir / "v1.bin") - v2.reshape(-1).tofile(output_dir / "v2.bin") - golden_v2.reshape(-1).tofile(output_dir / "golden_v2.bin") - - -def main() -> None: - parser = argparse.ArgumentParser() - parser.add_argument("--output-dir", type=Path, default=Path(".")) - parser.add_argument("--seed", type=int, default=SEED) - args = parser.parse_args() - generate(args.output_dir, args.seed) - - -if __name__ == "__main__": - main() diff --git a/test/vpto/cases/micro-op/unary-vector/vexp-tail/kernel.pto b/test/vpto/cases/micro-op/unary-vector/vexp-tail/kernel.pto deleted file mode 100644 index d8719f305..000000000 --- a/test/vpto/cases/micro-op/unary-vector/vexp-tail/kernel.pto +++ /dev/null @@ -1,42 +0,0 @@ -module attributes {pto.target_arch = "a5", pto.kernel_kind = #pto.kernel_kind} { - func.func @vexp_tail_kernel_2d(%arg0: !pto.ptr, %arg1: !pto.ptr) attributes {pto.kernel} { - %c0 = arith.constant 0 : index - %c64 = arith.constant 64 : index - %c1024 = arith.constant 1024 : index - %c0_i64 = arith.constant 0 : i64 - %c1_i64 = arith.constant 1 : i64 - %c32_i64 = arith.constant 32 : i64 - %c128_i64 = arith.constant 128 : i64 - %c4096_i64 = arith.constant 4096 : i64 - %c1000_i32 = arith.constant 1000 : i32 - - %ub_in = pto.castptr %c0_i64 : i64 -> !pto.ptr - %ub_out = pto.castptr %c4096_i64 : i64 -> !pto.ptr - - %false = 
arith.constant false - pto.mte_gm_ub %arg0, %ub_in, %c0_i64, %c128_i64 - nburst(%c32_i64, %c128_i64, %c128_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 - - pto.set_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] - pto.wait_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] - - pto.vecscope { - %_:1 = scf.for %offset = %c0 to %c1024 step %c64 iter_args(%remaining = %c1000_i32) -> (i32) { - %mask, %next_remaining = pto.plt_b32 %remaining : i32 -> !pto.mask, i32 - %vec = pto.vlds %ub_in[%offset] : !pto.ptr -> !pto.vreg<64xf32> - %out = pto.vexp %vec, %mask : !pto.vreg<64xf32>, !pto.mask -> !pto.vreg<64xf32> - pto.vsts %out, %ub_out[%offset], %mask : !pto.vreg<64xf32>, !pto.ptr, !pto.mask - scf.yield %next_remaining : i32 - } - } - - pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.mte_ub_gm %ub_out, %arg1, %c128_i64 - nburst(%c32_i64, %c128_i64, %c128_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64 - pto.barrier #pto.pipe - return - } -} diff --git a/test/vpto/cases/micro-op/unary-vector/vexp-tail/launch.cpp b/test/vpto/cases/micro-op/unary-vector/vexp-tail/launch.cpp deleted file mode 100644 index 723fee5d5..000000000 --- a/test/vpto/cases/micro-op/unary-vector/vexp-tail/launch.cpp +++ /dev/null @@ -1,43 +0,0 @@ -// Copyright (c) 2026 Huawei Technologies Co., Ltd. -// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). -// Please refer to the License for details. You may not use this file except in compliance with the License. -// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -// See LICENSE in the root of the software repository for the full text of the License. 
- -#ifndef __VEC_SCOPE__ -#define __VEC_SCOPE__ -#endif - -#if defined(__CCE_AICORE__) && defined(__NPU_ARCH__) && (__NPU_ARCH__ == 2201) -typedef struct { unsigned char v; } hifloat8_t; -typedef struct { unsigned char v; } float8_e4m3_t; -typedef struct { unsigned char v; } float8_e5m2_t; -typedef struct { unsigned char v; } float8_e8m0_t; -typedef struct { unsigned char v; } float4_e1m2x2_t; -typedef struct { unsigned char v; } float4_e2m1x2_t; -#endif -#include -#if defined(__CCE_AICORE__) && defined(PTOAS_ENABLE_CCE_PRINT) -#include -#endif - -#if !defined(__CCE_AICORE__) && !defined(TMRGSORT_HPP) -struct MrgSortExecutedNumList { - uint16_t mrgSortList0; - uint16_t mrgSortList1; - uint16_t mrgSortList2; - uint16_t mrgSortList3; -}; -#endif -#ifndef __CPU_SIM -#include "acl/acl.h" -#endif - -extern "C" __global__ [aicore] void vexp_tail_kernel_2d(__gm__ float *v1, - __gm__ float *v2); - -void LaunchVexp_tail_kernel_2d(float *v1, float *v2, void *stream) { - vexp_tail_kernel_2d<<<1, nullptr, stream>>>((__gm__ float *)v1, (__gm__ float *)v2); -} diff --git a/test/vpto/cases/micro-op/unary-vector/vexp-tail/main.cpp b/test/vpto/cases/micro-op/unary-vector/vexp-tail/main.cpp deleted file mode 100644 index 19f1b06f2..000000000 --- a/test/vpto/cases/micro-op/unary-vector/vexp-tail/main.cpp +++ /dev/null @@ -1,83 +0,0 @@ -// Copyright (c) 2026 Huawei Technologies Co., Ltd. -// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). -// Please refer to the License for details. You may not use this file except in compliance with the License. -// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -// See LICENSE in the root of the software repository for the full text of the License. 
- -/** -Copyright (c) 2025 Huawei Technologies Co., Ltd. -*/ - -#include "test_common.h" -#include "acl/acl.h" -#include -#include - -using namespace PtoTestCommon; - -#define ACL_CHECK(expr) \ - do { \ - const aclError _ret = (expr); \ - if (_ret != ACL_SUCCESS) { \ - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", #expr, \ - (int)_ret, __FILE__, __LINE__); \ - rc = 1; \ - goto cleanup; \ - } \ - } while (0) - -void LaunchVexp_tail_kernel_2d(float *v1, float *v2, void *stream); - -int main() { - size_t elemCount_v1 = 1024; - size_t fileSize_v1 = elemCount_v1 * sizeof(float); - size_t elemCount_v2 = 1024; - size_t fileSize_v2 = elemCount_v2 * sizeof(float); - float *v1Host = nullptr; - float *v1Device = nullptr; - float *v2Host = nullptr; - float *v2Device = nullptr; - int rc = 0; - bool aclInited = false; - bool deviceSet = false; - int deviceId = 0; - aclrtStream stream = nullptr; - - ACL_CHECK(aclInit(nullptr)); - aclInited = true; - if (const char *envDevice = std::getenv("ACL_DEVICE_ID")) - deviceId = std::atoi(envDevice); - ACL_CHECK(aclrtSetDevice(deviceId)); - deviceSet = true; - ACL_CHECK(aclrtCreateStream(&stream)); - ACL_CHECK(aclrtMallocHost((void **)(&v1Host), fileSize_v1)); - ACL_CHECK(aclrtMallocHost((void **)(&v2Host), fileSize_v2)); - ACL_CHECK(aclrtMalloc((void **)&v1Device, fileSize_v1, ACL_MEM_MALLOC_HUGE_FIRST)); - ACL_CHECK(aclrtMalloc((void **)&v2Device, fileSize_v2, ACL_MEM_MALLOC_HUGE_FIRST)); - ReadFile("./v1.bin", fileSize_v1, v1Host, fileSize_v1); - ReadFile("./v2.bin", fileSize_v2, v2Host, fileSize_v2); - ACL_CHECK(aclrtMemcpy(v1Device, fileSize_v1, v1Host, fileSize_v1, - ACL_MEMCPY_HOST_TO_DEVICE)); - ACL_CHECK(aclrtMemcpy(v2Device, fileSize_v2, v2Host, fileSize_v2, - ACL_MEMCPY_HOST_TO_DEVICE)); - LaunchVexp_tail_kernel_2d(v1Device, v2Device, stream); - ACL_CHECK(aclrtSynchronizeStream(stream)); - ACL_CHECK(aclrtMemcpy(v2Host, fileSize_v2, v2Device, fileSize_v2, - ACL_MEMCPY_DEVICE_TO_HOST)); - WriteFile("./v2.bin", v2Host, 
fileSize_v2); - -cleanup: - aclrtFree(v1Device); - aclrtFree(v2Device); - aclrtFreeHost(v1Host); - aclrtFreeHost(v2Host); - if (stream != nullptr) - aclrtDestroyStream(stream); - if (deviceSet) - aclrtResetDevice(deviceId); - if (aclInited) - aclFinalize(); - return rc; -} diff --git a/test/vpto/cases/micro-op/unary-vector/vexp/kernel.pto b/test/vpto/cases/micro-op/unary-vector/vexp/kernel.pto index b724b268c..a6dfd2bbc 100644 --- a/test/vpto/cases/micro-op/unary-vector/vexp/kernel.pto +++ b/test/vpto/cases/micro-op/unary-vector/vexp/kernel.pto @@ -1,62 +1,216 @@ -// ============================================================================= -// abs_kernel_2d: Element-wise absolute value on a 32x32 f32 tile -// ============================================================================= -// This kernel computes abs(input) for a 32x32 float32 matrix (1024 elements). -// -// Memory Layout: -// - Input: arg0 -> GM (Global Memory) -// - Output: arg1 -> GM (Global Memory) -// - UB (Unified Buffer) at offset 0: input tile (4096 bytes = 32*32*4) -// - UB at offset 4096: output tile (4096 bytes = 32*32*4) -// -// Pipeline: -// 1. DMA: GM -> UB (MTE2 pipe) - copy input tile to UB -// 2. Sync: wait for MTE2 -> V pipe handoff -// 3. Compute: vabs on 64-element vectors (V pipe) - 16 iterations for 1024 elements -// 4. Sync: wait for V -> MTE3 pipe handoff -// 5. 
DMA: UB -> GM (MTE3 pipe) - copy result tile back to GM -// ============================================================================= - module attributes {pto.target_arch = "a5", pto.kernel_kind = #pto.kernel_kind} { - func.func @vexp_kernel_2d(%arg0: !pto.ptr, %arg1: !pto.ptr) attributes {pto.kernel} { - %c0 = arith.constant 0 : index - %c1 = arith.constant 1 : index - %c64 = arith.constant 64 : index - %c1024 = arith.constant 1024 : index - %c0_i64 = arith.constant 0 : i64 - %c1_i64 = arith.constant 1 : i64 - %c32_i64 = arith.constant 32 : i64 - %c128_i64 = arith.constant 128 : i64 - %c4096_i64 = arith.constant 4096 : i64 - %c1024_i32 = arith.constant 1024 : i32 - - %ub_in = pto.castptr %c0_i64 : i64 -> !pto.ptr - %ub_out = pto.castptr %c4096_i64 : i64 -> !pto.ptr - - %false = arith.constant false - pto.mte_gm_ub %arg0, %ub_in, %c0_i64, %c128_i64 - nburst(%c32_i64, %c128_i64, %c128_i64) + func.func @vexp_deep_merged_kernel(%arg0: !pto.ptr, %arg1: !pto.ptr, %arg2: !pto.ptr, %arg3: !pto.ptr, %arg4: !pto.ptr, %arg5: !pto.ptr, %arg6: !pto.ptr, %arg7: !pto.ptr, %arg8: !pto.ptr, %arg9: !pto.ptr) attributes {pto.kernel} { + %__deep_merge_guard = arith.constant false + // inactive merged from vexp_f16_kernel_2d + scf.if %__deep_merge_guard { + + %c0_m0 = arith.constant 0 : index + %c1_m0 = arith.constant 1 : index + %c128_m0 = arith.constant 128 : index + %c1024_m0 = arith.constant 1024 : index + %c0_i64_m0 = arith.constant 0 : i64 + %c1_i64_m0 = arith.constant 1 : i64 + %c32_i64_m0 = arith.constant 32 : i64 + %c128_i64_m0 = arith.constant 128 : i64 + %c2048_i64_m0 = arith.constant 2048 : i64 + %c1024_i32_m0 = arith.constant 1024 : i32 + + %ub_in_m0 = pto.castptr %c0_i64_m0 : i64 -> !pto.ptr + %ub_out_m0 = pto.castptr %c2048_i64_m0 : i64 -> !pto.ptr + + %false_m0 = arith.constant false + pto.mte_gm_ub %arg0, %ub_in_m0, %c0_i64_m0, %c128_i64_m0 + nburst(%c32_i64_m0, %c128_i64_m0, %c128_i64_m0) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 + + 
pto.set_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] + pto.wait_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] + + pto.vecscope { + %__m0:1 = scf.for %offset_m0 = %c0_m0 to %c1024_m0 step %c128_m0 iter_args(%remaining_m0 = %c1024_i32_m0) -> (i32) { + %mask_m0, %next_remaining_m0 = pto.plt_b16 %remaining_m0 : i32 -> !pto.mask, i32 + %vec_m0 = pto.vlds %ub_in_m0[%offset_m0] : !pto.ptr -> !pto.vreg<128xf16> + %out_m0 = pto.vexp %vec_m0, %mask_m0 : !pto.vreg<128xf16>, !pto.mask -> !pto.vreg<128xf16> + pto.vsts %out_m0, %ub_out_m0[%offset_m0], %mask_m0 : !pto.vreg<128xf16>, !pto.ptr, !pto.mask + scf.yield %next_remaining_m0 : i32 + } + } + + pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] + pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] + pto.mte_ub_gm %ub_out_m0, %arg1, %c128_i64_m0 + nburst(%c32_i64_m0, %c128_i64_m0, %c128_i64_m0) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64 + pto.barrier #pto.pipe + + } + // inactive merged from vexp_f32_exceptional_kernel_2d + scf.if %__deep_merge_guard { + + %c0_m1 = arith.constant 0 : index + %c64_m1 = arith.constant 64 : index + %c1024_m1 = arith.constant 1024 : index + %c0_i64_m1 = arith.constant 0 : i64 + %c1_i64_m1 = arith.constant 1 : i64 + %c32_i64_m1 = arith.constant 32 : i64 + %c128_i64_m1 = arith.constant 128 : i64 + %c4096_i64_m1 = arith.constant 4096 : i64 + %c1024_i32_m1 = arith.constant 1024 : i32 + + %ub_in_m1 = pto.castptr %c0_i64_m1 : i64 -> !pto.ptr + %ub_out_m1 = pto.castptr %c4096_i64_m1 : i64 -> !pto.ptr + + %false_m1 = arith.constant false + pto.mte_gm_ub %arg2, %ub_in_m1, %c0_i64_m1, %c128_i64_m1 + nburst(%c32_i64_m1, %c128_i64_m1, %c128_i64_m1) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 + + pto.set_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] + pto.wait_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] + + pto.vecscope { + %__m1:1 = scf.for %offset_m1 = %c0_m1 to %c1024_m1 step %c64_m1 iter_args(%remaining_m1 = %c1024_i32_m1) -> (i32) { + %mask_m1, %next_remaining_m1 = pto.plt_b32 %remaining_m1 : i32 -> !pto.mask, i32 + 
%vec_m1 = pto.vlds %ub_in_m1[%offset_m1] : !pto.ptr -> !pto.vreg<64xf32> + %out_m1 = pto.vexp %vec_m1, %mask_m1 : !pto.vreg<64xf32>, !pto.mask -> !pto.vreg<64xf32> + pto.vsts %out_m1, %ub_out_m1[%offset_m1], %mask_m1 : !pto.vreg<64xf32>, !pto.ptr, !pto.mask + scf.yield %next_remaining_m1 : i32 + } + } + + pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] + pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] + pto.mte_ub_gm %ub_out_m1, %arg3, %c128_i64_m1 + nburst(%c32_i64_m1, %c128_i64_m1, %c128_i64_m1) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64 + pto.barrier #pto.pipe + + } + // inactive merged from vexp_f32_over_underflow_kernel_2d + scf.if %__deep_merge_guard { + + %c0_m2 = arith.constant 0 : index + %c64_m2 = arith.constant 64 : index + %c1024_m2 = arith.constant 1024 : index + %c0_i64_m2 = arith.constant 0 : i64 + %c1_i64_m2 = arith.constant 1 : i64 + %c32_i64_m2 = arith.constant 32 : i64 + %c128_i64_m2 = arith.constant 128 : i64 + %c4096_i64_m2 = arith.constant 4096 : i64 + %c1024_i32_m2 = arith.constant 1024 : i32 + + %ub_in_m2 = pto.castptr %c0_i64_m2 : i64 -> !pto.ptr + %ub_out_m2 = pto.castptr %c4096_i64_m2 : i64 -> !pto.ptr + + %false_m2 = arith.constant false + pto.mte_gm_ub %arg4, %ub_in_m2, %c0_i64_m2, %c128_i64_m2 + nburst(%c32_i64_m2, %c128_i64_m2, %c128_i64_m2) : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 pto.set_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] pto.wait_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] pto.vecscope { - %_:1 = scf.for %offset = %c0 to %c1024 step %c64 iter_args(%remaining = %c1024_i32) -> (i32) { - %mask, %next_remaining = pto.plt_b32 %remaining : i32 -> !pto.mask, i32 - %vec = pto.vlds %ub_in[%offset] : !pto.ptr -> !pto.vreg<64xf32> - %out = pto.vexp %vec, %mask : !pto.vreg<64xf32>, !pto.mask -> !pto.vreg<64xf32> - pto.vsts %out, %ub_out[%offset], %mask : !pto.vreg<64xf32>, !pto.ptr, !pto.mask - scf.yield %next_remaining : i32 + %__m2:1 = scf.for %offset_m2 = %c0_m2 to %c1024_m2 step %c64_m2 iter_args(%remaining_m2 = %c1024_i32_m2) 
-> (i32) { + %mask_m2, %next_remaining_m2 = pto.plt_b32 %remaining_m2 : i32 -> !pto.mask, i32 + %vec_m2 = pto.vlds %ub_in_m2[%offset_m2] : !pto.ptr -> !pto.vreg<64xf32> + %out_m2 = pto.vexp %vec_m2, %mask_m2 : !pto.vreg<64xf32>, !pto.mask -> !pto.vreg<64xf32> + pto.vsts %out_m2, %ub_out_m2[%offset_m2], %mask_m2 : !pto.vreg<64xf32>, !pto.ptr, !pto.mask + scf.yield %next_remaining_m2 : i32 } } pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.mte_ub_gm %ub_out, %arg1, %c128_i64 - nburst(%c32_i64, %c128_i64, %c128_i64) + pto.mte_ub_gm %ub_out_m2, %arg5, %c128_i64_m2 + nburst(%c32_i64_m2, %c128_i64_m2, %c128_i64_m2) : !pto.ptr, !pto.ptr, i64, i64, i64, i64 pto.barrier #pto.pipe + + } + // inactive merged from vexp_tail_kernel_2d + scf.if %__deep_merge_guard { + + %c0_m3 = arith.constant 0 : index + %c64_m3 = arith.constant 64 : index + %c1024_m3 = arith.constant 1024 : index + %c0_i64_m3 = arith.constant 0 : i64 + %c1_i64_m3 = arith.constant 1 : i64 + %c32_i64_m3 = arith.constant 32 : i64 + %c128_i64_m3 = arith.constant 128 : i64 + %c4096_i64_m3 = arith.constant 4096 : i64 + %c1000_i32_m3 = arith.constant 1000 : i32 + + %ub_in_m3 = pto.castptr %c0_i64_m3 : i64 -> !pto.ptr + %ub_out_m3 = pto.castptr %c4096_i64_m3 : i64 -> !pto.ptr + + %false_m3 = arith.constant false + pto.mte_gm_ub %arg6, %ub_in_m3, %c0_i64_m3, %c128_i64_m3 + nburst(%c32_i64_m3, %c128_i64_m3, %c128_i64_m3) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 + + pto.set_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] + pto.wait_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] + + pto.vecscope { + %__m3:1 = scf.for %offset_m3 = %c0_m3 to %c1024_m3 step %c64_m3 iter_args(%remaining_m3 = %c1000_i32_m3) -> (i32) { + %mask_m3, %next_remaining_m3 = pto.plt_b32 %remaining_m3 : i32 -> !pto.mask, i32 + %vec_m3 = pto.vlds %ub_in_m3[%offset_m3] : !pto.ptr -> !pto.vreg<64xf32> + %out_m3 = pto.vexp %vec_m3, %mask_m3 : !pto.vreg<64xf32>, !pto.mask -> !pto.vreg<64xf32> + pto.vsts 
%out_m3, %ub_out_m3[%offset_m3], %mask_m3 : !pto.vreg<64xf32>, !pto.ptr, !pto.mask + scf.yield %next_remaining_m3 : i32 + } + } + + pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] + pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] + pto.mte_ub_gm %ub_out_m3, %arg7, %c128_i64_m3 + nburst(%c32_i64_m3, %c128_i64_m3, %c128_i64_m3) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64 + pto.barrier #pto.pipe + + } + // active merged from vexp_kernel_2d + + %c0_m4 = arith.constant 0 : index + %c1_m4 = arith.constant 1 : index + %c64_m4 = arith.constant 64 : index + %c1024_m4 = arith.constant 1024 : index + %c0_i64_m4 = arith.constant 0 : i64 + %c1_i64_m4 = arith.constant 1 : i64 + %c32_i64_m4 = arith.constant 32 : i64 + %c128_i64_m4 = arith.constant 128 : i64 + %c4096_i64_m4 = arith.constant 4096 : i64 + %c1024_i32_m4 = arith.constant 1024 : i32 + + %ub_in_m4 = pto.castptr %c0_i64_m4 : i64 -> !pto.ptr + %ub_out_m4 = pto.castptr %c4096_i64_m4 : i64 -> !pto.ptr + + %false_m4 = arith.constant false + pto.mte_gm_ub %arg8, %ub_in_m4, %c0_i64_m4, %c128_i64_m4 + nburst(%c32_i64_m4, %c128_i64_m4, %c128_i64_m4) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 + + pto.set_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] + pto.wait_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] + + pto.vecscope { + %__m4:1 = scf.for %offset_m4 = %c0_m4 to %c1024_m4 step %c64_m4 iter_args(%remaining_m4 = %c1024_i32_m4) -> (i32) { + %mask_m4, %next_remaining_m4 = pto.plt_b32 %remaining_m4 : i32 -> !pto.mask, i32 + %vec_m4 = pto.vlds %ub_in_m4[%offset_m4] : !pto.ptr -> !pto.vreg<64xf32> + %out_m4 = pto.vexp %vec_m4, %mask_m4 : !pto.vreg<64xf32>, !pto.mask -> !pto.vreg<64xf32> + pto.vsts %out_m4, %ub_out_m4[%offset_m4], %mask_m4 : !pto.vreg<64xf32>, !pto.ptr, !pto.mask + scf.yield %next_remaining_m4 : i32 + } + } + + pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] + pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] + pto.mte_ub_gm %ub_out_m4, %arg9, %c128_i64_m4 + nburst(%c32_i64_m4, %c128_i64_m4, %c128_i64_m4) + : !pto.ptr, 
!pto.ptr, i64, i64, i64, i64 + pto.barrier #pto.pipe + return } } diff --git a/test/vpto/cases/micro-op/unary-vector/vexp/launch.cpp b/test/vpto/cases/micro-op/unary-vector/vexp/launch.cpp index b6d8cdbf0..6728fb2da 100644 --- a/test/vpto/cases/micro-op/unary-vector/vexp/launch.cpp +++ b/test/vpto/cases/micro-op/unary-vector/vexp/launch.cpp @@ -5,22 +5,9 @@ // THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, // INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. // See LICENSE in the root of the software repository for the full text of the License. - -// --------------------------------------------------------------------------- -// PTOAS compatibility layer -// -// The upstream pto-isa headers reference some FP8/FP4 types and the -// __VEC_SCOPE__ marker that are not available on every AICore arch/toolchain -// combination (e.g. __NPU_ARCH__==2201). -// -// For our PTOAS-generated kernels we don't rely on these types today, but the -// headers still mention them in templates/static_asserts. Provide minimal -// fallbacks to keep compilation working on dav-c220. -// --------------------------------------------------------------------------- #ifndef __VEC_SCOPE__ #define __VEC_SCOPE__ #endif - #if defined(__CCE_AICORE__) && defined(__NPU_ARCH__) && (__NPU_ARCH__ == 2201) typedef struct { unsigned char v; } hifloat8_t; typedef struct { unsigned char v; } float8_e4m3_t; @@ -30,33 +17,38 @@ typedef struct { unsigned char v; } float4_e1m2x2_t; typedef struct { unsigned char v; } float4_e2m1x2_t; #endif #include - -// AICore printf support is gated behind `--cce-enable-print` on some -// toolchains. When enabled, include the CCE print header so `cce::printf` -// resolves in device compilation. 
#if defined(__CCE_AICORE__) && defined(PTOAS_ENABLE_CCE_PRINT) #include #endif - -// Some PTO-ISA types are only available in the __CCE_AICORE__ compilation -// path, but `bisheng -xcce` still performs a host-side parse pass. -// Provide minimal fallbacks only when the corresponding header wasn't -// pulled in by the selected arch implementation. #if !defined(__CCE_AICORE__) && !defined(TMRGSORT_HPP) -struct MrgSortExecutedNumList { - uint16_t mrgSortList0; - uint16_t mrgSortList1; - uint16_t mrgSortList2; - uint16_t mrgSortList3; -}; +struct MrgSortExecutedNumList { uint16_t mrgSortList0,mrgSortList1,mrgSortList2,mrgSortList3; }; #endif #ifndef __CPU_SIM #include "acl/acl.h" #endif -extern "C" __global__ [aicore] void vexp_kernel_2d(__gm__ float *v1, - __gm__ float *v2); +extern "C" __global__ [aicore] void vexp_deep_merged_kernel( + __gm__ half * arg0, + __gm__ half * arg1, + __gm__ float * arg2, + __gm__ float * arg3, + __gm__ float * arg4, + __gm__ float * arg5, + __gm__ float * arg6, + __gm__ float * arg7, + __gm__ float * arg8, + __gm__ float * arg9); -void LaunchVexp_kernel_2d(float *v1, float *v2, void *stream) { - vexp_kernel_2d<<<1, nullptr, stream>>>((__gm__ float *)v1, (__gm__ float *)v2); +void LaunchVexpDeepMerged(float * p0, float * p1, void *stream) { + vexp_deep_merged_kernel<<<1, nullptr, stream>>>( + (__gm__ half *)p0, + (__gm__ half *)p0, + (__gm__ float *)p0, + (__gm__ float *)p0, + (__gm__ float *)p0, + (__gm__ float *)p0, + (__gm__ float *)p0, + (__gm__ float *)p0, + (__gm__ float *)p0, + (__gm__ float *)p1); } diff --git a/test/vpto/cases/micro-op/unary-vector/vexp/main.cpp b/test/vpto/cases/micro-op/unary-vector/vexp/main.cpp index f864622ca..37ed907ff 100644 --- a/test/vpto/cases/micro-op/unary-vector/vexp/main.cpp +++ b/test/vpto/cases/micro-op/unary-vector/vexp/main.cpp @@ -47,8 +47,8 @@ struct MrgSortExecutedNumList { } \ } while (0) -void LaunchVexp_kernel_2d(float *v1, float *v2, void *stream); +void LaunchVexpDeepMerged(float * p0, 
float * p1, void *stream); int main() { size_t elemCount_v1 = 1024; size_t fileSize_v1 = elemCount_v1 * sizeof(float); @@ -83,7 +83,7 @@ int main() { ReadFile("./v2.bin", fileSize_v2, v2Host, fileSize_v2); ACL_CHECK(aclrtMemcpy(v1Device, fileSize_v1, v1Host, fileSize_v1, ACL_MEMCPY_HOST_TO_DEVICE)); ACL_CHECK(aclrtMemcpy(v2Device, fileSize_v2, v2Host, fileSize_v2, ACL_MEMCPY_HOST_TO_DEVICE)); - LaunchVexp_kernel_2d(v1Device, v2Device, stream); + LaunchVexpDeepMerged(v1Device, v2Device, stream); ACL_CHECK(aclrtSynchronizeStream(stream)); ACL_CHECK(aclrtMemcpy(v2Host, fileSize_v2, v2Device, fileSize_v2, ACL_MEMCPY_DEVICE_TO_HOST)); diff --git a/test/vpto/cases/micro-op/unary-vector/vln-domain-boundary/compare.py b/test/vpto/cases/micro-op/unary-vector/vln-domain-boundary/compare.py deleted file mode 100755 index afee62a98..000000000 --- a/test/vpto/cases/micro-op/unary-vector/vln-domain-boundary/compare.py +++ /dev/null @@ -1,209 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) 2026 Huawei Technologies Co., Ltd. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. - -# case: micro-op/unary-vector/vln-domain-boundary -# family: unary-vector -# target_ops: pto.vln -# scenarios: core-f32, domain-positive, exceptional-values -# NOTE: bulk-generated coverage skeleton. 
-# coding=utf-8 - -import os -import sys -import numpy as np - - -REPEAT_BYTES = 256 - - -def _ceil_div(x, y): - return (x + y - 1) // y - - -def _packed_pred_storage_bytes(logical_elems, src_elem_bytes): - logical_elems = int(logical_elems) - src_elem_bytes = int(src_elem_bytes) - if logical_elems <= 0: - raise ValueError(f"logical_elems must be > 0, got {logical_elems}") - if src_elem_bytes not in (1, 2, 4): - raise ValueError(f"unsupported packed predicate source size: {src_elem_bytes}") - - repeat_elems = REPEAT_BYTES // src_elem_bytes - if src_elem_bytes == 4: - repeat_times = _ceil_div(logical_elems, repeat_elems) + 1 - loop_count = repeat_times // 2 - return loop_count * 16 - - repeat_times = _ceil_div(logical_elems, repeat_elems) - return repeat_times * (repeat_elems // 8) - - -def compare_bin(golden_path, output_path, dtype, eps): - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - dtype_np = np.dtype(dtype) - golden = np.fromfile(golden_path, dtype=dtype_np) - output = np.fromfile(output_path, dtype=dtype_np) - if golden.shape != output.shape: - print(f"[ERROR] Shape mismatch: {golden_path} {golden.shape} vs {output_path} {output.shape}") - return False - if not np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True): - if golden.size: - if np.issubdtype(dtype_np, np.floating): - g = golden.astype(np.float64, copy=False) - o = output.astype(np.float64, copy=False) - elif np.issubdtype(dtype_np, np.integer) or np.issubdtype(dtype_np, np.unsignedinteger): - g = golden.astype(np.int64, copy=False) - o = output.astype(np.int64, copy=False) - else: - g = golden.astype(np.float64, copy=False) - o = output.astype(np.float64, copy=False) - abs_diff = np.abs(g - o) - idx = int(np.argmax(abs_diff)) - diff = float(abs_diff[idx]) - print( - f"[ERROR] Mismatch: {golden_path} vs {output_path}, max 
diff={diff} at idx={idx} " - f"(golden={g[idx]}, out={o[idx]}, dtype={dtype_np})" - ) - else: - print(f"[ERROR] Mismatch: {golden_path} vs {output_path}, empty buffers, dtype={dtype_np}") - return False - return True - - -def compare_bin_prefix(golden_path, output_path, dtype, eps, count): - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - try: - count = int(count) - except Exception: - print(f"[ERROR] Invalid prefix count: {count}") - return False - if count <= 0: - print(f"[ERROR] Invalid prefix count: {count}") - return False - - dtype_np = np.dtype(dtype) - golden = np.fromfile(golden_path, dtype=dtype_np, count=count) - output = np.fromfile(output_path, dtype=dtype_np, count=count) - - if golden.size != count or output.size != count: - print( - f"[ERROR] Prefix read too small: need={count} elems, " - f"golden={golden.size}, out={output.size}" - ) - return False - - if not np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True): - if golden.size: - if np.issubdtype(dtype_np, np.floating): - g = golden.astype(np.float64, copy=False) - o = output.astype(np.float64, copy=False) - elif np.issubdtype(dtype_np, np.integer) or np.issubdtype(dtype_np, np.unsignedinteger): - g = golden.astype(np.int64, copy=False) - o = output.astype(np.int64, copy=False) - else: - g = golden.astype(np.float64, copy=False) - o = output.astype(np.float64, copy=False) - abs_diff = np.abs(g - o) - idx = int(np.argmax(abs_diff)) - diff = float(abs_diff[idx]) - print( - f"[ERROR] Mismatch (prefix): {golden_path} vs {output_path}, max diff={diff} at idx={idx} " - f"(golden={g[idx]}, out={o[idx]}, dtype={dtype_np}, count={count})" - ) - else: - print(f"[ERROR] Mismatch (prefix): {golden_path} vs {output_path}, empty buffers, dtype={dtype_np}") - return False - return True - - -def compare_packed_pred_mask(golden_path, output_path, 
logical_elems, src_elem_bytes): - """ - Compare outputs of pto.tcmp / pto.tcmps. - - PTO-ISA stores packed predicate results as a linear PK byte stream via - `psts`, with the exact written prefix length determined by the typed - TCMP/TCMPS repeat schedule. Compare only that semantic prefix. - """ - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - try: - logical_elems = int(logical_elems) - src_elem_bytes = int(src_elem_bytes) - except Exception: - print( - "[ERROR] Invalid packed mask compare arguments: " - f"logical_elems={logical_elems} src_elem_bytes={src_elem_bytes}" - ) - return False - if logical_elems <= 0 or src_elem_bytes <= 0: - print( - "[ERROR] Invalid packed mask compare arguments: " - f"logical_elems={logical_elems} src_elem_bytes={src_elem_bytes}" - ) - return False - - golden = np.fromfile(golden_path, dtype=np.uint8) - output = np.fromfile(output_path, dtype=np.uint8) - try: - prefix_bytes = _packed_pred_storage_bytes(logical_elems, src_elem_bytes) - except ValueError as exc: - print(f"[ERROR] {exc}") - return False - - if golden.size < prefix_bytes or output.size < prefix_bytes: - print( - f"[ERROR] Packed mask buffer too small: need={prefix_bytes} bytes, " - f"golden={golden.size}, out={output.size}" - ) - return False - - golden_sel = golden[:prefix_bytes] - output_sel = output[:prefix_bytes] - - if not np.array_equal(golden_sel, output_sel): - diff = np.nonzero(golden_sel != output_sel)[0] - idx = int(diff[0]) if diff.size else 0 - print( - f"[ERROR] Mismatch (packed mask): {golden_path} vs {output_path}, first diff at idx={idx} " - f"(golden={int(golden_sel[idx])}, out={int(output_sel[idx])}, " - f"logical_elems={logical_elems}, src_elem_bytes={src_elem_bytes}, prefix_bytes={prefix_bytes})" - ) - return False - return True - - -def main(): - strict = os.getenv("COMPARE_STRICT", "1") != "0" 
- ok = True - ok = compare_bin("golden_v2.bin", "v2.bin", np.float32, 0.0001) and ok - if not ok: - if strict: - print("[ERROR] compare failed") - sys.exit(2) - print("[WARN] compare failed (non-gating)") - return - print("[INFO] compare passed") - - -if __name__ == "__main__": - main() diff --git a/test/vpto/cases/micro-op/unary-vector/vln-domain-boundary/golden.py b/test/vpto/cases/micro-op/unary-vector/vln-domain-boundary/golden.py deleted file mode 100755 index 64f82ec2d..000000000 --- a/test/vpto/cases/micro-op/unary-vector/vln-domain-boundary/golden.py +++ /dev/null @@ -1,65 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) 2026 Huawei Technologies Co., Ltd. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. - -# case: micro-op/unary-vector/vln-domain-boundary -# family: unary-vector -# target_ops: pto.vln -# scenarios: core-f32, domain-positive, exceptional-values -# NOTE: bulk-generated coverage skeleton. 
-# coding=utf-8 - -import argparse -from pathlib import Path - -import numpy as np - - -ROWS = 32 -COLS = 32 -SEED = 19 - - -def generate(output_dir: Path, seed: int) -> None: - rng = np.random.default_rng(seed) - v1 = rng.uniform(0.125, 8.0, size=(ROWS, COLS)).astype(np.float32) - flat = v1.reshape(-1) - flat[:8] = np.array( - [ - np.float32(np.finfo(np.float32).tiny), - np.float32(np.finfo(np.float32).tiny * 2.0), - np.float32(1.0), - np.float32(2.0), - np.float32(16.0), - np.float32(1024.0), - np.float32(np.finfo(np.float32).max), - np.float32(0.5), - ], - dtype=np.float32, - ) - v2 = np.zeros((ROWS, COLS), dtype=np.float32) - golden_v2 = np.log(v1).astype(np.float32, copy=False) - - output_dir.mkdir(parents=True, exist_ok=True) - v1.reshape(-1).tofile(output_dir / "v1.bin") - v2.reshape(-1).tofile(output_dir / "v2.bin") - golden_v2.reshape(-1).tofile(output_dir / "golden_v2.bin") - - -def main() -> None: - parser = argparse.ArgumentParser( - description="Generate numpy-based inputs/golden for VPTO micro-op vln domain-boundary validation." - ) - parser.add_argument("--output-dir", type=Path, default=Path(".")) - parser.add_argument("--seed", type=int, default=SEED) - args = parser.parse_args() - generate(args.output_dir, args.seed) - - -if __name__ == "__main__": - main() diff --git a/test/vpto/cases/micro-op/unary-vector/vln-domain-boundary/kernel.pto b/test/vpto/cases/micro-op/unary-vector/vln-domain-boundary/kernel.pto deleted file mode 100644 index b841980ba..000000000 --- a/test/vpto/cases/micro-op/unary-vector/vln-domain-boundary/kernel.pto +++ /dev/null @@ -1,70 +0,0 @@ -// ----------------------------------------------------------------------------- -// case: micro-op/unary-vector/vln-domain-boundary -// family: unary-vector -// target_ops: pto.vln -// scenarios: core-f32, domain-positive, exceptional-values -// NOTE: bulk-generated coverage skeleton. 
Parser/verifier/lowering failure is -// still a valid test conclusion in the current coverage-first phase. -// ----------------------------------------------------------------------------- -// ============================================================================= -// abs_kernel_2d: Element-wise absolute value on a 32x32 f32 tile -// ============================================================================= -// This kernel computes abs(input) for a 32x32 float32 matrix (1024 elements). -// -// Memory Layout: -// - Input: arg0 -> GM (Global Memory) -// - Output: arg1 -> GM (Global Memory) -// - UB (Unified Buffer) at offset 0: input tile (4096 bytes = 32*32*4) -// - UB at offset 4096: output tile (4096 bytes = 32*32*4) -// -// Pipeline: -// 1. DMA: GM -> UB (MTE2 pipe) - copy input tile to UB -// 2. Sync: wait for MTE2 -> V pipe handoff -// 3. Compute: vabs on 64-element vectors (V pipe) - 16 iterations for 1024 elements -// 4. Sync: wait for V -> MTE3 pipe handoff -// 5. DMA: UB -> GM (MTE3 pipe) - copy result tile back to GM -// ============================================================================= - -module attributes {pto.target_arch = "a5", pto.kernel_kind = #pto.kernel_kind} { - func.func @vabs_kernel_2d(%arg0: !pto.ptr, %arg1: !pto.ptr) attributes {pto.kernel} { - %c0 = arith.constant 0 : index - %c1 = arith.constant 1 : index - %c64 = arith.constant 64 : index - %c1024 = arith.constant 1024 : index - %c0_i64 = arith.constant 0 : i64 - %c1_i64 = arith.constant 1 : i64 - %c32_i64 = arith.constant 32 : i64 - %c128_i64 = arith.constant 128 : i64 - %c4096_i64 = arith.constant 4096 : i64 - %c1024_i32 = arith.constant 1024 : i32 - - %ub_in = pto.castptr %c0_i64 : i64 -> !pto.ptr - %ub_out = pto.castptr %c4096_i64 : i64 -> !pto.ptr - - %false = arith.constant false - pto.mte_gm_ub %arg0, %ub_in, %c0_i64, %c128_i64 - nburst(%c32_i64, %c128_i64, %c128_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 - - pto.set_flag["PIPE_MTE2", "PIPE_V", 
"EVENT_ID0"] - pto.wait_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] - - pto.vecscope { - %_:1 = scf.for %offset = %c0 to %c1024 step %c64 iter_args(%remaining = %c1024_i32) -> (i32) { - %mask, %next_remaining = pto.plt_b32 %remaining : i32 -> !pto.mask, i32 - %vec = pto.vlds %ub_in[%offset] : !pto.ptr -> !pto.vreg<64xf32> - %out = pto.vln %vec, %mask : !pto.vreg<64xf32>, !pto.mask -> !pto.vreg<64xf32> - pto.vsts %out, %ub_out[%offset], %mask : !pto.vreg<64xf32>, !pto.ptr, !pto.mask - scf.yield %next_remaining : i32 - } - } - - pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.mte_ub_gm %ub_out, %arg1, %c128_i64 - nburst(%c32_i64, %c128_i64, %c128_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64 - pto.barrier #pto.pipe - return - } -} diff --git a/test/vpto/cases/micro-op/unary-vector/vln-domain-boundary/launch.cpp b/test/vpto/cases/micro-op/unary-vector/vln-domain-boundary/launch.cpp deleted file mode 100644 index 6aeeded6c..000000000 --- a/test/vpto/cases/micro-op/unary-vector/vln-domain-boundary/launch.cpp +++ /dev/null @@ -1,70 +0,0 @@ -// Copyright (c) 2026 Huawei Technologies Co., Ltd. -// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). -// Please refer to the License for details. You may not use this file except in compliance with the License. -// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -// See LICENSE in the root of the software repository for the full text of the License. 
- -// ----------------------------------------------------------------------------- -// case: micro-op/unary-vector/vln-domain-boundary -// family: unary-vector -// target_ops: pto.vln -// scenarios: core-f32, domain-positive, exceptional-values -// NOTE: bulk-generated coverage skeleton. Parser/verifier/lowering failure is -// still a valid test conclusion in the current coverage-first phase. -// ----------------------------------------------------------------------------- -// --------------------------------------------------------------------------- -// PTOAS compatibility layer -// -// The upstream pto-isa headers reference some FP8/FP4 types and the -// __VEC_SCOPE__ marker that are not available on every AICore arch/toolchain -// combination (e.g. __NPU_ARCH__==2201). -// -// For our PTOAS-generated kernels we don't rely on these types today, but the -// headers still mention them in templates/static_asserts. Provide minimal -// fallbacks to keep compilation working on dav-c220. -// --------------------------------------------------------------------------- -#ifndef __VEC_SCOPE__ -#define __VEC_SCOPE__ -#endif - -#if defined(__CCE_AICORE__) && defined(__NPU_ARCH__) && (__NPU_ARCH__ == 2201) -typedef struct { unsigned char v; } hifloat8_t; -typedef struct { unsigned char v; } float8_e4m3_t; -typedef struct { unsigned char v; } float8_e5m2_t; -typedef struct { unsigned char v; } float8_e8m0_t; -typedef struct { unsigned char v; } float4_e1m2x2_t; -typedef struct { unsigned char v; } float4_e2m1x2_t; -#endif -#include - -// AICore printf support is gated behind `--cce-enable-print` on some -// toolchains. When enabled, include the CCE print header so `cce::printf` -// resolves in device compilation. -#if defined(__CCE_AICORE__) && defined(PTOAS_ENABLE_CCE_PRINT) -#include -#endif - -// Some PTO-ISA types are only available in the __CCE_AICORE__ compilation -// path, but `bisheng -xcce` still performs a host-side parse pass. 
-// Provide minimal fallbacks only when the corresponding header wasn't -// pulled in by the selected arch implementation. -#if !defined(__CCE_AICORE__) && !defined(TMRGSORT_HPP) -struct MrgSortExecutedNumList { - uint16_t mrgSortList0; - uint16_t mrgSortList1; - uint16_t mrgSortList2; - uint16_t mrgSortList3; -}; -#endif -#ifndef __CPU_SIM -#include "acl/acl.h" -#endif - -extern "C" __global__ [aicore] void vabs_kernel_2d(__gm__ float *v1, - __gm__ float *v2); - -void LaunchVabs_kernel_2d(float *v1, float *v2, void *stream) { - vabs_kernel_2d<<<1, nullptr, stream>>>((__gm__ float *)v1, (__gm__ float *)v2); -} diff --git a/test/vpto/cases/micro-op/unary-vector/vln-domain-boundary/main.cpp b/test/vpto/cases/micro-op/unary-vector/vln-domain-boundary/main.cpp deleted file mode 100644 index ab31f79d8..000000000 --- a/test/vpto/cases/micro-op/unary-vector/vln-domain-boundary/main.cpp +++ /dev/null @@ -1,130 +0,0 @@ -// Copyright (c) 2026 Huawei Technologies Co., Ltd. -// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). -// Please refer to the License for details. You may not use this file except in compliance with the License. -// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -// See LICENSE in the root of the software repository for the full text of the License. - -// ----------------------------------------------------------------------------- -// case: micro-op/unary-vector/vln-domain-boundary -// family: unary-vector -// target_ops: pto.vln -// scenarios: core-f32, domain-positive, exceptional-values -// NOTE: bulk-generated coverage skeleton. Parser/verifier/lowering failure is -// still a valid test conclusion in the current coverage-first phase. 
-// ----------------------------------------------------------------------------- -/** -Copyright (c) 2025 Huawei Technologies Co., Ltd. -This program is free software, you can redistribute it and/or modify it under the terms and conditions of -CANN Open Software License Agreement Version 2.0 (the "License"). -Please refer to the License for details. You may not use this file except in compliance with the License. -THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -See LICENSE in the root of the software repository for the full text of the License. -*/ - -#include "test_common.h" -#include "acl/acl.h" -#include -#include -#include - -using namespace PtoTestCommon; - -#ifndef TMRGSORT_HPP -struct MrgSortExecutedNumList { - uint16_t mrgSortList0; - uint16_t mrgSortList1; - uint16_t mrgSortList2; - uint16_t mrgSortList3; -}; -#endif - -#define ACL_CHECK(expr) \ - do { \ - const aclError _ret = (expr); \ - if (_ret != ACL_SUCCESS) { \ - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", #expr, (int)_ret, __FILE__, __LINE__); \ - const char *_recent = aclGetRecentErrMsg(); \ - if (_recent != nullptr && _recent[0] != '\0') { \ - std::fprintf(stderr, "[ERROR] RecentErrMsg: %s\n", _recent); \ - } \ - rc = 1; \ - goto cleanup; \ - } \ - } while (0) - -void LaunchVabs_kernel_2d(float *v1, float *v2, void *stream); - -int main() { - size_t elemCount_v1 = 1024; - size_t fileSize_v1 = elemCount_v1 * sizeof(float); - size_t elemCount_v2 = 1024; - size_t fileSize_v2 = elemCount_v2 * sizeof(float); - float *v1Host = nullptr; - float *v1Device = nullptr; - float *v2Host = nullptr; - float *v2Device = nullptr; - - int rc = 0; - bool aclInited = false; - bool deviceSet = false; - int deviceId = 0; - aclrtStream stream = nullptr; - - ACL_CHECK(aclInit(nullptr)); - aclInited = true; - if (const char *envDevice = 
std::getenv("ACL_DEVICE_ID")) { - deviceId = std::atoi(envDevice); - } - ACL_CHECK(aclrtSetDevice(deviceId)); - deviceSet = true; - ACL_CHECK(aclrtCreateStream(&stream)); - - ACL_CHECK(aclrtMallocHost((void **)(&v1Host), fileSize_v1)); - ACL_CHECK(aclrtMallocHost((void **)(&v2Host), fileSize_v2)); - ACL_CHECK(aclrtMalloc((void **)&v1Device, fileSize_v1, ACL_MEM_MALLOC_HUGE_FIRST)); - ACL_CHECK(aclrtMalloc((void **)&v2Device, fileSize_v2, ACL_MEM_MALLOC_HUGE_FIRST)); - - ReadFile("./v1.bin", fileSize_v1, v1Host, fileSize_v1); - ReadFile("./v2.bin", fileSize_v2, v2Host, fileSize_v2); - ACL_CHECK(aclrtMemcpy(v1Device, fileSize_v1, v1Host, fileSize_v1, ACL_MEMCPY_HOST_TO_DEVICE)); - ACL_CHECK(aclrtMemcpy(v2Device, fileSize_v2, v2Host, fileSize_v2, ACL_MEMCPY_HOST_TO_DEVICE)); - LaunchVabs_kernel_2d(v1Device, v2Device, stream); - - ACL_CHECK(aclrtSynchronizeStream(stream)); - ACL_CHECK(aclrtMemcpy(v2Host, fileSize_v2, v2Device, fileSize_v2, ACL_MEMCPY_DEVICE_TO_HOST)); - - WriteFile("./v2.bin", v2Host, fileSize_v2); - -cleanup: - aclrtFree(v1Device); - aclrtFree(v2Device); - aclrtFreeHost(v1Host); - aclrtFreeHost(v2Host); - if (stream != nullptr) { - const aclError _ret = aclrtDestroyStream(stream); - if (_ret != ACL_SUCCESS) { - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", - "aclrtDestroyStream(stream)", (int)_ret, __FILE__, __LINE__); - } - stream = nullptr; - } - if (deviceSet) { - const aclError _ret = aclrtResetDevice(deviceId); - if (_ret != ACL_SUCCESS) { - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", - "aclrtResetDevice(deviceId)", (int)_ret, __FILE__, __LINE__); - } - } - if (aclInited) { - const aclError _ret = aclFinalize(); - if (_ret != ACL_SUCCESS) { - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", - "aclFinalize()", (int)_ret, __FILE__, __LINE__); - } - } - - return rc; -} diff --git a/test/vpto/cases/micro-op/unary-vector/vln/kernel.pto b/test/vpto/cases/micro-op/unary-vector/vln/kernel.pto index b3105ea1f..51f921483 100644 
--- a/test/vpto/cases/micro-op/unary-vector/vln/kernel.pto +++ b/test/vpto/cases/micro-op/unary-vector/vln/kernel.pto @@ -1,43 +1,269 @@ module attributes {pto.target_arch = "a5", pto.kernel_kind = #pto.kernel_kind} { - func.func @vexp_kernel_2d(%arg0: !pto.ptr, %arg1: !pto.ptr) attributes {pto.kernel} { - %c0 = arith.constant 0 : index - %c1 = arith.constant 1 : index - %c64 = arith.constant 64 : index - %c1024 = arith.constant 1024 : index - %c0_i64 = arith.constant 0 : i64 - %c1_i64 = arith.constant 1 : i64 - %c32_i64 = arith.constant 32 : i64 - %c128_i64 = arith.constant 128 : i64 - %c4096_i64 = arith.constant 4096 : i64 - %c1024_i32 = arith.constant 1024 : i32 - - %ub_in = pto.castptr %c0_i64 : i64 -> !pto.ptr - %ub_out = pto.castptr %c4096_i64 : i64 -> !pto.ptr - - %false = arith.constant false - pto.mte_gm_ub %arg0, %ub_in, %c0_i64, %c128_i64 - nburst(%c32_i64, %c128_i64, %c128_i64) + func.func @vln_deep_merged_kernel(%arg0: !pto.ptr, %arg1: !pto.ptr, %arg2: !pto.ptr, %arg3: !pto.ptr) attributes {pto.kernel} { + %__deep_merge_guard = arith.constant false + // inactive merged from vabs_kernel_2d + scf.if %__deep_merge_guard { + + %c0_m0 = arith.constant 0 : index + %c1_m0 = arith.constant 1 : index + %c64_m0 = arith.constant 64 : index + %c1024_m0 = arith.constant 1024 : index + %c0_i64_m0 = arith.constant 0 : i64 + %c1_i64_m0 = arith.constant 1 : i64 + %c32_i64_m0 = arith.constant 32 : i64 + %c128_i64_m0 = arith.constant 128 : i64 + %c4096_i64_m0 = arith.constant 4096 : i64 + %c1024_i32_m0 = arith.constant 1024 : i32 + + %ub_in_m0 = pto.castptr %c0_i64_m0 : i64 -> !pto.ptr + %ub_out_m0 = pto.castptr %c4096_i64_m0 : i64 -> !pto.ptr + + %false_m0 = arith.constant false + pto.mte_gm_ub %arg0, %ub_in_m0, %c0_i64_m0, %c128_i64_m0 + nburst(%c32_i64_m0, %c128_i64_m0, %c128_i64_m0) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 + + pto.set_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] + pto.wait_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] + + pto.vecscope { + %__m0:1 = 
scf.for %offset_m0 = %c0_m0 to %c1024_m0 step %c64_m0 iter_args(%remaining_m0 = %c1024_i32_m0) -> (i32) { + %mask_m0, %next_remaining_m0 = pto.plt_b32 %remaining_m0 : i32 -> !pto.mask, i32 + %vec_m0 = pto.vlds %ub_in_m0[%offset_m0] : !pto.ptr -> !pto.vreg<64xf32> + %out_m0 = pto.vln %vec_m0, %mask_m0 : !pto.vreg<64xf32>, !pto.mask -> !pto.vreg<64xf32> + pto.vsts %out_m0, %ub_out_m0[%offset_m0], %mask_m0 : !pto.vreg<64xf32>, !pto.ptr, !pto.mask + scf.yield %next_remaining_m0 : i32 + } + } + + pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] + pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] + pto.mte_ub_gm %ub_out_m0, %arg1, %c128_i64_m0 + nburst(%c32_i64_m0, %c128_i64_m0, %c128_i64_m0) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64 + pto.barrier #pto.pipe + + } + // active merged from vexp_kernel_2d + + %c0_m1 = arith.constant 0 : index + %c1_m1 = arith.constant 1 : index + %c64_m1 = arith.constant 64 : index + %c1024_m1 = arith.constant 1024 : index + %c0_i64_m1 = arith.constant 0 : i64 + %c1_i64_m1 = arith.constant 1 : i64 + %c32_i64_m1 = arith.constant 32 : i64 + %c128_i64_m1 = arith.constant 128 : i64 + %c4096_i64_m1 = arith.constant 4096 : i64 + %c1024_i32_m1 = arith.constant 1024 : i32 + + %ub_in_m1 = pto.castptr %c0_i64_m1 : i64 -> !pto.ptr + %ub_out_m1 = pto.castptr %c4096_i64_m1 : i64 -> !pto.ptr + + %false_m1 = arith.constant false + pto.mte_gm_ub %arg2, %ub_in_m1, %c0_i64_m1, %c128_i64_m1 + nburst(%c32_i64_m1, %c128_i64_m1, %c128_i64_m1) : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 pto.set_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] pto.wait_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] pto.vecscope { - %_:1 = scf.for %offset = %c0 to %c1024 step %c64 iter_args(%remaining = %c1024_i32) -> (i32) { - %mask, %next_remaining = pto.plt_b32 %remaining : i32 -> !pto.mask, i32 - %vec = pto.vlds %ub_in[%offset] : !pto.ptr -> !pto.vreg<64xf32> - %out = pto.vln %vec, %mask : !pto.vreg<64xf32>, !pto.mask -> !pto.vreg<64xf32> - pto.vsts %out, %ub_out[%offset], %mask : 
!pto.vreg<64xf32>, !pto.ptr, !pto.mask - scf.yield %next_remaining : i32 + %__m1:1 = scf.for %offset_m1 = %c0_m1 to %c1024_m1 step %c64_m1 iter_args(%remaining_m1 = %c1024_i32_m1) -> (i32) { + %mask_m1, %next_remaining_m1 = pto.plt_b32 %remaining_m1 : i32 -> !pto.mask, i32 + %vec_m1 = pto.vlds %ub_in_m1[%offset_m1] : !pto.ptr -> !pto.vreg<64xf32> + %out_m1 = pto.vln %vec_m1, %mask_m1 : !pto.vreg<64xf32>, !pto.mask -> !pto.vreg<64xf32> + pto.vsts %out_m1, %ub_out_m1[%offset_m1], %mask_m1 : !pto.vreg<64xf32>, !pto.ptr, !pto.mask + scf.yield %next_remaining_m1 : i32 } } pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.mte_ub_gm %ub_out, %arg1, %c128_i64 - nburst(%c32_i64, %c128_i64, %c128_i64) + pto.mte_ub_gm %ub_out_m1, %arg3, %c128_i64_m1 + nburst(%c32_i64_m1, %c128_i64_m1, %c128_i64_m1) : !pto.ptr, !pto.ptr, i64, i64, i64, i64 pto.barrier #pto.pipe + %__case_merge_guard = arith.constant false + + // inactive cross-case merge from micro-op/unary-vector/vneg + scf.if %__case_merge_guard { + + %__deep_merge_guard_cmg16_1 = arith.constant false + // inactive merged from vabs_kernel_2d_f32_exceptional + scf.if %__deep_merge_guard_cmg16_1 { + + %c0_m0_cmg16_1 = arith.constant 0 : index + %c1_m0_cmg16_1 = arith.constant 1 : index + %c64_m0_cmg16_1 = arith.constant 64 : index + %c1024_m0_cmg16_1 = arith.constant 1024 : index + %c0_i64_m0_cmg16_1 = arith.constant 0 : i64 + %c1_i64_m0_cmg16_1 = arith.constant 1 : i64 + %c32_i64_m0_cmg16_1 = arith.constant 32 : i64 + %c128_i64_m0_cmg16_1 = arith.constant 128 : i64 + %c4096_i64_m0_cmg16_1 = arith.constant 4096 : i64 + %c1024_i32_m0_cmg16_1 = arith.constant 1024 : i32 + + %ub_in_m0_cmg16_1 = pto.castptr %c0_i64_m0_cmg16_1 : i64 -> !pto.ptr + %ub_out_m0_cmg16_1 = pto.castptr %c4096_i64_m0_cmg16_1 : i64 -> !pto.ptr + + %false_m0_cmg16_1 = arith.constant false + pto.mte_gm_ub %arg0, %ub_in_m0_cmg16_1, %c0_i64_m0_cmg16_1, %c128_i64_m0_cmg16_1 + nburst(%c32_i64_m0_cmg16_1, 
%c128_i64_m0_cmg16_1, %c128_i64_m0_cmg16_1) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 + + pto.set_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] + pto.wait_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] + + pto.vecscope { + %__m0_cmg16_1:1 = scf.for %offset_m0_cmg16_1 = %c0_m0_cmg16_1 to %c1024_m0_cmg16_1 step %c64_m0_cmg16_1 iter_args(%remaining_m0_cmg16_1 = %c1024_i32_m0_cmg16_1) -> (i32) { + %mask_m0_cmg16_1, %next_remaining_m0_cmg16_1 = pto.plt_b32 %remaining_m0_cmg16_1 : i32 -> !pto.mask, i32 + %vec_m0_cmg16_1 = pto.vlds %ub_in_m0_cmg16_1[%offset_m0_cmg16_1] : !pto.ptr -> !pto.vreg<64xf32> + %out_m0_cmg16_1 = pto.vneg %vec_m0_cmg16_1, %mask_m0_cmg16_1 : !pto.vreg<64xf32>, !pto.mask -> !pto.vreg<64xf32> + pto.vsts %out_m0_cmg16_1, %ub_out_m0_cmg16_1[%offset_m0_cmg16_1], %mask_m0_cmg16_1 : !pto.vreg<64xf32>, !pto.ptr, !pto.mask + scf.yield %next_remaining_m0_cmg16_1 : i32 + } + } + + pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] + pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] + pto.mte_ub_gm %ub_out_m0_cmg16_1, %arg1, %c128_i64_m0_cmg16_1 + nburst(%c32_i64_m0_cmg16_1, %c128_i64_m0_cmg16_1, %c128_i64_m0_cmg16_1) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64 + pto.barrier #pto.pipe + + } + // active merged from vabs_kernel_2d + + %c0_m1_cmg16_1 = arith.constant 0 : index + %c1_m1_cmg16_1 = arith.constant 1 : index + %c64_m1_cmg16_1 = arith.constant 64 : index + %c1024_m1_cmg16_1 = arith.constant 1024 : index + %c0_i64_m1_cmg16_1 = arith.constant 0 : i64 + %c1_i64_m1_cmg16_1 = arith.constant 1 : i64 + %c32_i64_m1_cmg16_1 = arith.constant 32 : i64 + %c128_i64_m1_cmg16_1 = arith.constant 128 : i64 + %c4096_i64_m1_cmg16_1 = arith.constant 4096 : i64 + %c1024_i32_m1_cmg16_1 = arith.constant 1024 : i32 + + %ub_in_m1_cmg16_1 = pto.castptr %c0_i64_m1_cmg16_1 : i64 -> !pto.ptr + %ub_out_m1_cmg16_1 = pto.castptr %c4096_i64_m1_cmg16_1 : i64 -> !pto.ptr + + %false_m1_cmg16_1 = arith.constant false + pto.mte_gm_ub %arg2, %ub_in_m1_cmg16_1, %c0_i64_m1_cmg16_1, 
%c128_i64_m1_cmg16_1 + nburst(%c32_i64_m1_cmg16_1, %c128_i64_m1_cmg16_1, %c128_i64_m1_cmg16_1) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 + + pto.set_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] + pto.wait_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] + + pto.vecscope { + %__m1_cmg16_1:1 = scf.for %offset_m1_cmg16_1 = %c0_m1_cmg16_1 to %c1024_m1_cmg16_1 step %c64_m1_cmg16_1 iter_args(%remaining_m1_cmg16_1 = %c1024_i32_m1_cmg16_1) -> (i32) { + %mask_m1_cmg16_1, %next_remaining_m1_cmg16_1 = pto.plt_b32 %remaining_m1_cmg16_1 : i32 -> !pto.mask, i32 + %vec_m1_cmg16_1 = pto.vlds %ub_in_m1_cmg16_1[%offset_m1_cmg16_1] : !pto.ptr -> !pto.vreg<64xf32> + %out_m1_cmg16_1 = pto.vneg %vec_m1_cmg16_1, %mask_m1_cmg16_1 : !pto.vreg<64xf32>, !pto.mask -> !pto.vreg<64xf32> + pto.vsts %out_m1_cmg16_1, %ub_out_m1_cmg16_1[%offset_m1_cmg16_1], %mask_m1_cmg16_1 : !pto.vreg<64xf32>, !pto.ptr, !pto.mask + scf.yield %next_remaining_m1_cmg16_1 : i32 + } + } + + pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] + pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] + pto.mte_ub_gm %ub_out_m1_cmg16_1, %arg3, %c128_i64_m1_cmg16_1 + nburst(%c32_i64_m1_cmg16_1, %c128_i64_m1_cmg16_1, %c128_i64_m1_cmg16_1) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64 + pto.barrier #pto.pipe + } + + // inactive cross-case merge from micro-op/unary-vector/vsqrt + scf.if %__case_merge_guard { + + %__deep_merge_guard_cmg16_2 = arith.constant false + // inactive merged from vabs_kernel_2d + scf.if %__deep_merge_guard_cmg16_2 { + + %c0_m0_cmg16_2 = arith.constant 0 : index + %c1_m0_cmg16_2 = arith.constant 1 : index + %c64_m0_cmg16_2 = arith.constant 64 : index + %c1024_m0_cmg16_2 = arith.constant 1024 : index + %c0_i64_m0_cmg16_2 = arith.constant 0 : i64 + %c1_i64_m0_cmg16_2 = arith.constant 1 : i64 + %c32_i64_m0_cmg16_2 = arith.constant 32 : i64 + %c128_i64_m0_cmg16_2 = arith.constant 128 : i64 + %c4096_i64_m0_cmg16_2 = arith.constant 4096 : i64 + %c1024_i32_m0_cmg16_2 = arith.constant 1024 : i32 + + %ub_in_m0_cmg16_2 = 
pto.castptr %c0_i64_m0_cmg16_2 : i64 -> !pto.ptr + %ub_out_m0_cmg16_2 = pto.castptr %c4096_i64_m0_cmg16_2 : i64 -> !pto.ptr + + %false_m0_cmg16_2 = arith.constant false + pto.mte_gm_ub %arg0, %ub_in_m0_cmg16_2, %c0_i64_m0_cmg16_2, %c128_i64_m0_cmg16_2 + nburst(%c32_i64_m0_cmg16_2, %c128_i64_m0_cmg16_2, %c128_i64_m0_cmg16_2) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 + + pto.set_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] + pto.wait_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] + + pto.vecscope { + %__m0_cmg16_2:1 = scf.for %offset_m0_cmg16_2 = %c0_m0_cmg16_2 to %c1024_m0_cmg16_2 step %c64_m0_cmg16_2 iter_args(%remaining_m0_cmg16_2 = %c1024_i32_m0_cmg16_2) -> (i32) { + %mask_m0_cmg16_2, %next_remaining_m0_cmg16_2 = pto.plt_b32 %remaining_m0_cmg16_2 : i32 -> !pto.mask, i32 + %vec_m0_cmg16_2 = pto.vlds %ub_in_m0_cmg16_2[%offset_m0_cmg16_2] : !pto.ptr -> !pto.vreg<64xf32> + %out_m0_cmg16_2 = pto.vsqrt %vec_m0_cmg16_2, %mask_m0_cmg16_2 : !pto.vreg<64xf32>, !pto.mask -> !pto.vreg<64xf32> + pto.vsts %out_m0_cmg16_2, %ub_out_m0_cmg16_2[%offset_m0_cmg16_2], %mask_m0_cmg16_2 : !pto.vreg<64xf32>, !pto.ptr, !pto.mask + scf.yield %next_remaining_m0_cmg16_2 : i32 + } + } + + pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] + pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] + pto.mte_ub_gm %ub_out_m0_cmg16_2, %arg1, %c128_i64_m0_cmg16_2 + nburst(%c32_i64_m0_cmg16_2, %c128_i64_m0_cmg16_2, %c128_i64_m0_cmg16_2) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64 + pto.barrier #pto.pipe + + } + // active merged from vexp_kernel_2d + + %c0_m1_cmg16_2 = arith.constant 0 : index + %c1_m1_cmg16_2 = arith.constant 1 : index + %c64_m1_cmg16_2 = arith.constant 64 : index + %c1024_m1_cmg16_2 = arith.constant 1024 : index + %c0_i64_m1_cmg16_2 = arith.constant 0 : i64 + %c1_i64_m1_cmg16_2 = arith.constant 1 : i64 + %c32_i64_m1_cmg16_2 = arith.constant 32 : i64 + %c128_i64_m1_cmg16_2 = arith.constant 128 : i64 + %c4096_i64_m1_cmg16_2 = arith.constant 4096 : i64 + %c1024_i32_m1_cmg16_2 = arith.constant 
1024 : i32 + + %ub_in_m1_cmg16_2 = pto.castptr %c0_i64_m1_cmg16_2 : i64 -> !pto.ptr + %ub_out_m1_cmg16_2 = pto.castptr %c4096_i64_m1_cmg16_2 : i64 -> !pto.ptr + + %false_m1_cmg16_2 = arith.constant false + pto.mte_gm_ub %arg2, %ub_in_m1_cmg16_2, %c0_i64_m1_cmg16_2, %c128_i64_m1_cmg16_2 + nburst(%c32_i64_m1_cmg16_2, %c128_i64_m1_cmg16_2, %c128_i64_m1_cmg16_2) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 + + pto.set_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] + pto.wait_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] + + pto.vecscope { + %__m1_cmg16_2:1 = scf.for %offset_m1_cmg16_2 = %c0_m1_cmg16_2 to %c1024_m1_cmg16_2 step %c64_m1_cmg16_2 iter_args(%remaining_m1_cmg16_2 = %c1024_i32_m1_cmg16_2) -> (i32) { + %mask_m1_cmg16_2, %next_remaining_m1_cmg16_2 = pto.plt_b32 %remaining_m1_cmg16_2 : i32 -> !pto.mask, i32 + %vec_m1_cmg16_2 = pto.vlds %ub_in_m1_cmg16_2[%offset_m1_cmg16_2] : !pto.ptr -> !pto.vreg<64xf32> + %out_m1_cmg16_2 = pto.vsqrt %vec_m1_cmg16_2, %mask_m1_cmg16_2 : !pto.vreg<64xf32>, !pto.mask -> !pto.vreg<64xf32> + pto.vsts %out_m1_cmg16_2, %ub_out_m1_cmg16_2[%offset_m1_cmg16_2], %mask_m1_cmg16_2 : !pto.vreg<64xf32>, !pto.ptr, !pto.mask + scf.yield %next_remaining_m1_cmg16_2 : i32 + } + } + + pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] + pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] + pto.mte_ub_gm %ub_out_m1_cmg16_2, %arg3, %c128_i64_m1_cmg16_2 + nburst(%c32_i64_m1_cmg16_2, %c128_i64_m1_cmg16_2, %c128_i64_m1_cmg16_2) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64 + pto.barrier #pto.pipe + } + return } } diff --git a/test/vpto/cases/micro-op/unary-vector/vln/launch.cpp b/test/vpto/cases/micro-op/unary-vector/vln/launch.cpp index b6d8cdbf0..a9a11e97e 100644 --- a/test/vpto/cases/micro-op/unary-vector/vln/launch.cpp +++ b/test/vpto/cases/micro-op/unary-vector/vln/launch.cpp @@ -5,22 +5,9 @@ // THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, // INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, 
MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. // See LICENSE in the root of the software repository for the full text of the License. - -// --------------------------------------------------------------------------- -// PTOAS compatibility layer -// -// The upstream pto-isa headers reference some FP8/FP4 types and the -// __VEC_SCOPE__ marker that are not available on every AICore arch/toolchain -// combination (e.g. __NPU_ARCH__==2201). -// -// For our PTOAS-generated kernels we don't rely on these types today, but the -// headers still mention them in templates/static_asserts. Provide minimal -// fallbacks to keep compilation working on dav-c220. -// --------------------------------------------------------------------------- #ifndef __VEC_SCOPE__ #define __VEC_SCOPE__ #endif - #if defined(__CCE_AICORE__) && defined(__NPU_ARCH__) && (__NPU_ARCH__ == 2201) typedef struct { unsigned char v; } hifloat8_t; typedef struct { unsigned char v; } float8_e4m3_t; @@ -30,33 +17,26 @@ typedef struct { unsigned char v; } float4_e1m2x2_t; typedef struct { unsigned char v; } float4_e2m1x2_t; #endif #include - -// AICore printf support is gated behind `--cce-enable-print` on some -// toolchains. When enabled, include the CCE print header so `cce::printf` -// resolves in device compilation. #if defined(__CCE_AICORE__) && defined(PTOAS_ENABLE_CCE_PRINT) #include #endif - -// Some PTO-ISA types are only available in the __CCE_AICORE__ compilation -// path, but `bisheng -xcce` still performs a host-side parse pass. -// Provide minimal fallbacks only when the corresponding header wasn't -// pulled in by the selected arch implementation. 
#if !defined(__CCE_AICORE__) && !defined(TMRGSORT_HPP) -struct MrgSortExecutedNumList { - uint16_t mrgSortList0; - uint16_t mrgSortList1; - uint16_t mrgSortList2; - uint16_t mrgSortList3; -}; +struct MrgSortExecutedNumList { uint16_t mrgSortList0,mrgSortList1,mrgSortList2,mrgSortList3; }; #endif #ifndef __CPU_SIM #include "acl/acl.h" #endif -extern "C" __global__ [aicore] void vexp_kernel_2d(__gm__ float *v1, - __gm__ float *v2); +extern "C" __global__ [aicore] void vln_deep_merged_kernel( + __gm__ float * arg0, + __gm__ float * arg1, + __gm__ float * arg2, + __gm__ float * arg3); -void LaunchVexp_kernel_2d(float *v1, float *v2, void *stream) { - vexp_kernel_2d<<<1, nullptr, stream>>>((__gm__ float *)v1, (__gm__ float *)v2); +void LaunchVlnDeepMerged(float * p0, float * p1, void *stream) { + vln_deep_merged_kernel<<<1, nullptr, stream>>>( + (__gm__ float *)p0, + (__gm__ float *)p0, + (__gm__ float *)p0, + (__gm__ float *)p1); } diff --git a/test/vpto/cases/micro-op/unary-vector/vln/main.cpp b/test/vpto/cases/micro-op/unary-vector/vln/main.cpp index f864622ca..b90e36242 100644 --- a/test/vpto/cases/micro-op/unary-vector/vln/main.cpp +++ b/test/vpto/cases/micro-op/unary-vector/vln/main.cpp @@ -47,8 +47,8 @@ struct MrgSortExecutedNumList { } \ } while (0) -void LaunchVexp_kernel_2d(float *v1, float *v2, void *stream); +void LaunchVlnDeepMerged(float * p0, float * p1, void *stream); int main() { size_t elemCount_v1 = 1024; size_t fileSize_v1 = elemCount_v1 * sizeof(float); @@ -83,7 +83,7 @@ int main() { ReadFile("./v2.bin", fileSize_v2, v2Host, fileSize_v2); ACL_CHECK(aclrtMemcpy(v1Device, fileSize_v1, v1Host, fileSize_v1, ACL_MEMCPY_HOST_TO_DEVICE)); ACL_CHECK(aclrtMemcpy(v2Device, fileSize_v2, v2Host, fileSize_v2, ACL_MEMCPY_HOST_TO_DEVICE)); - LaunchVexp_kernel_2d(v1Device, v2Device, stream); + LaunchVlnDeepMerged(v1Device, v2Device, stream); ACL_CHECK(aclrtSynchronizeStream(stream)); ACL_CHECK(aclrtMemcpy(v2Host, fileSize_v2, v2Device, fileSize_v2, 
ACL_MEMCPY_DEVICE_TO_HOST)); diff --git a/test/vpto/cases/micro-op/unary-vector/vneg-f32-exceptional/compare.py b/test/vpto/cases/micro-op/unary-vector/vneg-f32-exceptional/compare.py deleted file mode 100755 index 1030c959f..000000000 --- a/test/vpto/cases/micro-op/unary-vector/vneg-f32-exceptional/compare.py +++ /dev/null @@ -1,209 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) 2026 Huawei Technologies Co., Ltd. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. - -# case: micro-op/unary-vector/vneg-f32-exceptional -# family: unary-vector -# target_ops: pto.vneg -# scenarios: core-f32, exceptional-values -# NOTE: bulk-generated coverage skeleton. 
-# coding=utf-8 - -import os -import sys -import numpy as np - - -REPEAT_BYTES = 256 - - -def _ceil_div(x, y): - return (x + y - 1) // y - - -def _packed_pred_storage_bytes(logical_elems, src_elem_bytes): - logical_elems = int(logical_elems) - src_elem_bytes = int(src_elem_bytes) - if logical_elems <= 0: - raise ValueError(f"logical_elems must be > 0, got {logical_elems}") - if src_elem_bytes not in (1, 2, 4): - raise ValueError(f"unsupported packed predicate source size: {src_elem_bytes}") - - repeat_elems = REPEAT_BYTES // src_elem_bytes - if src_elem_bytes == 4: - repeat_times = _ceil_div(logical_elems, repeat_elems) + 1 - loop_count = repeat_times // 2 - return loop_count * 16 - - repeat_times = _ceil_div(logical_elems, repeat_elems) - return repeat_times * (repeat_elems // 8) - - -def compare_bin(golden_path, output_path, dtype, eps): - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - dtype_np = np.dtype(dtype) - golden = np.fromfile(golden_path, dtype=dtype_np) - output = np.fromfile(output_path, dtype=dtype_np) - if golden.shape != output.shape: - print(f"[ERROR] Shape mismatch: {golden_path} {golden.shape} vs {output_path} {output.shape}") - return False - if not np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True): - if golden.size: - if np.issubdtype(dtype_np, np.floating): - g = golden.astype(np.float64, copy=False) - o = output.astype(np.float64, copy=False) - elif np.issubdtype(dtype_np, np.integer) or np.issubdtype(dtype_np, np.unsignedinteger): - g = golden.astype(np.int64, copy=False) - o = output.astype(np.int64, copy=False) - else: - g = golden.astype(np.float64, copy=False) - o = output.astype(np.float64, copy=False) - abs_diff = np.abs(g - o) - idx = int(np.argmax(abs_diff)) - diff = float(abs_diff[idx]) - print( - f"[ERROR] Mismatch: {golden_path} vs {output_path}, max 
diff={diff} at idx={idx} " - f"(golden={g[idx]}, out={o[idx]}, dtype={dtype_np})" - ) - else: - print(f"[ERROR] Mismatch: {golden_path} vs {output_path}, empty buffers, dtype={dtype_np}") - return False - return True - - -def compare_bin_prefix(golden_path, output_path, dtype, eps, count): - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - try: - count = int(count) - except Exception: - print(f"[ERROR] Invalid prefix count: {count}") - return False - if count <= 0: - print(f"[ERROR] Invalid prefix count: {count}") - return False - - dtype_np = np.dtype(dtype) - golden = np.fromfile(golden_path, dtype=dtype_np, count=count) - output = np.fromfile(output_path, dtype=dtype_np, count=count) - - if golden.size != count or output.size != count: - print( - f"[ERROR] Prefix read too small: need={count} elems, " - f"golden={golden.size}, out={output.size}" - ) - return False - - if not np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True): - if golden.size: - if np.issubdtype(dtype_np, np.floating): - g = golden.astype(np.float64, copy=False) - o = output.astype(np.float64, copy=False) - elif np.issubdtype(dtype_np, np.integer) or np.issubdtype(dtype_np, np.unsignedinteger): - g = golden.astype(np.int64, copy=False) - o = output.astype(np.int64, copy=False) - else: - g = golden.astype(np.float64, copy=False) - o = output.astype(np.float64, copy=False) - abs_diff = np.abs(g - o) - idx = int(np.argmax(abs_diff)) - diff = float(abs_diff[idx]) - print( - f"[ERROR] Mismatch (prefix): {golden_path} vs {output_path}, max diff={diff} at idx={idx} " - f"(golden={g[idx]}, out={o[idx]}, dtype={dtype_np}, count={count})" - ) - else: - print(f"[ERROR] Mismatch (prefix): {golden_path} vs {output_path}, empty buffers, dtype={dtype_np}") - return False - return True - - -def compare_packed_pred_mask(golden_path, output_path, 
logical_elems, src_elem_bytes): - """ - Compare outputs of pto.tcmp / pto.tcmps. - - PTO-ISA stores packed predicate results as a linear PK byte stream via - `psts`, with the exact written prefix length determined by the typed - TCMP/TCMPS repeat schedule. Compare only that semantic prefix. - """ - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - try: - logical_elems = int(logical_elems) - src_elem_bytes = int(src_elem_bytes) - except Exception: - print( - "[ERROR] Invalid packed mask compare arguments: " - f"logical_elems={logical_elems} src_elem_bytes={src_elem_bytes}" - ) - return False - if logical_elems <= 0 or src_elem_bytes <= 0: - print( - "[ERROR] Invalid packed mask compare arguments: " - f"logical_elems={logical_elems} src_elem_bytes={src_elem_bytes}" - ) - return False - - golden = np.fromfile(golden_path, dtype=np.uint8) - output = np.fromfile(output_path, dtype=np.uint8) - try: - prefix_bytes = _packed_pred_storage_bytes(logical_elems, src_elem_bytes) - except ValueError as exc: - print(f"[ERROR] {exc}") - return False - - if golden.size < prefix_bytes or output.size < prefix_bytes: - print( - f"[ERROR] Packed mask buffer too small: need={prefix_bytes} bytes, " - f"golden={golden.size}, out={output.size}" - ) - return False - - golden_sel = golden[:prefix_bytes] - output_sel = output[:prefix_bytes] - - if not np.array_equal(golden_sel, output_sel): - diff = np.nonzero(golden_sel != output_sel)[0] - idx = int(diff[0]) if diff.size else 0 - print( - f"[ERROR] Mismatch (packed mask): {golden_path} vs {output_path}, first diff at idx={idx} " - f"(golden={int(golden_sel[idx])}, out={int(output_sel[idx])}, " - f"logical_elems={logical_elems}, src_elem_bytes={src_elem_bytes}, prefix_bytes={prefix_bytes})" - ) - return False - return True - - -def main(): - strict = os.getenv("COMPARE_STRICT", "1") != "0" 
- ok = True - ok = compare_bin("golden_v2.bin", "v2.bin", np.float32, 0.0001) and ok - if not ok: - if strict: - print("[ERROR] compare failed") - sys.exit(2) - print("[WARN] compare failed (non-gating)") - return - print("[INFO] compare passed") - - -if __name__ == "__main__": - main() diff --git a/test/vpto/cases/micro-op/unary-vector/vneg-f32-exceptional/golden.py b/test/vpto/cases/micro-op/unary-vector/vneg-f32-exceptional/golden.py deleted file mode 100755 index 0f394e5ed..000000000 --- a/test/vpto/cases/micro-op/unary-vector/vneg-f32-exceptional/golden.py +++ /dev/null @@ -1,65 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) 2026 Huawei Technologies Co., Ltd. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. - -# case: micro-op/unary-vector/vneg-f32-exceptional -# family: unary-vector -# target_ops: pto.vneg -# scenarios: core-f32, exceptional-values -# NOTE: bulk-generated coverage skeleton. 
-# coding=utf-8 - -import argparse -from pathlib import Path - -import numpy as np - - -ROWS = 32 -COLS = 32 -SEED = 19 - - -def generate(output_dir: Path, seed: int) -> None: - rng = np.random.default_rng(seed) - v1 = rng.uniform(-8.0, 8.0, size=(ROWS, COLS)).astype(np.float32) - flat = v1.reshape(-1) - flat[:8] = np.array( - [ - np.float32(0.0), - np.float32(-0.0), - np.float32(np.inf), - np.float32(-np.inf), - np.float32(np.nan), - np.float32(1.0), - np.float32(-1.0), - np.float32(3.5), - ], - dtype=np.float32, - ) - v2 = np.zeros((ROWS, COLS), dtype=np.float32) - golden_v2 = np.negative(v1).astype(np.float32, copy=False) - - output_dir.mkdir(parents=True, exist_ok=True) - v1.reshape(-1).tofile(output_dir / "v1.bin") - v2.reshape(-1).tofile(output_dir / "v2.bin") - golden_v2.reshape(-1).tofile(output_dir / "golden_v2.bin") - - -def main() -> None: - parser = argparse.ArgumentParser( - description="Generate numpy-based inputs/golden for VPTO micro-op vneg exceptional validation." - ) - parser.add_argument("--output-dir", type=Path, default=Path(".")) - parser.add_argument("--seed", type=int, default=SEED) - args = parser.parse_args() - generate(args.output_dir, args.seed) - - -if __name__ == "__main__": - main() diff --git a/test/vpto/cases/micro-op/unary-vector/vneg-f32-exceptional/kernel.pto b/test/vpto/cases/micro-op/unary-vector/vneg-f32-exceptional/kernel.pto deleted file mode 100644 index 9efd8ff9b..000000000 --- a/test/vpto/cases/micro-op/unary-vector/vneg-f32-exceptional/kernel.pto +++ /dev/null @@ -1,70 +0,0 @@ -// ----------------------------------------------------------------------------- -// case: micro-op/unary-vector/vneg-f32-exceptional -// family: unary-vector -// target_ops: pto.vneg -// scenarios: core-f32, exceptional-values -// NOTE: bulk-generated coverage skeleton. Parser/verifier/lowering failure is -// still a valid test conclusion in the current coverage-first phase. 
-// ----------------------------------------------------------------------------- -// ============================================================================= -// abs_kernel_2d: Element-wise absolute value on a 32x32 f32 tile -// ============================================================================= -// This kernel computes abs(input) for a 32x32 float32 matrix (1024 elements). -// -// Memory Layout: -// - Input: arg0 -> GM (Global Memory) -// - Output: arg1 -> GM (Global Memory) -// - UB (Unified Buffer) at offset 0: input tile (4096 bytes = 32*32*4) -// - UB at offset 4096: output tile (4096 bytes = 32*32*4) -// -// Pipeline: -// 1. DMA: GM -> UB (MTE2 pipe) - copy input tile to UB -// 2. Sync: wait for MTE2 -> V pipe handoff -// 3. Compute: vabs on 64-element vectors (V pipe) - 16 iterations for 1024 elements -// 4. Sync: wait for V -> MTE3 pipe handoff -// 5. DMA: UB -> GM (MTE3 pipe) - copy result tile back to GM -// ============================================================================= - -module attributes {pto.target_arch = "a5", pto.kernel_kind = #pto.kernel_kind} { - func.func @vabs_kernel_2d(%arg0: !pto.ptr, %arg1: !pto.ptr) attributes {pto.kernel} { - %c0 = arith.constant 0 : index - %c1 = arith.constant 1 : index - %c64 = arith.constant 64 : index - %c1024 = arith.constant 1024 : index - %c0_i64 = arith.constant 0 : i64 - %c1_i64 = arith.constant 1 : i64 - %c32_i64 = arith.constant 32 : i64 - %c128_i64 = arith.constant 128 : i64 - %c4096_i64 = arith.constant 4096 : i64 - %c1024_i32 = arith.constant 1024 : i32 - - %ub_in = pto.castptr %c0_i64 : i64 -> !pto.ptr - %ub_out = pto.castptr %c4096_i64 : i64 -> !pto.ptr - - %false = arith.constant false - pto.mte_gm_ub %arg0, %ub_in, %c0_i64, %c128_i64 - nburst(%c32_i64, %c128_i64, %c128_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 - - pto.set_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] - pto.wait_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] - - pto.vecscope { - %_:1 = scf.for %offset = %c0 
to %c1024 step %c64 iter_args(%remaining = %c1024_i32) -> (i32) { - %mask, %next_remaining = pto.plt_b32 %remaining : i32 -> !pto.mask, i32 - %vec = pto.vlds %ub_in[%offset] : !pto.ptr -> !pto.vreg<64xf32> - %out = pto.vneg %vec, %mask : !pto.vreg<64xf32>, !pto.mask -> !pto.vreg<64xf32> - pto.vsts %out, %ub_out[%offset], %mask : !pto.vreg<64xf32>, !pto.ptr, !pto.mask - scf.yield %next_remaining : i32 - } - } - - pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.mte_ub_gm %ub_out, %arg1, %c128_i64 - nburst(%c32_i64, %c128_i64, %c128_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64 - pto.barrier #pto.pipe - return - } -} diff --git a/test/vpto/cases/micro-op/unary-vector/vneg-f32-exceptional/launch.cpp b/test/vpto/cases/micro-op/unary-vector/vneg-f32-exceptional/launch.cpp deleted file mode 100644 index 2614d8040..000000000 --- a/test/vpto/cases/micro-op/unary-vector/vneg-f32-exceptional/launch.cpp +++ /dev/null @@ -1,70 +0,0 @@ -// Copyright (c) 2026 Huawei Technologies Co., Ltd. -// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). -// Please refer to the License for details. You may not use this file except in compliance with the License. -// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -// See LICENSE in the root of the software repository for the full text of the License. - -// ----------------------------------------------------------------------------- -// case: micro-op/unary-vector/vneg-f32-exceptional -// family: unary-vector -// target_ops: pto.vneg -// scenarios: core-f32, exceptional-values -// NOTE: bulk-generated coverage skeleton. 
Parser/verifier/lowering failure is -// still a valid test conclusion in the current coverage-first phase. -// ----------------------------------------------------------------------------- -// --------------------------------------------------------------------------- -// PTOAS compatibility layer -// -// The upstream pto-isa headers reference some FP8/FP4 types and the -// __VEC_SCOPE__ marker that are not available on every AICore arch/toolchain -// combination (e.g. __NPU_ARCH__==2201). -// -// For our PTOAS-generated kernels we don't rely on these types today, but the -// headers still mention them in templates/static_asserts. Provide minimal -// fallbacks to keep compilation working on dav-c220. -// --------------------------------------------------------------------------- -#ifndef __VEC_SCOPE__ -#define __VEC_SCOPE__ -#endif - -#if defined(__CCE_AICORE__) && defined(__NPU_ARCH__) && (__NPU_ARCH__ == 2201) -typedef struct { unsigned char v; } hifloat8_t; -typedef struct { unsigned char v; } float8_e4m3_t; -typedef struct { unsigned char v; } float8_e5m2_t; -typedef struct { unsigned char v; } float8_e8m0_t; -typedef struct { unsigned char v; } float4_e1m2x2_t; -typedef struct { unsigned char v; } float4_e2m1x2_t; -#endif -#include - -// AICore printf support is gated behind `--cce-enable-print` on some -// toolchains. When enabled, include the CCE print header so `cce::printf` -// resolves in device compilation. -#if defined(__CCE_AICORE__) && defined(PTOAS_ENABLE_CCE_PRINT) -#include -#endif - -// Some PTO-ISA types are only available in the __CCE_AICORE__ compilation -// path, but `bisheng -xcce` still performs a host-side parse pass. -// Provide minimal fallbacks only when the corresponding header wasn't -// pulled in by the selected arch implementation. 
-#if !defined(__CCE_AICORE__) && !defined(TMRGSORT_HPP) -struct MrgSortExecutedNumList { - uint16_t mrgSortList0; - uint16_t mrgSortList1; - uint16_t mrgSortList2; - uint16_t mrgSortList3; -}; -#endif -#ifndef __CPU_SIM -#include "acl/acl.h" -#endif - -extern "C" __global__ [aicore] void vabs_kernel_2d(__gm__ float *v1, - __gm__ float *v2); - -void LaunchVabs_kernel_2d(float *v1, float *v2, void *stream) { - vabs_kernel_2d<<<1, nullptr, stream>>>((__gm__ float *)v1, (__gm__ float *)v2); -} diff --git a/test/vpto/cases/micro-op/unary-vector/vneg-f32-exceptional/main.cpp b/test/vpto/cases/micro-op/unary-vector/vneg-f32-exceptional/main.cpp deleted file mode 100644 index de8fba973..000000000 --- a/test/vpto/cases/micro-op/unary-vector/vneg-f32-exceptional/main.cpp +++ /dev/null @@ -1,130 +0,0 @@ -// Copyright (c) 2026 Huawei Technologies Co., Ltd. -// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). -// Please refer to the License for details. You may not use this file except in compliance with the License. -// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -// See LICENSE in the root of the software repository for the full text of the License. - -// ----------------------------------------------------------------------------- -// case: micro-op/unary-vector/vneg-f32-exceptional -// family: unary-vector -// target_ops: pto.vneg -// scenarios: core-f32, exceptional-values -// NOTE: bulk-generated coverage skeleton. Parser/verifier/lowering failure is -// still a valid test conclusion in the current coverage-first phase. -// ----------------------------------------------------------------------------- -/** -Copyright (c) 2025 Huawei Technologies Co., Ltd. 
-This program is free software, you can redistribute it and/or modify it under the terms and conditions of -CANN Open Software License Agreement Version 2.0 (the "License"). -Please refer to the License for details. You may not use this file except in compliance with the License. -THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -See LICENSE in the root of the software repository for the full text of the License. -*/ - -#include "test_common.h" -#include "acl/acl.h" -#include -#include -#include - -using namespace PtoTestCommon; - -#ifndef TMRGSORT_HPP -struct MrgSortExecutedNumList { - uint16_t mrgSortList0; - uint16_t mrgSortList1; - uint16_t mrgSortList2; - uint16_t mrgSortList3; -}; -#endif - -#define ACL_CHECK(expr) \ - do { \ - const aclError _ret = (expr); \ - if (_ret != ACL_SUCCESS) { \ - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", #expr, (int)_ret, __FILE__, __LINE__); \ - const char *_recent = aclGetRecentErrMsg(); \ - if (_recent != nullptr && _recent[0] != '\0') { \ - std::fprintf(stderr, "[ERROR] RecentErrMsg: %s\n", _recent); \ - } \ - rc = 1; \ - goto cleanup; \ - } \ - } while (0) - -void LaunchVabs_kernel_2d(float *v1, float *v2, void *stream); - -int main() { - size_t elemCount_v1 = 1024; - size_t fileSize_v1 = elemCount_v1 * sizeof(float); - size_t elemCount_v2 = 1024; - size_t fileSize_v2 = elemCount_v2 * sizeof(float); - float *v1Host = nullptr; - float *v1Device = nullptr; - float *v2Host = nullptr; - float *v2Device = nullptr; - - int rc = 0; - bool aclInited = false; - bool deviceSet = false; - int deviceId = 0; - aclrtStream stream = nullptr; - - ACL_CHECK(aclInit(nullptr)); - aclInited = true; - if (const char *envDevice = std::getenv("ACL_DEVICE_ID")) { - deviceId = std::atoi(envDevice); - } - ACL_CHECK(aclrtSetDevice(deviceId)); - deviceSet = true; - 
ACL_CHECK(aclrtCreateStream(&stream)); - - ACL_CHECK(aclrtMallocHost((void **)(&v1Host), fileSize_v1)); - ACL_CHECK(aclrtMallocHost((void **)(&v2Host), fileSize_v2)); - ACL_CHECK(aclrtMalloc((void **)&v1Device, fileSize_v1, ACL_MEM_MALLOC_HUGE_FIRST)); - ACL_CHECK(aclrtMalloc((void **)&v2Device, fileSize_v2, ACL_MEM_MALLOC_HUGE_FIRST)); - - ReadFile("./v1.bin", fileSize_v1, v1Host, fileSize_v1); - ReadFile("./v2.bin", fileSize_v2, v2Host, fileSize_v2); - ACL_CHECK(aclrtMemcpy(v1Device, fileSize_v1, v1Host, fileSize_v1, ACL_MEMCPY_HOST_TO_DEVICE)); - ACL_CHECK(aclrtMemcpy(v2Device, fileSize_v2, v2Host, fileSize_v2, ACL_MEMCPY_HOST_TO_DEVICE)); - LaunchVabs_kernel_2d(v1Device, v2Device, stream); - - ACL_CHECK(aclrtSynchronizeStream(stream)); - ACL_CHECK(aclrtMemcpy(v2Host, fileSize_v2, v2Device, fileSize_v2, ACL_MEMCPY_DEVICE_TO_HOST)); - - WriteFile("./v2.bin", v2Host, fileSize_v2); - -cleanup: - aclrtFree(v1Device); - aclrtFree(v2Device); - aclrtFreeHost(v1Host); - aclrtFreeHost(v2Host); - if (stream != nullptr) { - const aclError _ret = aclrtDestroyStream(stream); - if (_ret != ACL_SUCCESS) { - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", - "aclrtDestroyStream(stream)", (int)_ret, __FILE__, __LINE__); - } - stream = nullptr; - } - if (deviceSet) { - const aclError _ret = aclrtResetDevice(deviceId); - if (_ret != ACL_SUCCESS) { - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", - "aclrtResetDevice(deviceId)", (int)_ret, __FILE__, __LINE__); - } - } - if (aclInited) { - const aclError _ret = aclFinalize(); - if (_ret != ACL_SUCCESS) { - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", - "aclFinalize()", (int)_ret, __FILE__, __LINE__); - } - } - - return rc; -} diff --git a/test/vpto/cases/micro-op/unary-vector/vneg/compare.py b/test/vpto/cases/micro-op/unary-vector/vneg/compare.py deleted file mode 100755 index 0ce4e18b6..000000000 --- a/test/vpto/cases/micro-op/unary-vector/vneg/compare.py +++ /dev/null @@ -1,209 +0,0 @@ -#!/usr/bin/env 
python3 -# Copyright (c) 2026 Huawei Technologies Co., Ltd. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. - -# case: micro-op/unary-vector/vneg -# family: unary-vector -# target_ops: pto.vneg -# scenarios: core-f32, full-mask -# NOTE: bulk-generated coverage skeleton. -# coding=utf-8 - -import os -import sys -import numpy as np - - -REPEAT_BYTES = 256 - - -def _ceil_div(x, y): - return (x + y - 1) // y - - -def _packed_pred_storage_bytes(logical_elems, src_elem_bytes): - logical_elems = int(logical_elems) - src_elem_bytes = int(src_elem_bytes) - if logical_elems <= 0: - raise ValueError(f"logical_elems must be > 0, got {logical_elems}") - if src_elem_bytes not in (1, 2, 4): - raise ValueError(f"unsupported packed predicate source size: {src_elem_bytes}") - - repeat_elems = REPEAT_BYTES // src_elem_bytes - if src_elem_bytes == 4: - repeat_times = _ceil_div(logical_elems, repeat_elems) + 1 - loop_count = repeat_times // 2 - return loop_count * 16 - - repeat_times = _ceil_div(logical_elems, repeat_elems) - return repeat_times * (repeat_elems // 8) - - -def compare_bin(golden_path, output_path, dtype, eps): - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - dtype_np = np.dtype(dtype) - golden = np.fromfile(golden_path, dtype=dtype_np) - output = np.fromfile(output_path, 
dtype=dtype_np) - if golden.shape != output.shape: - print(f"[ERROR] Shape mismatch: {golden_path} {golden.shape} vs {output_path} {output.shape}") - return False - if not np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True): - if golden.size: - if np.issubdtype(dtype_np, np.floating): - g = golden.astype(np.float64, copy=False) - o = output.astype(np.float64, copy=False) - elif np.issubdtype(dtype_np, np.integer) or np.issubdtype(dtype_np, np.unsignedinteger): - g = golden.astype(np.int64, copy=False) - o = output.astype(np.int64, copy=False) - else: - g = golden.astype(np.float64, copy=False) - o = output.astype(np.float64, copy=False) - abs_diff = np.abs(g - o) - idx = int(np.argmax(abs_diff)) - diff = float(abs_diff[idx]) - print( - f"[ERROR] Mismatch: {golden_path} vs {output_path}, max diff={diff} at idx={idx} " - f"(golden={g[idx]}, out={o[idx]}, dtype={dtype_np})" - ) - else: - print(f"[ERROR] Mismatch: {golden_path} vs {output_path}, empty buffers, dtype={dtype_np}") - return False - return True - - -def compare_bin_prefix(golden_path, output_path, dtype, eps, count): - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - try: - count = int(count) - except Exception: - print(f"[ERROR] Invalid prefix count: {count}") - return False - if count <= 0: - print(f"[ERROR] Invalid prefix count: {count}") - return False - - dtype_np = np.dtype(dtype) - golden = np.fromfile(golden_path, dtype=dtype_np, count=count) - output = np.fromfile(output_path, dtype=dtype_np, count=count) - - if golden.size != count or output.size != count: - print( - f"[ERROR] Prefix read too small: need={count} elems, " - f"golden={golden.size}, out={output.size}" - ) - return False - - if not np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True): - if golden.size: - if np.issubdtype(dtype_np, np.floating): - g = 
golden.astype(np.float64, copy=False) - o = output.astype(np.float64, copy=False) - elif np.issubdtype(dtype_np, np.integer) or np.issubdtype(dtype_np, np.unsignedinteger): - g = golden.astype(np.int64, copy=False) - o = output.astype(np.int64, copy=False) - else: - g = golden.astype(np.float64, copy=False) - o = output.astype(np.float64, copy=False) - abs_diff = np.abs(g - o) - idx = int(np.argmax(abs_diff)) - diff = float(abs_diff[idx]) - print( - f"[ERROR] Mismatch (prefix): {golden_path} vs {output_path}, max diff={diff} at idx={idx} " - f"(golden={g[idx]}, out={o[idx]}, dtype={dtype_np}, count={count})" - ) - else: - print(f"[ERROR] Mismatch (prefix): {golden_path} vs {output_path}, empty buffers, dtype={dtype_np}") - return False - return True - - -def compare_packed_pred_mask(golden_path, output_path, logical_elems, src_elem_bytes): - """ - Compare outputs of pto.tcmp / pto.tcmps. - - PTO-ISA stores packed predicate results as a linear PK byte stream via - `psts`, with the exact written prefix length determined by the typed - TCMP/TCMPS repeat schedule. Compare only that semantic prefix. 
- """ - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - try: - logical_elems = int(logical_elems) - src_elem_bytes = int(src_elem_bytes) - except Exception: - print( - "[ERROR] Invalid packed mask compare arguments: " - f"logical_elems={logical_elems} src_elem_bytes={src_elem_bytes}" - ) - return False - if logical_elems <= 0 or src_elem_bytes <= 0: - print( - "[ERROR] Invalid packed mask compare arguments: " - f"logical_elems={logical_elems} src_elem_bytes={src_elem_bytes}" - ) - return False - - golden = np.fromfile(golden_path, dtype=np.uint8) - output = np.fromfile(output_path, dtype=np.uint8) - try: - prefix_bytes = _packed_pred_storage_bytes(logical_elems, src_elem_bytes) - except ValueError as exc: - print(f"[ERROR] {exc}") - return False - - if golden.size < prefix_bytes or output.size < prefix_bytes: - print( - f"[ERROR] Packed mask buffer too small: need={prefix_bytes} bytes, " - f"golden={golden.size}, out={output.size}" - ) - return False - - golden_sel = golden[:prefix_bytes] - output_sel = output[:prefix_bytes] - - if not np.array_equal(golden_sel, output_sel): - diff = np.nonzero(golden_sel != output_sel)[0] - idx = int(diff[0]) if diff.size else 0 - print( - f"[ERROR] Mismatch (packed mask): {golden_path} vs {output_path}, first diff at idx={idx} " - f"(golden={int(golden_sel[idx])}, out={int(output_sel[idx])}, " - f"logical_elems={logical_elems}, src_elem_bytes={src_elem_bytes}, prefix_bytes={prefix_bytes})" - ) - return False - return True - - -def main(): - strict = os.getenv("COMPARE_STRICT", "1") != "0" - ok = True - ok = compare_bin("golden_v2.bin", "v2.bin", np.float32, 0.0001) and ok - if not ok: - if strict: - print("[ERROR] compare failed") - sys.exit(2) - print("[WARN] compare failed (non-gating)") - return - print("[INFO] compare passed") - - -if __name__ == "__main__": - main() 
diff --git a/test/vpto/cases/micro-op/unary-vector/vneg/golden.py b/test/vpto/cases/micro-op/unary-vector/vneg/golden.py deleted file mode 100755 index a7e86608a..000000000 --- a/test/vpto/cases/micro-op/unary-vector/vneg/golden.py +++ /dev/null @@ -1,51 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) 2026 Huawei Technologies Co., Ltd. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. - -# case: micro-op/unary-vector/vneg -# family: unary-vector -# target_ops: pto.vneg -# scenarios: core-f32, full-mask -# NOTE: bulk-generated coverage skeleton. -# coding=utf-8 - -import argparse -from pathlib import Path - -import numpy as np - - -ROWS = 32 -COLS = 32 -SEED = 19 - - -def generate(output_dir: Path, seed: int) -> None: - rng = np.random.default_rng(seed) - v1 = rng.uniform(-8.0, 8.0, size=(ROWS, COLS)).astype(np.float32) - v2 = np.zeros((ROWS, COLS), dtype=np.float32) - golden_v2 = np.negative(v1).astype(np.float32, copy=False) - - output_dir.mkdir(parents=True, exist_ok=True) - v1.reshape(-1).tofile(output_dir / "v1.bin") - v2.reshape(-1).tofile(output_dir / "v2.bin") - golden_v2.reshape(-1).tofile(output_dir / "golden_v2.bin") - - -def main() -> None: - parser = argparse.ArgumentParser( - description="Generate numpy-based inputs/golden for VPTO micro-op vneg validation." 
- ) - parser.add_argument("--output-dir", type=Path, default=Path(".")) - parser.add_argument("--seed", type=int, default=SEED) - args = parser.parse_args() - generate(args.output_dir, args.seed) - - -if __name__ == "__main__": - main() diff --git a/test/vpto/cases/micro-op/unary-vector/vneg/kernel.pto b/test/vpto/cases/micro-op/unary-vector/vneg/kernel.pto deleted file mode 100644 index 5926ca4ba..000000000 --- a/test/vpto/cases/micro-op/unary-vector/vneg/kernel.pto +++ /dev/null @@ -1,70 +0,0 @@ -// ----------------------------------------------------------------------------- -// case: micro-op/unary-vector/vneg -// family: unary-vector -// target_ops: pto.vneg -// scenarios: core-f32, full-mask -// NOTE: bulk-generated coverage skeleton. Parser/verifier/lowering failure is -// still a valid test conclusion in the current coverage-first phase. -// ----------------------------------------------------------------------------- -// ============================================================================= -// abs_kernel_2d: Element-wise absolute value on a 32x32 f32 tile -// ============================================================================= -// This kernel computes abs(input) for a 32x32 float32 matrix (1024 elements). -// -// Memory Layout: -// - Input: arg0 -> GM (Global Memory) -// - Output: arg1 -> GM (Global Memory) -// - UB (Unified Buffer) at offset 0: input tile (4096 bytes = 32*32*4) -// - UB at offset 4096: output tile (4096 bytes = 32*32*4) -// -// Pipeline: -// 1. DMA: GM -> UB (MTE2 pipe) - copy input tile to UB -// 2. Sync: wait for MTE2 -> V pipe handoff -// 3. Compute: vabs on 64-element vectors (V pipe) - 16 iterations for 1024 elements -// 4. Sync: wait for V -> MTE3 pipe handoff -// 5. 
DMA: UB -> GM (MTE3 pipe) - copy result tile back to GM -// ============================================================================= - -module attributes {pto.target_arch = "a5", pto.kernel_kind = #pto.kernel_kind} { - func.func @vabs_kernel_2d(%arg0: !pto.ptr, %arg1: !pto.ptr) attributes {pto.kernel} { - %c0 = arith.constant 0 : index - %c1 = arith.constant 1 : index - %c64 = arith.constant 64 : index - %c1024 = arith.constant 1024 : index - %c0_i64 = arith.constant 0 : i64 - %c1_i64 = arith.constant 1 : i64 - %c32_i64 = arith.constant 32 : i64 - %c128_i64 = arith.constant 128 : i64 - %c4096_i64 = arith.constant 4096 : i64 - %c1024_i32 = arith.constant 1024 : i32 - - %ub_in = pto.castptr %c0_i64 : i64 -> !pto.ptr - %ub_out = pto.castptr %c4096_i64 : i64 -> !pto.ptr - - %false = arith.constant false - pto.mte_gm_ub %arg0, %ub_in, %c0_i64, %c128_i64 - nburst(%c32_i64, %c128_i64, %c128_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 - - pto.set_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] - pto.wait_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] - - pto.vecscope { - %_:1 = scf.for %offset = %c0 to %c1024 step %c64 iter_args(%remaining = %c1024_i32) -> (i32) { - %mask, %next_remaining = pto.plt_b32 %remaining : i32 -> !pto.mask, i32 - %vec = pto.vlds %ub_in[%offset] : !pto.ptr -> !pto.vreg<64xf32> - %out = pto.vneg %vec, %mask : !pto.vreg<64xf32>, !pto.mask -> !pto.vreg<64xf32> - pto.vsts %out, %ub_out[%offset], %mask : !pto.vreg<64xf32>, !pto.ptr, !pto.mask - scf.yield %next_remaining : i32 - } - } - - pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.mte_ub_gm %ub_out, %arg1, %c128_i64 - nburst(%c32_i64, %c128_i64, %c128_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64 - pto.barrier #pto.pipe - return - } -} diff --git a/test/vpto/cases/micro-op/unary-vector/vneg/launch.cpp b/test/vpto/cases/micro-op/unary-vector/vneg/launch.cpp deleted file mode 100644 index 65504cb9e..000000000 --- 
a/test/vpto/cases/micro-op/unary-vector/vneg/launch.cpp +++ /dev/null @@ -1,70 +0,0 @@ -// Copyright (c) 2026 Huawei Technologies Co., Ltd. -// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). -// Please refer to the License for details. You may not use this file except in compliance with the License. -// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -// See LICENSE in the root of the software repository for the full text of the License. - -// ----------------------------------------------------------------------------- -// case: micro-op/unary-vector/vneg -// family: unary-vector -// target_ops: pto.vneg -// scenarios: core-f32, full-mask -// NOTE: bulk-generated coverage skeleton. Parser/verifier/lowering failure is -// still a valid test conclusion in the current coverage-first phase. -// ----------------------------------------------------------------------------- -// --------------------------------------------------------------------------- -// PTOAS compatibility layer -// -// The upstream pto-isa headers reference some FP8/FP4 types and the -// __VEC_SCOPE__ marker that are not available on every AICore arch/toolchain -// combination (e.g. __NPU_ARCH__==2201). -// -// For our PTOAS-generated kernels we don't rely on these types today, but the -// headers still mention them in templates/static_asserts. Provide minimal -// fallbacks to keep compilation working on dav-c220. 
-// --------------------------------------------------------------------------- -#ifndef __VEC_SCOPE__ -#define __VEC_SCOPE__ -#endif - -#if defined(__CCE_AICORE__) && defined(__NPU_ARCH__) && (__NPU_ARCH__ == 2201) -typedef struct { unsigned char v; } hifloat8_t; -typedef struct { unsigned char v; } float8_e4m3_t; -typedef struct { unsigned char v; } float8_e5m2_t; -typedef struct { unsigned char v; } float8_e8m0_t; -typedef struct { unsigned char v; } float4_e1m2x2_t; -typedef struct { unsigned char v; } float4_e2m1x2_t; -#endif -#include - -// AICore printf support is gated behind `--cce-enable-print` on some -// toolchains. When enabled, include the CCE print header so `cce::printf` -// resolves in device compilation. -#if defined(__CCE_AICORE__) && defined(PTOAS_ENABLE_CCE_PRINT) -#include -#endif - -// Some PTO-ISA types are only available in the __CCE_AICORE__ compilation -// path, but `bisheng -xcce` still performs a host-side parse pass. -// Provide minimal fallbacks only when the corresponding header wasn't -// pulled in by the selected arch implementation. -#if !defined(__CCE_AICORE__) && !defined(TMRGSORT_HPP) -struct MrgSortExecutedNumList { - uint16_t mrgSortList0; - uint16_t mrgSortList1; - uint16_t mrgSortList2; - uint16_t mrgSortList3; -}; -#endif -#ifndef __CPU_SIM -#include "acl/acl.h" -#endif - -extern "C" __global__ [aicore] void vabs_kernel_2d(__gm__ float *v1, - __gm__ float *v2); - -void LaunchVabs_kernel_2d(float *v1, float *v2, void *stream) { - vabs_kernel_2d<<<1, nullptr, stream>>>((__gm__ float *)v1, (__gm__ float *)v2); -} diff --git a/test/vpto/cases/micro-op/unary-vector/vneg/main.cpp b/test/vpto/cases/micro-op/unary-vector/vneg/main.cpp deleted file mode 100644 index 134aa5b2c..000000000 --- a/test/vpto/cases/micro-op/unary-vector/vneg/main.cpp +++ /dev/null @@ -1,130 +0,0 @@ -// Copyright (c) 2026 Huawei Technologies Co., Ltd. 
-// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). -// Please refer to the License for details. You may not use this file except in compliance with the License. -// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -// See LICENSE in the root of the software repository for the full text of the License. - -// ----------------------------------------------------------------------------- -// case: micro-op/unary-vector/vneg -// family: unary-vector -// target_ops: pto.vneg -// scenarios: core-f32, full-mask -// NOTE: bulk-generated coverage skeleton. Parser/verifier/lowering failure is -// still a valid test conclusion in the current coverage-first phase. -// ----------------------------------------------------------------------------- -/** -Copyright (c) 2025 Huawei Technologies Co., Ltd. -This program is free software, you can redistribute it and/or modify it under the terms and conditions of -CANN Open Software License Agreement Version 2.0 (the "License"). -Please refer to the License for details. You may not use this file except in compliance with the License. -THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -See LICENSE in the root of the software repository for the full text of the License. 
-*/ - -#include "test_common.h" -#include "acl/acl.h" -#include -#include -#include - -using namespace PtoTestCommon; - -#ifndef TMRGSORT_HPP -struct MrgSortExecutedNumList { - uint16_t mrgSortList0; - uint16_t mrgSortList1; - uint16_t mrgSortList2; - uint16_t mrgSortList3; -}; -#endif - -#define ACL_CHECK(expr) \ - do { \ - const aclError _ret = (expr); \ - if (_ret != ACL_SUCCESS) { \ - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", #expr, (int)_ret, __FILE__, __LINE__); \ - const char *_recent = aclGetRecentErrMsg(); \ - if (_recent != nullptr && _recent[0] != '\0') { \ - std::fprintf(stderr, "[ERROR] RecentErrMsg: %s\n", _recent); \ - } \ - rc = 1; \ - goto cleanup; \ - } \ - } while (0) - -void LaunchVabs_kernel_2d(float *v1, float *v2, void *stream); - -int main() { - size_t elemCount_v1 = 1024; - size_t fileSize_v1 = elemCount_v1 * sizeof(float); - size_t elemCount_v2 = 1024; - size_t fileSize_v2 = elemCount_v2 * sizeof(float); - float *v1Host = nullptr; - float *v1Device = nullptr; - float *v2Host = nullptr; - float *v2Device = nullptr; - - int rc = 0; - bool aclInited = false; - bool deviceSet = false; - int deviceId = 0; - aclrtStream stream = nullptr; - - ACL_CHECK(aclInit(nullptr)); - aclInited = true; - if (const char *envDevice = std::getenv("ACL_DEVICE_ID")) { - deviceId = std::atoi(envDevice); - } - ACL_CHECK(aclrtSetDevice(deviceId)); - deviceSet = true; - ACL_CHECK(aclrtCreateStream(&stream)); - - ACL_CHECK(aclrtMallocHost((void **)(&v1Host), fileSize_v1)); - ACL_CHECK(aclrtMallocHost((void **)(&v2Host), fileSize_v2)); - ACL_CHECK(aclrtMalloc((void **)&v1Device, fileSize_v1, ACL_MEM_MALLOC_HUGE_FIRST)); - ACL_CHECK(aclrtMalloc((void **)&v2Device, fileSize_v2, ACL_MEM_MALLOC_HUGE_FIRST)); - - ReadFile("./v1.bin", fileSize_v1, v1Host, fileSize_v1); - ReadFile("./v2.bin", fileSize_v2, v2Host, fileSize_v2); - ACL_CHECK(aclrtMemcpy(v1Device, fileSize_v1, v1Host, fileSize_v1, ACL_MEMCPY_HOST_TO_DEVICE)); - ACL_CHECK(aclrtMemcpy(v2Device, 
fileSize_v2, v2Host, fileSize_v2, ACL_MEMCPY_HOST_TO_DEVICE)); - LaunchVabs_kernel_2d(v1Device, v2Device, stream); - - ACL_CHECK(aclrtSynchronizeStream(stream)); - ACL_CHECK(aclrtMemcpy(v2Host, fileSize_v2, v2Device, fileSize_v2, ACL_MEMCPY_DEVICE_TO_HOST)); - - WriteFile("./v2.bin", v2Host, fileSize_v2); - -cleanup: - aclrtFree(v1Device); - aclrtFree(v2Device); - aclrtFreeHost(v1Host); - aclrtFreeHost(v2Host); - if (stream != nullptr) { - const aclError _ret = aclrtDestroyStream(stream); - if (_ret != ACL_SUCCESS) { - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", - "aclrtDestroyStream(stream)", (int)_ret, __FILE__, __LINE__); - } - stream = nullptr; - } - if (deviceSet) { - const aclError _ret = aclrtResetDevice(deviceId); - if (_ret != ACL_SUCCESS) { - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", - "aclrtResetDevice(deviceId)", (int)_ret, __FILE__, __LINE__); - } - } - if (aclInited) { - const aclError _ret = aclFinalize(); - if (_ret != ACL_SUCCESS) { - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", - "aclFinalize()", (int)_ret, __FILE__, __LINE__); - } - } - - return rc; -} diff --git a/test/vpto/cases/micro-op/unary-vector/vsqrt-domain-boundary/compare.py b/test/vpto/cases/micro-op/unary-vector/vsqrt-domain-boundary/compare.py deleted file mode 100755 index 71d8b50c2..000000000 --- a/test/vpto/cases/micro-op/unary-vector/vsqrt-domain-boundary/compare.py +++ /dev/null @@ -1,209 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) 2026 Huawei Technologies Co., Ltd. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. 
-# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. - -# case: micro-op/unary-vector/vsqrt-domain-boundary -# family: unary-vector -# target_ops: pto.vsqrt -# scenarios: core-f32, domain-nonnegative, exceptional-values -# NOTE: bulk-generated coverage skeleton. -# coding=utf-8 - -import os -import sys -import numpy as np - - -REPEAT_BYTES = 256 - - -def _ceil_div(x, y): - return (x + y - 1) // y - - -def _packed_pred_storage_bytes(logical_elems, src_elem_bytes): - logical_elems = int(logical_elems) - src_elem_bytes = int(src_elem_bytes) - if logical_elems <= 0: - raise ValueError(f"logical_elems must be > 0, got {logical_elems}") - if src_elem_bytes not in (1, 2, 4): - raise ValueError(f"unsupported packed predicate source size: {src_elem_bytes}") - - repeat_elems = REPEAT_BYTES // src_elem_bytes - if src_elem_bytes == 4: - repeat_times = _ceil_div(logical_elems, repeat_elems) + 1 - loop_count = repeat_times // 2 - return loop_count * 16 - - repeat_times = _ceil_div(logical_elems, repeat_elems) - return repeat_times * (repeat_elems // 8) - - -def compare_bin(golden_path, output_path, dtype, eps): - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - dtype_np = np.dtype(dtype) - golden = np.fromfile(golden_path, dtype=dtype_np) - output = np.fromfile(output_path, dtype=dtype_np) - if golden.shape != output.shape: - print(f"[ERROR] Shape mismatch: {golden_path} {golden.shape} vs {output_path} {output.shape}") - return False - if not np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True): - if golden.size: - if np.issubdtype(dtype_np, np.floating): - g = 
golden.astype(np.float64, copy=False) - o = output.astype(np.float64, copy=False) - elif np.issubdtype(dtype_np, np.integer) or np.issubdtype(dtype_np, np.unsignedinteger): - g = golden.astype(np.int64, copy=False) - o = output.astype(np.int64, copy=False) - else: - g = golden.astype(np.float64, copy=False) - o = output.astype(np.float64, copy=False) - abs_diff = np.abs(g - o) - idx = int(np.argmax(abs_diff)) - diff = float(abs_diff[idx]) - print( - f"[ERROR] Mismatch: {golden_path} vs {output_path}, max diff={diff} at idx={idx} " - f"(golden={g[idx]}, out={o[idx]}, dtype={dtype_np})" - ) - else: - print(f"[ERROR] Mismatch: {golden_path} vs {output_path}, empty buffers, dtype={dtype_np}") - return False - return True - - -def compare_bin_prefix(golden_path, output_path, dtype, eps, count): - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - try: - count = int(count) - except Exception: - print(f"[ERROR] Invalid prefix count: {count}") - return False - if count <= 0: - print(f"[ERROR] Invalid prefix count: {count}") - return False - - dtype_np = np.dtype(dtype) - golden = np.fromfile(golden_path, dtype=dtype_np, count=count) - output = np.fromfile(output_path, dtype=dtype_np, count=count) - - if golden.size != count or output.size != count: - print( - f"[ERROR] Prefix read too small: need={count} elems, " - f"golden={golden.size}, out={output.size}" - ) - return False - - if not np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True): - if golden.size: - if np.issubdtype(dtype_np, np.floating): - g = golden.astype(np.float64, copy=False) - o = output.astype(np.float64, copy=False) - elif np.issubdtype(dtype_np, np.integer) or np.issubdtype(dtype_np, np.unsignedinteger): - g = golden.astype(np.int64, copy=False) - o = output.astype(np.int64, copy=False) - else: - g = golden.astype(np.float64, copy=False) - 
o = output.astype(np.float64, copy=False) - abs_diff = np.abs(g - o) - idx = int(np.argmax(abs_diff)) - diff = float(abs_diff[idx]) - print( - f"[ERROR] Mismatch (prefix): {golden_path} vs {output_path}, max diff={diff} at idx={idx} " - f"(golden={g[idx]}, out={o[idx]}, dtype={dtype_np}, count={count})" - ) - else: - print(f"[ERROR] Mismatch (prefix): {golden_path} vs {output_path}, empty buffers, dtype={dtype_np}") - return False - return True - - -def compare_packed_pred_mask(golden_path, output_path, logical_elems, src_elem_bytes): - """ - Compare outputs of pto.tcmp / pto.tcmps. - - PTO-ISA stores packed predicate results as a linear PK byte stream via - `psts`, with the exact written prefix length determined by the typed - TCMP/TCMPS repeat schedule. Compare only that semantic prefix. - """ - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - try: - logical_elems = int(logical_elems) - src_elem_bytes = int(src_elem_bytes) - except Exception: - print( - "[ERROR] Invalid packed mask compare arguments: " - f"logical_elems={logical_elems} src_elem_bytes={src_elem_bytes}" - ) - return False - if logical_elems <= 0 or src_elem_bytes <= 0: - print( - "[ERROR] Invalid packed mask compare arguments: " - f"logical_elems={logical_elems} src_elem_bytes={src_elem_bytes}" - ) - return False - - golden = np.fromfile(golden_path, dtype=np.uint8) - output = np.fromfile(output_path, dtype=np.uint8) - try: - prefix_bytes = _packed_pred_storage_bytes(logical_elems, src_elem_bytes) - except ValueError as exc: - print(f"[ERROR] {exc}") - return False - - if golden.size < prefix_bytes or output.size < prefix_bytes: - print( - f"[ERROR] Packed mask buffer too small: need={prefix_bytes} bytes, " - f"golden={golden.size}, out={output.size}" - ) - return False - - golden_sel = golden[:prefix_bytes] - output_sel = output[:prefix_bytes] 
- - if not np.array_equal(golden_sel, output_sel): - diff = np.nonzero(golden_sel != output_sel)[0] - idx = int(diff[0]) if diff.size else 0 - print( - f"[ERROR] Mismatch (packed mask): {golden_path} vs {output_path}, first diff at idx={idx} " - f"(golden={int(golden_sel[idx])}, out={int(output_sel[idx])}, " - f"logical_elems={logical_elems}, src_elem_bytes={src_elem_bytes}, prefix_bytes={prefix_bytes})" - ) - return False - return True - - -def main(): - strict = os.getenv("COMPARE_STRICT", "1") != "0" - ok = True - ok = compare_bin("golden_v2.bin", "v2.bin", np.float32, 0.0001) and ok - if not ok: - if strict: - print("[ERROR] compare failed") - sys.exit(2) - print("[WARN] compare failed (non-gating)") - return - print("[INFO] compare passed") - - -if __name__ == "__main__": - main() diff --git a/test/vpto/cases/micro-op/unary-vector/vsqrt-domain-boundary/golden.py b/test/vpto/cases/micro-op/unary-vector/vsqrt-domain-boundary/golden.py deleted file mode 100755 index 9607fbcc5..000000000 --- a/test/vpto/cases/micro-op/unary-vector/vsqrt-domain-boundary/golden.py +++ /dev/null @@ -1,65 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) 2026 Huawei Technologies Co., Ltd. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. - -# case: micro-op/unary-vector/vsqrt-domain-boundary -# family: unary-vector -# target_ops: pto.vsqrt -# scenarios: core-f32, domain-nonnegative, exceptional-values -# NOTE: bulk-generated coverage skeleton. 
-# coding=utf-8 - -import argparse -from pathlib import Path - -import numpy as np - - -ROWS = 32 -COLS = 32 -SEED = 19 - - -def generate(output_dir: Path, seed: int) -> None: - rng = np.random.default_rng(seed) - v1 = rng.uniform(0.0, 16.0, size=(ROWS, COLS)).astype(np.float32) - flat = v1.reshape(-1) - flat[:8] = np.array( - [ - np.float32(0.0), - np.nextafter(np.float32(0.0), np.float32(1.0), dtype=np.float32), - np.float32(1.0), - np.float32(4.0), - np.float32(9.0), - np.float32(16.0), - np.float32(1024.0), - np.float32(np.finfo(np.float32).max), - ], - dtype=np.float32, - ) - v2 = np.zeros((ROWS, COLS), dtype=np.float32) - golden_v2 = np.sqrt(v1).astype(np.float32, copy=False) - - output_dir.mkdir(parents=True, exist_ok=True) - v1.reshape(-1).tofile(output_dir / "v1.bin") - v2.reshape(-1).tofile(output_dir / "v2.bin") - golden_v2.reshape(-1).tofile(output_dir / "golden_v2.bin") - - -def main() -> None: - parser = argparse.ArgumentParser( - description="Generate numpy-based inputs/golden for VPTO micro-op vsqrt domain-boundary validation." - ) - parser.add_argument("--output-dir", type=Path, default=Path(".")) - parser.add_argument("--seed", type=int, default=SEED) - args = parser.parse_args() - generate(args.output_dir, args.seed) - - -if __name__ == "__main__": - main() diff --git a/test/vpto/cases/micro-op/unary-vector/vsqrt-domain-boundary/kernel.pto b/test/vpto/cases/micro-op/unary-vector/vsqrt-domain-boundary/kernel.pto deleted file mode 100644 index f7c39b33a..000000000 --- a/test/vpto/cases/micro-op/unary-vector/vsqrt-domain-boundary/kernel.pto +++ /dev/null @@ -1,70 +0,0 @@ -// ----------------------------------------------------------------------------- -// case: micro-op/unary-vector/vsqrt-domain-boundary -// family: unary-vector -// target_ops: pto.vsqrt -// scenarios: core-f32, domain-nonnegative, exceptional-values -// NOTE: bulk-generated coverage skeleton. 
Parser/verifier/lowering failure is -// still a valid test conclusion in the current coverage-first phase. -// ----------------------------------------------------------------------------- -// ============================================================================= -// abs_kernel_2d: Element-wise absolute value on a 32x32 f32 tile -// ============================================================================= -// This kernel computes abs(input) for a 32x32 float32 matrix (1024 elements). -// -// Memory Layout: -// - Input: arg0 -> GM (Global Memory) -// - Output: arg1 -> GM (Global Memory) -// - UB (Unified Buffer) at offset 0: input tile (4096 bytes = 32*32*4) -// - UB at offset 4096: output tile (4096 bytes = 32*32*4) -// -// Pipeline: -// 1. DMA: GM -> UB (MTE2 pipe) - copy input tile to UB -// 2. Sync: wait for MTE2 -> V pipe handoff -// 3. Compute: vabs on 64-element vectors (V pipe) - 16 iterations for 1024 elements -// 4. Sync: wait for V -> MTE3 pipe handoff -// 5. DMA: UB -> GM (MTE3 pipe) - copy result tile back to GM -// ============================================================================= - -module attributes {pto.target_arch = "a5", pto.kernel_kind = #pto.kernel_kind} { - func.func @vabs_kernel_2d(%arg0: !pto.ptr, %arg1: !pto.ptr) attributes {pto.kernel} { - %c0 = arith.constant 0 : index - %c1 = arith.constant 1 : index - %c64 = arith.constant 64 : index - %c1024 = arith.constant 1024 : index - %c0_i64 = arith.constant 0 : i64 - %c1_i64 = arith.constant 1 : i64 - %c32_i64 = arith.constant 32 : i64 - %c128_i64 = arith.constant 128 : i64 - %c4096_i64 = arith.constant 4096 : i64 - %c1024_i32 = arith.constant 1024 : i32 - - %ub_in = pto.castptr %c0_i64 : i64 -> !pto.ptr - %ub_out = pto.castptr %c4096_i64 : i64 -> !pto.ptr - - %false = arith.constant false - pto.mte_gm_ub %arg0, %ub_in, %c0_i64, %c128_i64 - nburst(%c32_i64, %c128_i64, %c128_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 - - pto.set_flag["PIPE_MTE2", "PIPE_V", 
"EVENT_ID0"] - pto.wait_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] - - pto.vecscope { - %_:1 = scf.for %offset = %c0 to %c1024 step %c64 iter_args(%remaining = %c1024_i32) -> (i32) { - %mask, %next_remaining = pto.plt_b32 %remaining : i32 -> !pto.mask, i32 - %vec = pto.vlds %ub_in[%offset] : !pto.ptr -> !pto.vreg<64xf32> - %out = pto.vsqrt %vec, %mask : !pto.vreg<64xf32>, !pto.mask -> !pto.vreg<64xf32> - pto.vsts %out, %ub_out[%offset], %mask : !pto.vreg<64xf32>, !pto.ptr, !pto.mask - scf.yield %next_remaining : i32 - } - } - - pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.mte_ub_gm %ub_out, %arg1, %c128_i64 - nburst(%c32_i64, %c128_i64, %c128_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64 - pto.barrier #pto.pipe - return - } -} diff --git a/test/vpto/cases/micro-op/unary-vector/vsqrt-domain-boundary/launch.cpp b/test/vpto/cases/micro-op/unary-vector/vsqrt-domain-boundary/launch.cpp deleted file mode 100644 index 1070e35b1..000000000 --- a/test/vpto/cases/micro-op/unary-vector/vsqrt-domain-boundary/launch.cpp +++ /dev/null @@ -1,70 +0,0 @@ -// Copyright (c) 2026 Huawei Technologies Co., Ltd. -// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). -// Please refer to the License for details. You may not use this file except in compliance with the License. -// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -// See LICENSE in the root of the software repository for the full text of the License. 
- -// ----------------------------------------------------------------------------- -// case: micro-op/unary-vector/vsqrt-domain-boundary -// family: unary-vector -// target_ops: pto.vsqrt -// scenarios: core-f32, domain-nonnegative, exceptional-values -// NOTE: bulk-generated coverage skeleton. Parser/verifier/lowering failure is -// still a valid test conclusion in the current coverage-first phase. -// ----------------------------------------------------------------------------- -// --------------------------------------------------------------------------- -// PTOAS compatibility layer -// -// The upstream pto-isa headers reference some FP8/FP4 types and the -// __VEC_SCOPE__ marker that are not available on every AICore arch/toolchain -// combination (e.g. __NPU_ARCH__==2201). -// -// For our PTOAS-generated kernels we don't rely on these types today, but the -// headers still mention them in templates/static_asserts. Provide minimal -// fallbacks to keep compilation working on dav-c220. -// --------------------------------------------------------------------------- -#ifndef __VEC_SCOPE__ -#define __VEC_SCOPE__ -#endif - -#if defined(__CCE_AICORE__) && defined(__NPU_ARCH__) && (__NPU_ARCH__ == 2201) -typedef struct { unsigned char v; } hifloat8_t; -typedef struct { unsigned char v; } float8_e4m3_t; -typedef struct { unsigned char v; } float8_e5m2_t; -typedef struct { unsigned char v; } float8_e8m0_t; -typedef struct { unsigned char v; } float4_e1m2x2_t; -typedef struct { unsigned char v; } float4_e2m1x2_t; -#endif -#include - -// AICore printf support is gated behind `--cce-enable-print` on some -// toolchains. When enabled, include the CCE print header so `cce::printf` -// resolves in device compilation. -#if defined(__CCE_AICORE__) && defined(PTOAS_ENABLE_CCE_PRINT) -#include -#endif - -// Some PTO-ISA types are only available in the __CCE_AICORE__ compilation -// path, but `bisheng -xcce` still performs a host-side parse pass. 
-// Provide minimal fallbacks only when the corresponding header wasn't -// pulled in by the selected arch implementation. -#if !defined(__CCE_AICORE__) && !defined(TMRGSORT_HPP) -struct MrgSortExecutedNumList { - uint16_t mrgSortList0; - uint16_t mrgSortList1; - uint16_t mrgSortList2; - uint16_t mrgSortList3; -}; -#endif -#ifndef __CPU_SIM -#include "acl/acl.h" -#endif - -extern "C" __global__ [aicore] void vabs_kernel_2d(__gm__ float *v1, - __gm__ float *v2); - -void LaunchVabs_kernel_2d(float *v1, float *v2, void *stream) { - vabs_kernel_2d<<<1, nullptr, stream>>>((__gm__ float *)v1, (__gm__ float *)v2); -} diff --git a/test/vpto/cases/micro-op/unary-vector/vsqrt-domain-boundary/main.cpp b/test/vpto/cases/micro-op/unary-vector/vsqrt-domain-boundary/main.cpp deleted file mode 100644 index 95bed1286..000000000 --- a/test/vpto/cases/micro-op/unary-vector/vsqrt-domain-boundary/main.cpp +++ /dev/null @@ -1,130 +0,0 @@ -// Copyright (c) 2026 Huawei Technologies Co., Ltd. -// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). -// Please refer to the License for details. You may not use this file except in compliance with the License. -// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -// See LICENSE in the root of the software repository for the full text of the License. - -// ----------------------------------------------------------------------------- -// case: micro-op/unary-vector/vsqrt-domain-boundary -// family: unary-vector -// target_ops: pto.vsqrt -// scenarios: core-f32, domain-nonnegative, exceptional-values -// NOTE: bulk-generated coverage skeleton. Parser/verifier/lowering failure is -// still a valid test conclusion in the current coverage-first phase. 
-// ----------------------------------------------------------------------------- -/** -Copyright (c) 2025 Huawei Technologies Co., Ltd. -This program is free software, you can redistribute it and/or modify it under the terms and conditions of -CANN Open Software License Agreement Version 2.0 (the "License"). -Please refer to the License for details. You may not use this file except in compliance with the License. -THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -See LICENSE in the root of the software repository for the full text of the License. -*/ - -#include "test_common.h" -#include "acl/acl.h" -#include -#include -#include - -using namespace PtoTestCommon; - -#ifndef TMRGSORT_HPP -struct MrgSortExecutedNumList { - uint16_t mrgSortList0; - uint16_t mrgSortList1; - uint16_t mrgSortList2; - uint16_t mrgSortList3; -}; -#endif - -#define ACL_CHECK(expr) \ - do { \ - const aclError _ret = (expr); \ - if (_ret != ACL_SUCCESS) { \ - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", #expr, (int)_ret, __FILE__, __LINE__); \ - const char *_recent = aclGetRecentErrMsg(); \ - if (_recent != nullptr && _recent[0] != '\0') { \ - std::fprintf(stderr, "[ERROR] RecentErrMsg: %s\n", _recent); \ - } \ - rc = 1; \ - goto cleanup; \ - } \ - } while (0) - -void LaunchVabs_kernel_2d(float *v1, float *v2, void *stream); - -int main() { - size_t elemCount_v1 = 1024; - size_t fileSize_v1 = elemCount_v1 * sizeof(float); - size_t elemCount_v2 = 1024; - size_t fileSize_v2 = elemCount_v2 * sizeof(float); - float *v1Host = nullptr; - float *v1Device = nullptr; - float *v2Host = nullptr; - float *v2Device = nullptr; - - int rc = 0; - bool aclInited = false; - bool deviceSet = false; - int deviceId = 0; - aclrtStream stream = nullptr; - - ACL_CHECK(aclInit(nullptr)); - aclInited = true; - if (const char *envDevice = 
std::getenv("ACL_DEVICE_ID")) { - deviceId = std::atoi(envDevice); - } - ACL_CHECK(aclrtSetDevice(deviceId)); - deviceSet = true; - ACL_CHECK(aclrtCreateStream(&stream)); - - ACL_CHECK(aclrtMallocHost((void **)(&v1Host), fileSize_v1)); - ACL_CHECK(aclrtMallocHost((void **)(&v2Host), fileSize_v2)); - ACL_CHECK(aclrtMalloc((void **)&v1Device, fileSize_v1, ACL_MEM_MALLOC_HUGE_FIRST)); - ACL_CHECK(aclrtMalloc((void **)&v2Device, fileSize_v2, ACL_MEM_MALLOC_HUGE_FIRST)); - - ReadFile("./v1.bin", fileSize_v1, v1Host, fileSize_v1); - ReadFile("./v2.bin", fileSize_v2, v2Host, fileSize_v2); - ACL_CHECK(aclrtMemcpy(v1Device, fileSize_v1, v1Host, fileSize_v1, ACL_MEMCPY_HOST_TO_DEVICE)); - ACL_CHECK(aclrtMemcpy(v2Device, fileSize_v2, v2Host, fileSize_v2, ACL_MEMCPY_HOST_TO_DEVICE)); - LaunchVabs_kernel_2d(v1Device, v2Device, stream); - - ACL_CHECK(aclrtSynchronizeStream(stream)); - ACL_CHECK(aclrtMemcpy(v2Host, fileSize_v2, v2Device, fileSize_v2, ACL_MEMCPY_DEVICE_TO_HOST)); - - WriteFile("./v2.bin", v2Host, fileSize_v2); - -cleanup: - aclrtFree(v1Device); - aclrtFree(v2Device); - aclrtFreeHost(v1Host); - aclrtFreeHost(v2Host); - if (stream != nullptr) { - const aclError _ret = aclrtDestroyStream(stream); - if (_ret != ACL_SUCCESS) { - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", - "aclrtDestroyStream(stream)", (int)_ret, __FILE__, __LINE__); - } - stream = nullptr; - } - if (deviceSet) { - const aclError _ret = aclrtResetDevice(deviceId); - if (_ret != ACL_SUCCESS) { - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", - "aclrtResetDevice(deviceId)", (int)_ret, __FILE__, __LINE__); - } - } - if (aclInited) { - const aclError _ret = aclFinalize(); - if (_ret != ACL_SUCCESS) { - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", - "aclFinalize()", (int)_ret, __FILE__, __LINE__); - } - } - - return rc; -} diff --git a/test/vpto/cases/micro-op/unary-vector/vsqrt/compare.py b/test/vpto/cases/micro-op/unary-vector/vsqrt/compare.py deleted file mode 100644 
index 962985a24..000000000 --- a/test/vpto/cases/micro-op/unary-vector/vsqrt/compare.py +++ /dev/null @@ -1,204 +0,0 @@ -#!/usr/bin/python3 -# Copyright (c) 2026 Huawei Technologies Co., Ltd. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. - -# coding=utf-8 - -import os -import sys -import numpy as np - - -REPEAT_BYTES = 256 - - -def _ceil_div(x, y): - return (x + y - 1) // y - - -def _packed_pred_storage_bytes(logical_elems, src_elem_bytes): - logical_elems = int(logical_elems) - src_elem_bytes = int(src_elem_bytes) - if logical_elems <= 0: - raise ValueError(f"logical_elems must be > 0, got {logical_elems}") - if src_elem_bytes not in (1, 2, 4): - raise ValueError(f"unsupported packed predicate source size: {src_elem_bytes}") - - repeat_elems = REPEAT_BYTES // src_elem_bytes - if src_elem_bytes == 4: - repeat_times = _ceil_div(logical_elems, repeat_elems) + 1 - loop_count = repeat_times // 2 - return loop_count * 16 - - repeat_times = _ceil_div(logical_elems, repeat_elems) - return repeat_times * (repeat_elems // 8) - - -def compare_bin(golden_path, output_path, dtype, eps): - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - dtype_np = np.dtype(dtype) - golden = np.fromfile(golden_path, dtype=dtype_np) - output = np.fromfile(output_path, dtype=dtype_np) - if golden.shape != 
output.shape: - print(f"[ERROR] Shape mismatch: {golden_path} {golden.shape} vs {output_path} {output.shape}") - return False - if not np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True): - if golden.size: - if np.issubdtype(dtype_np, np.floating): - g = golden.astype(np.float64, copy=False) - o = output.astype(np.float64, copy=False) - elif np.issubdtype(dtype_np, np.integer) or np.issubdtype(dtype_np, np.unsignedinteger): - g = golden.astype(np.int64, copy=False) - o = output.astype(np.int64, copy=False) - else: - g = golden.astype(np.float64, copy=False) - o = output.astype(np.float64, copy=False) - abs_diff = np.abs(g - o) - idx = int(np.argmax(abs_diff)) - diff = float(abs_diff[idx]) - print( - f"[ERROR] Mismatch: {golden_path} vs {output_path}, max diff={diff} at idx={idx} " - f"(golden={g[idx]}, out={o[idx]}, dtype={dtype_np})" - ) - else: - print(f"[ERROR] Mismatch: {golden_path} vs {output_path}, empty buffers, dtype={dtype_np}") - return False - return True - - -def compare_bin_prefix(golden_path, output_path, dtype, eps, count): - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - try: - count = int(count) - except Exception: - print(f"[ERROR] Invalid prefix count: {count}") - return False - if count <= 0: - print(f"[ERROR] Invalid prefix count: {count}") - return False - - dtype_np = np.dtype(dtype) - golden = np.fromfile(golden_path, dtype=dtype_np, count=count) - output = np.fromfile(output_path, dtype=dtype_np, count=count) - - if golden.size != count or output.size != count: - print( - f"[ERROR] Prefix read too small: need={count} elems, " - f"golden={golden.size}, out={output.size}" - ) - return False - - if not np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True): - if golden.size: - if np.issubdtype(dtype_np, np.floating): - g = golden.astype(np.float64, copy=False) - o = 
output.astype(np.float64, copy=False) - elif np.issubdtype(dtype_np, np.integer) or np.issubdtype(dtype_np, np.unsignedinteger): - g = golden.astype(np.int64, copy=False) - o = output.astype(np.int64, copy=False) - else: - g = golden.astype(np.float64, copy=False) - o = output.astype(np.float64, copy=False) - abs_diff = np.abs(g - o) - idx = int(np.argmax(abs_diff)) - diff = float(abs_diff[idx]) - print( - f"[ERROR] Mismatch (prefix): {golden_path} vs {output_path}, max diff={diff} at idx={idx} " - f"(golden={g[idx]}, out={o[idx]}, dtype={dtype_np}, count={count})" - ) - else: - print(f"[ERROR] Mismatch (prefix): {golden_path} vs {output_path}, empty buffers, dtype={dtype_np}") - return False - return True - - -def compare_packed_pred_mask(golden_path, output_path, logical_elems, src_elem_bytes): - """ - Compare outputs of pto.tcmp / pto.tcmps. - - PTO-ISA stores packed predicate results as a linear PK byte stream via - `psts`, with the exact written prefix length determined by the typed - TCMP/TCMPS repeat schedule. Compare only that semantic prefix. 
- """ - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - try: - logical_elems = int(logical_elems) - src_elem_bytes = int(src_elem_bytes) - except Exception: - print( - "[ERROR] Invalid packed mask compare arguments: " - f"logical_elems={logical_elems} src_elem_bytes={src_elem_bytes}" - ) - return False - if logical_elems <= 0 or src_elem_bytes <= 0: - print( - "[ERROR] Invalid packed mask compare arguments: " - f"logical_elems={logical_elems} src_elem_bytes={src_elem_bytes}" - ) - return False - - golden = np.fromfile(golden_path, dtype=np.uint8) - output = np.fromfile(output_path, dtype=np.uint8) - try: - prefix_bytes = _packed_pred_storage_bytes(logical_elems, src_elem_bytes) - except ValueError as exc: - print(f"[ERROR] {exc}") - return False - - if golden.size < prefix_bytes or output.size < prefix_bytes: - print( - f"[ERROR] Packed mask buffer too small: need={prefix_bytes} bytes, " - f"golden={golden.size}, out={output.size}" - ) - return False - - golden_sel = golden[:prefix_bytes] - output_sel = output[:prefix_bytes] - - if not np.array_equal(golden_sel, output_sel): - diff = np.nonzero(golden_sel != output_sel)[0] - idx = int(diff[0]) if diff.size else 0 - print( - f"[ERROR] Mismatch (packed mask): {golden_path} vs {output_path}, first diff at idx={idx} " - f"(golden={int(golden_sel[idx])}, out={int(output_sel[idx])}, " - f"logical_elems={logical_elems}, src_elem_bytes={src_elem_bytes}, prefix_bytes={prefix_bytes})" - ) - return False - return True - - -def main(): - strict = os.getenv("COMPARE_STRICT", "1") != "0" - ok = True - ok = compare_bin("golden_v2.bin", "v2.bin", np.float32, 0.0001) and ok - if not ok: - if strict: - print("[ERROR] compare failed") - sys.exit(2) - print("[WARN] compare failed (non-gating)") - return - print("[INFO] compare passed") - - -if __name__ == "__main__": - main() 
diff --git a/test/vpto/cases/micro-op/unary-vector/vsqrt/golden.py b/test/vpto/cases/micro-op/unary-vector/vsqrt/golden.py deleted file mode 100644 index a5739a6b3..000000000 --- a/test/vpto/cases/micro-op/unary-vector/vsqrt/golden.py +++ /dev/null @@ -1,47 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) 2026 Huawei Technologies Co., Ltd. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. - -# coding=utf-8 - -import argparse -from pathlib import Path - -import numpy as np - - -ROWS = 32 -COLS = 32 -SEED = 19 - - -def generate(output_dir: Path, seed: int) -> None: - rng = np.random.default_rng(seed) - base = rng.uniform(0.0, 4.0, size=(ROWS, COLS)).astype(np.float32) - v1 = np.square(base).astype(np.float32, copy=False) - v2 = np.zeros((ROWS, COLS), dtype=np.float32) - golden_v2 = np.sqrt(v1).astype(np.float32, copy=False) - - output_dir.mkdir(parents=True, exist_ok=True) - v1.reshape(-1).tofile(output_dir / "v1.bin") - v2.reshape(-1).tofile(output_dir / "v2.bin") - golden_v2.reshape(-1).tofile(output_dir / "golden_v2.bin") - - -def main() -> None: - parser = argparse.ArgumentParser( - description="Generate numpy-based inputs/golden for VPTO micro-op vsqrt validation." 
- ) - parser.add_argument("--output-dir", type=Path, default=Path(".")) - parser.add_argument("--seed", type=int, default=SEED) - args = parser.parse_args() - generate(args.output_dir, args.seed) - - -if __name__ == "__main__": - main() diff --git a/test/vpto/cases/micro-op/unary-vector/vsqrt/kernel.pto b/test/vpto/cases/micro-op/unary-vector/vsqrt/kernel.pto deleted file mode 100644 index af8858995..000000000 --- a/test/vpto/cases/micro-op/unary-vector/vsqrt/kernel.pto +++ /dev/null @@ -1,43 +0,0 @@ -module attributes {pto.target_arch = "a5", pto.kernel_kind = #pto.kernel_kind} { - func.func @vexp_kernel_2d(%arg0: !pto.ptr, %arg1: !pto.ptr) attributes {pto.kernel} { - %c0 = arith.constant 0 : index - %c1 = arith.constant 1 : index - %c64 = arith.constant 64 : index - %c1024 = arith.constant 1024 : index - %c0_i64 = arith.constant 0 : i64 - %c1_i64 = arith.constant 1 : i64 - %c32_i64 = arith.constant 32 : i64 - %c128_i64 = arith.constant 128 : i64 - %c4096_i64 = arith.constant 4096 : i64 - %c1024_i32 = arith.constant 1024 : i32 - - %ub_in = pto.castptr %c0_i64 : i64 -> !pto.ptr - %ub_out = pto.castptr %c4096_i64 : i64 -> !pto.ptr - - %false = arith.constant false - pto.mte_gm_ub %arg0, %ub_in, %c0_i64, %c128_i64 - nburst(%c32_i64, %c128_i64, %c128_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 - - pto.set_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] - pto.wait_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] - - pto.vecscope { - %_:1 = scf.for %offset = %c0 to %c1024 step %c64 iter_args(%remaining = %c1024_i32) -> (i32) { - %mask, %next_remaining = pto.plt_b32 %remaining : i32 -> !pto.mask, i32 - %vec = pto.vlds %ub_in[%offset] : !pto.ptr -> !pto.vreg<64xf32> - %out = pto.vsqrt %vec, %mask : !pto.vreg<64xf32>, !pto.mask -> !pto.vreg<64xf32> - pto.vsts %out, %ub_out[%offset], %mask : !pto.vreg<64xf32>, !pto.ptr, !pto.mask - scf.yield %next_remaining : i32 - } - } - - pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - 
pto.mte_ub_gm %ub_out, %arg1, %c128_i64 - nburst(%c32_i64, %c128_i64, %c128_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64 - pto.barrier #pto.pipe - return - } -} diff --git a/test/vpto/cases/micro-op/unary-vector/vsqrt/launch.cpp b/test/vpto/cases/micro-op/unary-vector/vsqrt/launch.cpp deleted file mode 100644 index b6d8cdbf0..000000000 --- a/test/vpto/cases/micro-op/unary-vector/vsqrt/launch.cpp +++ /dev/null @@ -1,62 +0,0 @@ -// Copyright (c) 2026 Huawei Technologies Co., Ltd. -// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). -// Please refer to the License for details. You may not use this file except in compliance with the License. -// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -// See LICENSE in the root of the software repository for the full text of the License. - -// --------------------------------------------------------------------------- -// PTOAS compatibility layer -// -// The upstream pto-isa headers reference some FP8/FP4 types and the -// __VEC_SCOPE__ marker that are not available on every AICore arch/toolchain -// combination (e.g. __NPU_ARCH__==2201). -// -// For our PTOAS-generated kernels we don't rely on these types today, but the -// headers still mention them in templates/static_asserts. Provide minimal -// fallbacks to keep compilation working on dav-c220. 
-// --------------------------------------------------------------------------- -#ifndef __VEC_SCOPE__ -#define __VEC_SCOPE__ -#endif - -#if defined(__CCE_AICORE__) && defined(__NPU_ARCH__) && (__NPU_ARCH__ == 2201) -typedef struct { unsigned char v; } hifloat8_t; -typedef struct { unsigned char v; } float8_e4m3_t; -typedef struct { unsigned char v; } float8_e5m2_t; -typedef struct { unsigned char v; } float8_e8m0_t; -typedef struct { unsigned char v; } float4_e1m2x2_t; -typedef struct { unsigned char v; } float4_e2m1x2_t; -#endif -#include - -// AICore printf support is gated behind `--cce-enable-print` on some -// toolchains. When enabled, include the CCE print header so `cce::printf` -// resolves in device compilation. -#if defined(__CCE_AICORE__) && defined(PTOAS_ENABLE_CCE_PRINT) -#include -#endif - -// Some PTO-ISA types are only available in the __CCE_AICORE__ compilation -// path, but `bisheng -xcce` still performs a host-side parse pass. -// Provide minimal fallbacks only when the corresponding header wasn't -// pulled in by the selected arch implementation. -#if !defined(__CCE_AICORE__) && !defined(TMRGSORT_HPP) -struct MrgSortExecutedNumList { - uint16_t mrgSortList0; - uint16_t mrgSortList1; - uint16_t mrgSortList2; - uint16_t mrgSortList3; -}; -#endif -#ifndef __CPU_SIM -#include "acl/acl.h" -#endif - -extern "C" __global__ [aicore] void vexp_kernel_2d(__gm__ float *v1, - __gm__ float *v2); - -void LaunchVexp_kernel_2d(float *v1, float *v2, void *stream) { - vexp_kernel_2d<<<1, nullptr, stream>>>((__gm__ float *)v1, (__gm__ float *)v2); -} diff --git a/test/vpto/cases/micro-op/unary-vector/vsqrt/main.cpp b/test/vpto/cases/micro-op/unary-vector/vsqrt/main.cpp deleted file mode 100644 index f864622ca..000000000 --- a/test/vpto/cases/micro-op/unary-vector/vsqrt/main.cpp +++ /dev/null @@ -1,122 +0,0 @@ -// Copyright (c) 2026 Huawei Technologies Co., Ltd. 
-// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). -// Please refer to the License for details. You may not use this file except in compliance with the License. -// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -// See LICENSE in the root of the software repository for the full text of the License. - -/** -Copyright (c) 2025 Huawei Technologies Co., Ltd. -This program is free software, you can redistribute it and/or modify it under the terms and conditions of -CANN Open Software License Agreement Version 2.0 (the "License"). -Please refer to the License for details. You may not use this file except in compliance with the License. -THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -See LICENSE in the root of the software repository for the full text of the License. 
-*/ - -#include "test_common.h" -#include "acl/acl.h" -#include -#include -#include - -using namespace PtoTestCommon; - -#ifndef TMRGSORT_HPP -struct MrgSortExecutedNumList { - uint16_t mrgSortList0; - uint16_t mrgSortList1; - uint16_t mrgSortList2; - uint16_t mrgSortList3; -}; -#endif - -#define ACL_CHECK(expr) \ - do { \ - const aclError _ret = (expr); \ - if (_ret != ACL_SUCCESS) { \ - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", #expr, (int)_ret, __FILE__, __LINE__); \ - const char *_recent = aclGetRecentErrMsg(); \ - if (_recent != nullptr && _recent[0] != '\0') { \ - std::fprintf(stderr, "[ERROR] RecentErrMsg: %s\n", _recent); \ - } \ - rc = 1; \ - goto cleanup; \ - } \ - } while (0) - -void LaunchVexp_kernel_2d(float *v1, float *v2, void *stream); - -int main() { - size_t elemCount_v1 = 1024; - size_t fileSize_v1 = elemCount_v1 * sizeof(float); - size_t elemCount_v2 = 1024; - size_t fileSize_v2 = elemCount_v2 * sizeof(float); - float *v1Host = nullptr; - float *v1Device = nullptr; - float *v2Host = nullptr; - float *v2Device = nullptr; - - int rc = 0; - bool aclInited = false; - bool deviceSet = false; - int deviceId = 0; - aclrtStream stream = nullptr; - - ACL_CHECK(aclInit(nullptr)); - aclInited = true; - if (const char *envDevice = std::getenv("ACL_DEVICE_ID")) { - deviceId = std::atoi(envDevice); - } - ACL_CHECK(aclrtSetDevice(deviceId)); - deviceSet = true; - ACL_CHECK(aclrtCreateStream(&stream)); - - ACL_CHECK(aclrtMallocHost((void **)(&v1Host), fileSize_v1)); - ACL_CHECK(aclrtMallocHost((void **)(&v2Host), fileSize_v2)); - ACL_CHECK(aclrtMalloc((void **)&v1Device, fileSize_v1, ACL_MEM_MALLOC_HUGE_FIRST)); - ACL_CHECK(aclrtMalloc((void **)&v2Device, fileSize_v2, ACL_MEM_MALLOC_HUGE_FIRST)); - - ReadFile("./v1.bin", fileSize_v1, v1Host, fileSize_v1); - ReadFile("./v2.bin", fileSize_v2, v2Host, fileSize_v2); - ACL_CHECK(aclrtMemcpy(v1Device, fileSize_v1, v1Host, fileSize_v1, ACL_MEMCPY_HOST_TO_DEVICE)); - ACL_CHECK(aclrtMemcpy(v2Device, 
fileSize_v2, v2Host, fileSize_v2, ACL_MEMCPY_HOST_TO_DEVICE)); - LaunchVexp_kernel_2d(v1Device, v2Device, stream); - - ACL_CHECK(aclrtSynchronizeStream(stream)); - ACL_CHECK(aclrtMemcpy(v2Host, fileSize_v2, v2Device, fileSize_v2, ACL_MEMCPY_DEVICE_TO_HOST)); - - WriteFile("./v2.bin", v2Host, fileSize_v2); - -cleanup: - aclrtFree(v1Device); - aclrtFree(v2Device); - aclrtFreeHost(v1Host); - aclrtFreeHost(v2Host); - if (stream != nullptr) { - const aclError _ret = aclrtDestroyStream(stream); - if (_ret != ACL_SUCCESS) { - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", - "aclrtDestroyStream(stream)", (int)_ret, __FILE__, __LINE__); - } - stream = nullptr; - } - if (deviceSet) { - const aclError _ret = aclrtResetDevice(deviceId); - if (_ret != ACL_SUCCESS) { - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", - "aclrtResetDevice(deviceId)", (int)_ret, __FILE__, __LINE__); - } - } - if (aclInited) { - const aclError _ret = aclFinalize(); - if (_ret != ACL_SUCCESS) { - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", - "aclFinalize()", (int)_ret, __FILE__, __LINE__); - } - } - - return rc; -} diff --git a/test/vpto/cases/micro-op/vec-scalar/vaddcs-carry-boundary/compare.py b/test/vpto/cases/micro-op/vec-scalar/vaddcs-carry-boundary/compare.py deleted file mode 100644 index b5dd9902e..000000000 --- a/test/vpto/cases/micro-op/vec-scalar/vaddcs-carry-boundary/compare.py +++ /dev/null @@ -1,64 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) 2026 Huawei Technologies Co., Ltd. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. 
-# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. - -# case: micro-op/vec-scalar/vaddcs-carry-boundary -# family: vec-scalar -# target_ops: pto.vaddcs -# scenarios: core-u32-unsigned, full-mask, carry-chain, integer-overflow - -import os -import sys -import numpy as np - - -REPEAT_BYTES = 256 -LOGICAL_ELEMS = 64 -SRC_ELEM_BYTES = 4 - - -def _ceil_div(x, y): - return (x + y - 1) // y - - -def _packed_pred_storage_bytes(logical_elems, src_elem_bytes): - repeat_elems = REPEAT_BYTES // src_elem_bytes - repeat_times = _ceil_div(logical_elems, repeat_elems) + 1 - loop_count = repeat_times // 2 - return loop_count * 16 - - -def compare_result(): - golden = np.fromfile("golden_v3.bin", dtype=np.uint32, count=64) - output = np.fromfile("v3.bin", dtype=np.uint32, count=64) - return golden.shape == output.shape and np.array_equal(golden, output) - - -def compare_carry(): - prefix_bytes = _packed_pred_storage_bytes(LOGICAL_ELEMS, SRC_ELEM_BYTES) - golden = np.fromfile("golden_v4.bin", dtype=np.uint8) - output = np.fromfile("v4.bin", dtype=np.uint8) - if golden.size < prefix_bytes or output.size < prefix_bytes: - return False - return np.array_equal(golden[:prefix_bytes], output[:prefix_bytes]) - - -def main(): - strict = os.getenv("COMPARE_STRICT", "1") != "0" - ok = compare_result() and compare_carry() - if not ok: - if strict: - print("[ERROR] compare failed") - sys.exit(2) - print("[WARN] compare failed (non-gating)") - return - print("[INFO] compare passed") - - -if __name__ == "__main__": - main() diff --git a/test/vpto/cases/micro-op/vec-scalar/vaddcs-carry-boundary/golden.py b/test/vpto/cases/micro-op/vec-scalar/vaddcs-carry-boundary/golden.py deleted file mode 100644 index ddf74542c..000000000 --- 
a/test/vpto/cases/micro-op/vec-scalar/vaddcs-carry-boundary/golden.py +++ /dev/null @@ -1,72 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) 2026 Huawei Technologies Co., Ltd. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. - -# case: micro-op/vec-scalar/vaddcs-carry-boundary -# family: vec-scalar -# target_ops: pto.vaddcs -# scenarios: core-u32-unsigned, full-mask, carry-chain, integer-overflow - -import argparse -from pathlib import Path - -import numpy as np - - -LANES = 64 -LHS_PATTERN = np.array( - [0x00000000, 0x00000001, 0xFFFFFFFE, 0xFFFFFFFF, 0x7FFFFFFF, 0x80000000, 0xAAAAAAAA, 0x55555555], - dtype=np.uint32, -) -RHS_PATTERN = np.array( - [0x00000000, 0xFFFFFFFF, 0x00000001, 0x00000000, 0x80000000, 0x7FFFFFFF, 0x55555555, 0xAAAAAAAA], - dtype=np.uint32, -) - - -def pack_mask_nibbles(bits): - out = np.zeros(256, dtype=np.uint8) - for idx, bit in enumerate(bits): - if not bit: - continue - byte = idx // 2 - if idx % 2 == 0: - out[byte] |= np.uint8(0x1) - else: - out[byte] |= np.uint8(0x10) - return out - - -def generate(output_dir: Path, seed: int) -> None: - del seed - repeats = LANES // LHS_PATTERN.size - lhs = np.tile(LHS_PATTERN, repeats) - rhs = np.tile(RHS_PATTERN, repeats) - total = lhs.astype(np.uint64) + rhs.astype(np.uint64) + np.uint64(1) - result = (total & np.uint64(0xFFFFFFFF)).astype(np.uint32) - carry = (total >> np.uint64(32)) != 0 - - output_dir.mkdir(parents=True, exist_ok=True) - lhs.tofile(output_dir / 
"v1.bin") - rhs.tofile(output_dir / "v2.bin") - np.zeros(LANES, dtype=np.uint32).tofile(output_dir / "v3.bin") - np.zeros(256, dtype=np.uint8).tofile(output_dir / "v4.bin") - result.tofile(output_dir / "golden_v3.bin") - pack_mask_nibbles(carry).tofile(output_dir / "golden_v4.bin") - - -def main() -> None: - parser = argparse.ArgumentParser() - parser.add_argument("--output-dir", type=Path, default=Path(".")) - parser.add_argument("--seed", type=int, default=19) - args = parser.parse_args() - generate(args.output_dir, args.seed) - - -if __name__ == "__main__": - main() diff --git a/test/vpto/cases/micro-op/vec-scalar/vaddcs-carry-boundary/kernel.pto b/test/vpto/cases/micro-op/vec-scalar/vaddcs-carry-boundary/kernel.pto deleted file mode 100644 index cd5a5748d..000000000 --- a/test/vpto/cases/micro-op/vec-scalar/vaddcs-carry-boundary/kernel.pto +++ /dev/null @@ -1,57 +0,0 @@ -// ----------------------------------------------------------------------------- -// case: micro-op/vec-scalar/vaddcs-carry-boundary -// family: vec-scalar -// target_ops: pto.vaddcs -// scenarios: core-u32-unsigned, full-mask, carry-chain, integer-overflow -// ----------------------------------------------------------------------------- -module attributes {pto.target_arch = "a5", pto.kernel_kind = #pto.kernel_kind} { - func.func @vaddcs_carry_boundary_kernel(%arg0: !pto.ptr, - %arg1: !pto.ptr, - %arg2: !pto.ptr, - %arg3: !pto.ptr) attributes {pto.kernel} { - %c0 = arith.constant 0 : index - %c0_i64 = arith.constant 0 : i64 - %c1_i64 = arith.constant 1 : i64 - %c128_i64 = arith.constant 128 : i64 - %c256_i64 = arith.constant 256 : i64 - %c4096_i64 = arith.constant 4096 : i64 - %c8192_i64 = arith.constant 8192 : i64 - %c12288_i64 = arith.constant 12288 : i64 - %false = arith.constant false - - %ub_lhs = pto.castptr %c0_i64 : i64 -> !pto.ptr - %ub_rhs = pto.castptr %c4096_i64 : i64 -> !pto.ptr - %ub_out = pto.castptr %c8192_i64 : i64 -> !pto.ptr - %ub_carry = pto.castptr %c12288_i64 : i64 -> 
!pto.ptr - pto.mte_gm_ub %arg0, %ub_lhs, %c0_i64, %c256_i64 - nburst(%c1_i64, %c256_i64, %c256_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 - pto.mte_gm_ub %arg1, %ub_rhs, %c0_i64, %c256_i64 - nburst(%c1_i64, %c256_i64, %c256_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 - - pto.set_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] - pto.wait_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] - - pto.vecscope { - %mask = pto.pset_b32 "PAT_ALL" : !pto.mask - %carry_in = pto.pset_b32 "PAT_ALL" : !pto.mask - %lhs = pto.vlds %ub_lhs[%c0] : !pto.ptr -> !pto.vreg<64xui32> - %rhs = pto.vlds %ub_rhs[%c0] : !pto.ptr -> !pto.vreg<64xui32> - %sum, %carry = pto.vaddcs %lhs, %rhs, %carry_in, %mask : !pto.vreg<64xui32>, !pto.vreg<64xui32>, !pto.mask, !pto.mask -> !pto.vreg<64xui32>, !pto.mask - pto.vsts %sum, %ub_out[%c0], %mask : !pto.vreg<64xui32>, !pto.ptr, !pto.mask - pto.psti %carry, %ub_carry[%c0], "NORM" : !pto.mask, !pto.ptr, index - } - - pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.mte_ub_gm %ub_out, %arg2, %c256_i64 - nburst(%c1_i64, %c256_i64, %c256_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64 - pto.mte_ub_gm %ub_carry, %arg3, %c128_i64 - nburst(%c1_i64, %c256_i64, %c256_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64 - pto.barrier #pto.pipe - return - } -} diff --git a/test/vpto/cases/micro-op/vec-scalar/vaddcs-carry-boundary/launch.cpp b/test/vpto/cases/micro-op/vec-scalar/vaddcs-carry-boundary/launch.cpp deleted file mode 100644 index 209c04c6a..000000000 --- a/test/vpto/cases/micro-op/vec-scalar/vaddcs-carry-boundary/launch.cpp +++ /dev/null @@ -1,53 +0,0 @@ -// Copyright (c) 2026 Huawei Technologies Co., Ltd. -// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). -// Please refer to the License for details. You may not use this file except in compliance with the License. 
-// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -// See LICENSE in the root of the software repository for the full text of the License. - -// ----------------------------------------------------------------------------- -// case: micro-op/vec-scalar/vaddcs-carry-boundary -// family: vec-scalar -// target_ops: pto.vaddcs -// scenarios: core-u32-unsigned, full-mask, carry-chain, integer-overflow -// ----------------------------------------------------------------------------- -#ifndef __VEC_SCOPE__ -#define __VEC_SCOPE__ -#endif - -#if defined(__CCE_AICORE__) && defined(__NPU_ARCH__) && (__NPU_ARCH__ == 2201) -typedef struct { unsigned char v; } hifloat8_t; -typedef struct { unsigned char v; } float8_e4m3_t; -typedef struct { unsigned char v; } float8_e5m2_t; -typedef struct { unsigned char v; } float8_e8m0_t; -typedef struct { unsigned char v; } float4_e1m2x2_t; -typedef struct { unsigned char v; } float4_e2m1x2_t; -#endif -#include -#if defined(__CCE_AICORE__) && defined(PTOAS_ENABLE_CCE_PRINT) -#include -#endif - -#if !defined(__CCE_AICORE__) && !defined(TMRGSORT_HPP) -struct MrgSortExecutedNumList { - uint16_t mrgSortList0; - uint16_t mrgSortList1; - uint16_t mrgSortList2; - uint16_t mrgSortList3; -}; -#endif -#ifndef __CPU_SIM -#include "acl/acl.h" -#endif - -extern "C" __global__ [aicore] void vaddcs_carry_boundary_kernel( - __gm__ uint32_t *v1, __gm__ uint32_t *v2, __gm__ uint32_t *v3, - __gm__ uint8_t *v4); - -void LaunchVaddcsCarryBoundaryKernel(uint32_t *v1, uint32_t *v2, uint32_t *v3, - uint8_t *v4, void *stream) { - vaddcs_carry_boundary_kernel<<<1, nullptr, stream>>>( - (__gm__ uint32_t *)v1, (__gm__ uint32_t *)v2, (__gm__ uint32_t *)v3, - (__gm__ uint8_t *)v4); -} diff --git a/test/vpto/cases/micro-op/vec-scalar/vaddcs-carry-boundary/main.cpp 
b/test/vpto/cases/micro-op/vec-scalar/vaddcs-carry-boundary/main.cpp deleted file mode 100644 index 6addb079d..000000000 --- a/test/vpto/cases/micro-op/vec-scalar/vaddcs-carry-boundary/main.cpp +++ /dev/null @@ -1,115 +0,0 @@ -// Copyright (c) 2026 Huawei Technologies Co., Ltd. -// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). -// Please refer to the License for details. You may not use this file except in compliance with the License. -// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -// See LICENSE in the root of the software repository for the full text of the License. - -// ----------------------------------------------------------------------------- -// case: micro-op/vec-scalar/vaddcs-carry-boundary -// family: vec-scalar -// target_ops: pto.vaddcs -// scenarios: core-u32-unsigned, full-mask, carry-chain, integer-overflow -// ----------------------------------------------------------------------------- -/** -Copyright (c) 2025 Huawei Technologies Co., Ltd. 
-*/ - -#include "test_common.h" -#include "acl/acl.h" -#include -#include - -using namespace PtoTestCommon; - -#define ACL_CHECK(expr) \ - do { \ - const aclError _ret = (expr); \ - if (_ret != ACL_SUCCESS) { \ - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", #expr, \ - (int)_ret, __FILE__, __LINE__); \ - rc = 1; \ - goto cleanup; \ - } \ - } while (0) - -void LaunchVaddcsCarryBoundaryKernel(uint32_t *v1, uint32_t *v2, uint32_t *v3, - uint8_t *v4, void *stream); - -int main() { - size_t elemCount_v1 = 64; - size_t fileSize_v1 = elemCount_v1 * sizeof(uint32_t); - size_t elemCount_v2 = 64; - size_t fileSize_v2 = elemCount_v2 * sizeof(uint32_t); - size_t elemCount_v3 = 64; - size_t fileSize_v3 = elemCount_v3 * sizeof(uint32_t); - size_t elemCount_v4 = 256; - size_t fileSize_v4 = elemCount_v4 * sizeof(uint8_t); - uint32_t *v1Host = nullptr; - uint32_t *v1Device = nullptr; - uint32_t *v2Host = nullptr; - uint32_t *v2Device = nullptr; - uint32_t *v3Host = nullptr; - uint32_t *v3Device = nullptr; - uint8_t *v4Host = nullptr; - uint8_t *v4Device = nullptr; - int rc = 0; - bool aclInited = false; - bool deviceSet = false; - int deviceId = 0; - aclrtStream stream = nullptr; - - ACL_CHECK(aclInit(nullptr)); - aclInited = true; - if (const char *envDevice = std::getenv("ACL_DEVICE_ID")) - deviceId = std::atoi(envDevice); - ACL_CHECK(aclrtSetDevice(deviceId)); - deviceSet = true; - ACL_CHECK(aclrtCreateStream(&stream)); - ACL_CHECK(aclrtMallocHost((void **)(&v1Host), fileSize_v1)); - ACL_CHECK(aclrtMallocHost((void **)(&v2Host), fileSize_v2)); - ACL_CHECK(aclrtMallocHost((void **)(&v3Host), fileSize_v3)); - ACL_CHECK(aclrtMallocHost((void **)(&v4Host), fileSize_v4)); - ACL_CHECK(aclrtMalloc((void **)&v1Device, fileSize_v1, ACL_MEM_MALLOC_HUGE_FIRST)); - ACL_CHECK(aclrtMalloc((void **)&v2Device, fileSize_v2, ACL_MEM_MALLOC_HUGE_FIRST)); - ACL_CHECK(aclrtMalloc((void **)&v3Device, fileSize_v3, ACL_MEM_MALLOC_HUGE_FIRST)); - ACL_CHECK(aclrtMalloc((void **)&v4Device, 
fileSize_v4, ACL_MEM_MALLOC_HUGE_FIRST)); - ReadFile("./v1.bin", fileSize_v1, v1Host, fileSize_v1); - ReadFile("./v2.bin", fileSize_v2, v2Host, fileSize_v2); - ReadFile("./v3.bin", fileSize_v3, v3Host, fileSize_v3); - ReadFile("./v4.bin", fileSize_v4, v4Host, fileSize_v4); - ACL_CHECK(aclrtMemcpy(v1Device, fileSize_v1, v1Host, fileSize_v1, - ACL_MEMCPY_HOST_TO_DEVICE)); - ACL_CHECK(aclrtMemcpy(v2Device, fileSize_v2, v2Host, fileSize_v2, - ACL_MEMCPY_HOST_TO_DEVICE)); - ACL_CHECK(aclrtMemcpy(v3Device, fileSize_v3, v3Host, fileSize_v3, - ACL_MEMCPY_HOST_TO_DEVICE)); - ACL_CHECK(aclrtMemcpy(v4Device, fileSize_v4, v4Host, fileSize_v4, - ACL_MEMCPY_HOST_TO_DEVICE)); - LaunchVaddcsCarryBoundaryKernel(v1Device, v2Device, v3Device, v4Device, stream); - ACL_CHECK(aclrtSynchronizeStream(stream)); - ACL_CHECK(aclrtMemcpy(v3Host, fileSize_v3, v3Device, fileSize_v3, - ACL_MEMCPY_DEVICE_TO_HOST)); - ACL_CHECK(aclrtMemcpy(v4Host, fileSize_v4, v4Device, fileSize_v4, - ACL_MEMCPY_DEVICE_TO_HOST)); - WriteFile("./v3.bin", v3Host, fileSize_v3); - WriteFile("./v4.bin", v4Host, fileSize_v4); - -cleanup: - aclrtFree(v1Device); - aclrtFree(v2Device); - aclrtFree(v3Device); - aclrtFree(v4Device); - aclrtFreeHost(v1Host); - aclrtFreeHost(v2Host); - aclrtFreeHost(v3Host); - aclrtFreeHost(v4Host); - if (stream != nullptr) - aclrtDestroyStream(stream); - if (deviceSet) - aclrtResetDevice(deviceId); - if (aclInited) - aclFinalize(); - return rc; -} diff --git a/test/vpto/cases/micro-op/vec-scalar/vaddcs/kernel.pto b/test/vpto/cases/micro-op/vec-scalar/vaddcs/kernel.pto index bf71e2b04..6bba75cd3 100644 --- a/test/vpto/cases/micro-op/vec-scalar/vaddcs/kernel.pto +++ b/test/vpto/cases/micro-op/vec-scalar/vaddcs/kernel.pto @@ -50,6 +50,151 @@ module attributes {pto.target_arch = "a5", pto.kernel_kind = #pto.kernel_kind, !pto.ptr, i64, i64, i64, i64 pto.barrier #pto.pipe + %__case_merge_guard = arith.constant false + + // inactive cross-case merge from micro-op/vec-scalar/vaddcs-carry-boundary + 
scf.if %__case_merge_guard { + + %c0_cmg17_1 = arith.constant 0 : index + %c0_i64_cmg17_1 = arith.constant 0 : i64 + %c1_i64_cmg17_1 = arith.constant 1 : i64 + %c128_i64_cmg17_1 = arith.constant 128 : i64 + %c256_i64_cmg17_1 = arith.constant 256 : i64 + %c4096_i64_cmg17_1 = arith.constant 4096 : i64 + %c8192_i64_cmg17_1 = arith.constant 8192 : i64 + %c12288_i64_cmg17_1 = arith.constant 12288 : i64 + %false_cmg17_1 = arith.constant false + + %ub_lhs_cmg17_1 = pto.castptr %c0_i64_cmg17_1 : i64 -> !pto.ptr + %ub_rhs_cmg17_1 = pto.castptr %c4096_i64_cmg17_1 : i64 -> !pto.ptr + %ub_out_cmg17_1 = pto.castptr %c8192_i64_cmg17_1 : i64 -> !pto.ptr + %ub_carry_cmg17_1 = pto.castptr %c12288_i64_cmg17_1 : i64 -> !pto.ptr + pto.mte_gm_ub %arg0, %ub_lhs_cmg17_1, %c0_i64_cmg17_1, %c256_i64_cmg17_1 + nburst(%c1_i64_cmg17_1, %c256_i64_cmg17_1, %c256_i64_cmg17_1) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 + pto.mte_gm_ub %arg1, %ub_rhs_cmg17_1, %c0_i64_cmg17_1, %c256_i64_cmg17_1 + nburst(%c1_i64_cmg17_1, %c256_i64_cmg17_1, %c256_i64_cmg17_1) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 + + pto.set_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] + pto.wait_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] + + pto.vecscope { + %mask_cmg17_1 = pto.pset_b32 "PAT_ALL" : !pto.mask + %carry_in_cmg17_1 = pto.pset_b32 "PAT_ALL" : !pto.mask + %lhs_cmg17_1 = pto.vlds %ub_lhs_cmg17_1[%c0_cmg17_1] : !pto.ptr -> !pto.vreg<64xui32> + %rhs_cmg17_1 = pto.vlds %ub_rhs_cmg17_1[%c0_cmg17_1] : !pto.ptr -> !pto.vreg<64xui32> + %sum_cmg17_1, %carry_cmg17_1 = pto.vaddcs %lhs_cmg17_1, %rhs_cmg17_1, %carry_in_cmg17_1, %mask_cmg17_1 : !pto.vreg<64xui32>, !pto.vreg<64xui32>, !pto.mask, !pto.mask -> !pto.vreg<64xui32>, !pto.mask + pto.vsts %sum_cmg17_1, %ub_out_cmg17_1[%c0_cmg17_1], %mask_cmg17_1 : !pto.vreg<64xui32>, !pto.ptr, !pto.mask + pto.psti %carry_cmg17_1, %ub_carry_cmg17_1[%c0_cmg17_1], "NORM" : !pto.mask, !pto.ptr, index + } + + pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] + pto.wait_flag["PIPE_V", 
"PIPE_MTE3", "EVENT_ID0"] + pto.mte_ub_gm %ub_out_cmg17_1, %arg2, %c256_i64_cmg17_1 + nburst(%c1_i64_cmg17_1, %c256_i64_cmg17_1, %c256_i64_cmg17_1) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64 + pto.mte_ub_gm %ub_carry_cmg17_1, %arg3, %c128_i64_cmg17_1 + nburst(%c1_i64_cmg17_1, %c256_i64_cmg17_1, %c256_i64_cmg17_1) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64 + pto.barrier #pto.pipe + } + + // inactive cross-case merge from micro-op/vec-scalar/vsubcs + scf.if %__case_merge_guard { + + %c0_cmg17_2 = arith.constant 0 : index + %c0_i64_cmg17_2 = arith.constant 0 : i64 + %c1_i64_cmg17_2 = arith.constant 1 : i64 + %c128_i64_cmg17_2 = arith.constant 128 : i64 + %c256_i64_cmg17_2 = arith.constant 256 : i64 + %c4096_i64_cmg17_2 = arith.constant 4096 : i64 + %c8192_i64_cmg17_2 = arith.constant 8192 : i64 + %c12288_i64_cmg17_2 = arith.constant 12288 : i64 + %false_cmg17_2 = arith.constant false + + %ub_lhs_cmg17_2 = pto.castptr %c0_i64_cmg17_2 : i64 -> !pto.ptr + %ub_rhs_cmg17_2 = pto.castptr %c4096_i64_cmg17_2 : i64 -> !pto.ptr + %ub_out_cmg17_2 = pto.castptr %c8192_i64_cmg17_2 : i64 -> !pto.ptr + %ub_borrow_cmg17_2 = pto.castptr %c12288_i64_cmg17_2 : i64 -> !pto.ptr + pto.mte_gm_ub %arg0, %ub_lhs_cmg17_2, %c0_i64_cmg17_2, %c256_i64_cmg17_2 + nburst(%c1_i64_cmg17_2, %c256_i64_cmg17_2, %c256_i64_cmg17_2) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 + pto.mte_gm_ub %arg1, %ub_rhs_cmg17_2, %c0_i64_cmg17_2, %c256_i64_cmg17_2 + nburst(%c1_i64_cmg17_2, %c256_i64_cmg17_2, %c256_i64_cmg17_2) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 + + pto.set_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] + pto.wait_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] + + pto.vecscope { + %mask_cmg17_2 = pto.pset_b32 "PAT_ALL" : !pto.mask + %borrow_in_cmg17_2 = pto.pset_b32 "PAT_ALL" : !pto.mask + %lhs_cmg17_2 = pto.vlds %ub_lhs_cmg17_2[%c0_cmg17_2] : !pto.ptr -> !pto.vreg<64xui32> + %rhs_cmg17_2 = pto.vlds %ub_rhs_cmg17_2[%c0_cmg17_2] : !pto.ptr -> !pto.vreg<64xui32> + %diff_cmg17_2, %borrow_cmg17_2 = 
pto.vsubcs %lhs_cmg17_2, %rhs_cmg17_2, %borrow_in_cmg17_2, %mask_cmg17_2 : !pto.vreg<64xui32>, !pto.vreg<64xui32>, !pto.mask, !pto.mask -> !pto.vreg<64xui32>, !pto.mask + pto.vsts %diff_cmg17_2, %ub_out_cmg17_2[%c0_cmg17_2], %mask_cmg17_2 : !pto.vreg<64xui32>, !pto.ptr, !pto.mask + pto.psti %borrow_cmg17_2, %ub_borrow_cmg17_2[%c0_cmg17_2], "NORM" : !pto.mask, !pto.ptr, index + } + + pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] + pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] + pto.mte_ub_gm %ub_out_cmg17_2, %arg2, %c256_i64_cmg17_2 + nburst(%c1_i64_cmg17_2, %c256_i64_cmg17_2, %c256_i64_cmg17_2) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64 + pto.mte_ub_gm %ub_borrow_cmg17_2, %arg3, %c128_i64_cmg17_2 + nburst(%c1_i64_cmg17_2, %c256_i64_cmg17_2, %c256_i64_cmg17_2) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64 + pto.barrier #pto.pipe + } + + // inactive cross-case merge from micro-op/vec-scalar/vsubcs-borrow-boundary + scf.if %__case_merge_guard { + + %c0_cmg17_3 = arith.constant 0 : index + %c0_i64_cmg17_3 = arith.constant 0 : i64 + %c1_i64_cmg17_3 = arith.constant 1 : i64 + %c128_i64_cmg17_3 = arith.constant 128 : i64 + %c256_i64_cmg17_3 = arith.constant 256 : i64 + %c4096_i64_cmg17_3 = arith.constant 4096 : i64 + %c8192_i64_cmg17_3 = arith.constant 8192 : i64 + %c12288_i64_cmg17_3 = arith.constant 12288 : i64 + %false_cmg17_3 = arith.constant false + + %ub_lhs_cmg17_3 = pto.castptr %c0_i64_cmg17_3 : i64 -> !pto.ptr + %ub_rhs_cmg17_3 = pto.castptr %c4096_i64_cmg17_3 : i64 -> !pto.ptr + %ub_out_cmg17_3 = pto.castptr %c8192_i64_cmg17_3 : i64 -> !pto.ptr + %ub_borrow_cmg17_3 = pto.castptr %c12288_i64_cmg17_3 : i64 -> !pto.ptr + pto.mte_gm_ub %arg0, %ub_lhs_cmg17_3, %c0_i64_cmg17_3, %c256_i64_cmg17_3 + nburst(%c1_i64_cmg17_3, %c256_i64_cmg17_3, %c256_i64_cmg17_3) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 + pto.mte_gm_ub %arg1, %ub_rhs_cmg17_3, %c0_i64_cmg17_3, %c256_i64_cmg17_3 + nburst(%c1_i64_cmg17_3, %c256_i64_cmg17_3, %c256_i64_cmg17_3) + : !pto.ptr, 
!pto.ptr, i64, i64, i64, i64, i64 + + pto.set_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] + pto.wait_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] + + pto.vecscope { + %mask_cmg17_3 = pto.pset_b32 "PAT_ALL" : !pto.mask + %borrow_in_cmg17_3 = pto.pset_b32 "PAT_ALL" : !pto.mask + %lhs_cmg17_3 = pto.vlds %ub_lhs_cmg17_3[%c0_cmg17_3] : !pto.ptr -> !pto.vreg<64xui32> + %rhs_cmg17_3 = pto.vlds %ub_rhs_cmg17_3[%c0_cmg17_3] : !pto.ptr -> !pto.vreg<64xui32> + %diff_cmg17_3, %borrow_cmg17_3 = pto.vsubcs %lhs_cmg17_3, %rhs_cmg17_3, %borrow_in_cmg17_3, %mask_cmg17_3 : !pto.vreg<64xui32>, !pto.vreg<64xui32>, !pto.mask, !pto.mask -> !pto.vreg<64xui32>, !pto.mask + pto.vsts %diff_cmg17_3, %ub_out_cmg17_3[%c0_cmg17_3], %mask_cmg17_3 : !pto.vreg<64xui32>, !pto.ptr, !pto.mask + pto.psti %borrow_cmg17_3, %ub_borrow_cmg17_3[%c0_cmg17_3], "NORM" : !pto.mask, !pto.ptr, index + } + + pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] + pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] + pto.mte_ub_gm %ub_out_cmg17_3, %arg2, %c256_i64_cmg17_3 + nburst(%c1_i64_cmg17_3, %c256_i64_cmg17_3, %c256_i64_cmg17_3) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64 + pto.mte_ub_gm %ub_borrow_cmg17_3, %arg3, %c128_i64_cmg17_3 + nburst(%c1_i64_cmg17_3, %c256_i64_cmg17_3, %c256_i64_cmg17_3) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64 + pto.barrier #pto.pipe + } return } } diff --git a/test/vpto/cases/micro-op/vec-scalar/vadds-bf16/compare.py b/test/vpto/cases/micro-op/vec-scalar/vadds-bf16/compare.py deleted file mode 100644 index 896c992b2..000000000 --- a/test/vpto/cases/micro-op/vec-scalar/vadds-bf16/compare.py +++ /dev/null @@ -1,37 +0,0 @@ -#!/usr/bin/python3 -# Copyright (c) 2026 Huawei Technologies Co., Ltd. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. 
-# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. - - -import os -import sys -import numpy as np - - -def compare_bin(golden_path, output_path, dtype, eps): - if not os.path.exists(golden_path) or not os.path.exists(output_path): - return False - golden = np.fromfile(golden_path, dtype=dtype) - output = np.fromfile(output_path, dtype=dtype) - return golden.shape == output.shape and np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True) - - -def main(): - strict = os.getenv("COMPARE_STRICT", "1") != "0" - ok = compare_bin("golden_v2.bin", "v2.bin", np.uint16, 0) - if not ok: - if strict: - print("[ERROR] compare failed") - sys.exit(2) - print("[WARN] compare failed (non-gating)") - return - print("[INFO] compare passed") - - -if __name__ == "__main__": - main() diff --git a/test/vpto/cases/micro-op/vec-scalar/vadds-bf16/golden.py b/test/vpto/cases/micro-op/vec-scalar/vadds-bf16/golden.py deleted file mode 100644 index 0efc4ec38..000000000 --- a/test/vpto/cases/micro-op/vec-scalar/vadds-bf16/golden.py +++ /dev/null @@ -1,57 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) 2026 Huawei Technologies Co., Ltd. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. 
- -# coding=utf-8 - -import argparse -from pathlib import Path - -import numpy as np - - -ELEMS = 1024 -SEED = 19 -SCALE = np.float32(1.5) - - -def f32_to_bf16_bits(values: np.ndarray) -> np.ndarray: - wide = values.astype(np.float32, copy=False).view(np.uint32) - rounding = np.uint32(0x7FFF) + ((wide >> 16) & np.uint32(1)) - return ((wide + rounding) >> 16).astype(np.uint16) - - -def bf16_bits_to_f32(bits: np.ndarray) -> np.ndarray: - return (bits.astype(np.uint32) << 16).view(np.float32) - - -def generate(output_dir: Path, seed: int) -> None: - rng = np.random.default_rng(seed) - v1_f32 = rng.uniform(-4.0, 4.0, size=ELEMS).astype(np.float32) - v1 = f32_to_bf16_bits(v1_f32) - v2 = np.zeros(ELEMS, dtype=np.uint16) - scalar_bits = f32_to_bf16_bits(np.array([SCALE], dtype=np.float32))[0] - scalar = bf16_bits_to_f32(np.array([scalar_bits], dtype=np.uint16))[0] - golden_v2 = f32_to_bf16_bits(bf16_bits_to_f32(v1) + scalar) - - output_dir.mkdir(parents=True, exist_ok=True) - v1.tofile(output_dir / "v1.bin") - v2.tofile(output_dir / "v2.bin") - golden_v2.tofile(output_dir / "golden_v2.bin") - - -def main() -> None: - parser = argparse.ArgumentParser() - parser.add_argument("--output-dir", type=Path, default=Path(".")) - parser.add_argument("--seed", type=int, default=SEED) - args = parser.parse_args() - generate(args.output_dir, args.seed) - - -if __name__ == "__main__": - main() diff --git a/test/vpto/cases/micro-op/vec-scalar/vadds-bf16/kernel.pto b/test/vpto/cases/micro-op/vec-scalar/vadds-bf16/kernel.pto deleted file mode 100644 index 3ff454ee1..000000000 --- a/test/vpto/cases/micro-op/vec-scalar/vadds-bf16/kernel.pto +++ /dev/null @@ -1,42 +0,0 @@ -module attributes {pto.target_arch = "a5", pto.kernel_kind = #pto.kernel_kind} { - func.func @vadds_bf16_kernel(%arg0: !pto.ptr, - %arg1: !pto.ptr) attributes {pto.kernel} { - %c0 = arith.constant 0 : index - %c128 = arith.constant 128 : index - %c1024 = arith.constant 1024 : index - %c0_i64 = arith.constant 0 : i64 - 
%c1_i64 = arith.constant 1 : i64 - %c32_i64 = arith.constant 32 : i64 - %c64_i64 = arith.constant 64 : i64 - %c2048_i64 = arith.constant 2048 : i64 - %cst = arith.constant 1.500000e+00 : bf16 - - %ub_in = pto.castptr %c0_i64 : i64 -> !pto.ptr - %ub_out = pto.castptr %c2048_i64 : i64 -> !pto.ptr - - %false = arith.constant false - pto.mte_gm_ub %arg0, %ub_in, %c0_i64, %c64_i64 - nburst(%c32_i64, %c64_i64, %c64_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 - - pto.set_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] - pto.wait_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] - - pto.vecscope { - %mask = pto.pset_b16 "PAT_ALL" : !pto.mask - scf.for %offset = %c0 to %c1024 step %c128 { - %vec = pto.vlds %ub_in[%offset] : !pto.ptr -> !pto.vreg<128xbf16> - %sum = pto.vadds %vec, %cst, %mask : !pto.vreg<128xbf16>, bf16, !pto.mask -> !pto.vreg<128xbf16> - pto.vsts %sum, %ub_out[%offset], %mask : !pto.vreg<128xbf16>, !pto.ptr, !pto.mask - } - } - - pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.mte_ub_gm %ub_out, %arg1, %c64_i64 - nburst(%c32_i64, %c64_i64, %c64_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64 - pto.barrier #pto.pipe - return - } -} diff --git a/test/vpto/cases/micro-op/vec-scalar/vadds-bf16/launch.cpp b/test/vpto/cases/micro-op/vec-scalar/vadds-bf16/launch.cpp deleted file mode 100644 index 734c5e95a..000000000 --- a/test/vpto/cases/micro-op/vec-scalar/vadds-bf16/launch.cpp +++ /dev/null @@ -1,44 +0,0 @@ -// Copyright (c) 2026 Huawei Technologies Co., Ltd. -// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). -// Please refer to the License for details. You may not use this file except in compliance with the License. 
-// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -// See LICENSE in the root of the software repository for the full text of the License. - -#ifndef __VEC_SCOPE__ -#define __VEC_SCOPE__ -#endif - -#if defined(__CCE_AICORE__) && defined(__NPU_ARCH__) && (__NPU_ARCH__ == 2201) -typedef struct { unsigned char v; } hifloat8_t; -typedef struct { unsigned char v; } float8_e4m3_t; -typedef struct { unsigned char v; } float8_e5m2_t; -typedef struct { unsigned char v; } float8_e8m0_t; -typedef struct { unsigned char v; } float4_e1m2x2_t; -typedef struct { unsigned char v; } float4_e2m1x2_t; -#endif -#include -#if defined(__CCE_AICORE__) && defined(PTOAS_ENABLE_CCE_PRINT) -#include -#endif - -#if !defined(__CCE_AICORE__) && !defined(TMRGSORT_HPP) -struct MrgSortExecutedNumList { - uint16_t mrgSortList0; - uint16_t mrgSortList1; - uint16_t mrgSortList2; - uint16_t mrgSortList3; -}; -#endif -#ifndef __CPU_SIM -#include "acl/acl.h" -#endif - -extern "C" __global__ [aicore] void vadds_bf16_kernel(__gm__ bfloat16_t *v1, - __gm__ bfloat16_t *v2); - -void LaunchVadds_bf16_kernel(uint16_t *v1, uint16_t *v2, void *stream) { - vadds_bf16_kernel<<<1, nullptr, stream>>>((__gm__ bfloat16_t *)v1, - (__gm__ bfloat16_t *)v2); -} diff --git a/test/vpto/cases/micro-op/vec-scalar/vadds-bf16/main.cpp b/test/vpto/cases/micro-op/vec-scalar/vadds-bf16/main.cpp deleted file mode 100644 index ce2c7d7bf..000000000 --- a/test/vpto/cases/micro-op/vec-scalar/vadds-bf16/main.cpp +++ /dev/null @@ -1,84 +0,0 @@ -// Copyright (c) 2026 Huawei Technologies Co., Ltd. -// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). -// Please refer to the License for details. 
You may not use this file except in compliance with the License. -// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -// See LICENSE in the root of the software repository for the full text of the License. - -/** -Copyright (c) 2025 Huawei Technologies Co., Ltd. -*/ - -#include "test_common.h" -#include "acl/acl.h" -#include -#include -#include - -using namespace PtoTestCommon; - -#define ACL_CHECK(expr) \ - do { \ - const aclError _ret = (expr); \ - if (_ret != ACL_SUCCESS) { \ - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", #expr, \ - (int)_ret, __FILE__, __LINE__); \ - rc = 1; \ - goto cleanup; \ - } \ - } while (0) - -void LaunchVadds_bf16_kernel(uint16_t *v1, uint16_t *v2, void *stream); - -int main() { - size_t elemCount_v1 = 1024; - size_t fileSize_v1 = elemCount_v1 * sizeof(uint16_t); - size_t elemCount_v2 = 1024; - size_t fileSize_v2 = elemCount_v2 * sizeof(uint16_t); - uint16_t *v1Host = nullptr; - uint16_t *v1Device = nullptr; - uint16_t *v2Host = nullptr; - uint16_t *v2Device = nullptr; - int rc = 0; - bool aclInited = false; - bool deviceSet = false; - int deviceId = 0; - aclrtStream stream = nullptr; - - ACL_CHECK(aclInit(nullptr)); - aclInited = true; - if (const char *envDevice = std::getenv("ACL_DEVICE_ID")) - deviceId = std::atoi(envDevice); - ACL_CHECK(aclrtSetDevice(deviceId)); - deviceSet = true; - ACL_CHECK(aclrtCreateStream(&stream)); - ACL_CHECK(aclrtMallocHost((void **)(&v1Host), fileSize_v1)); - ACL_CHECK(aclrtMallocHost((void **)(&v2Host), fileSize_v2)); - ACL_CHECK(aclrtMalloc((void **)&v1Device, fileSize_v1, ACL_MEM_MALLOC_HUGE_FIRST)); - ACL_CHECK(aclrtMalloc((void **)&v2Device, fileSize_v2, ACL_MEM_MALLOC_HUGE_FIRST)); - ReadFile("./v1.bin", fileSize_v1, v1Host, fileSize_v1); - ReadFile("./v2.bin", fileSize_v2, v2Host, fileSize_v2); - 
ACL_CHECK(aclrtMemcpy(v1Device, fileSize_v1, v1Host, fileSize_v1, - ACL_MEMCPY_HOST_TO_DEVICE)); - ACL_CHECK(aclrtMemcpy(v2Device, fileSize_v2, v2Host, fileSize_v2, - ACL_MEMCPY_HOST_TO_DEVICE)); - LaunchVadds_bf16_kernel(v1Device, v2Device, stream); - ACL_CHECK(aclrtSynchronizeStream(stream)); - ACL_CHECK(aclrtMemcpy(v2Host, fileSize_v2, v2Device, fileSize_v2, - ACL_MEMCPY_DEVICE_TO_HOST)); - WriteFile("./v2.bin", v2Host, fileSize_v2); - -cleanup: - aclrtFree(v1Device); - aclrtFree(v2Device); - aclrtFreeHost(v1Host); - aclrtFreeHost(v2Host); - if (stream != nullptr) - aclrtDestroyStream(stream); - if (deviceSet) - aclrtResetDevice(deviceId); - if (aclInited) - aclFinalize(); - return rc; -} diff --git a/test/vpto/cases/micro-op/vec-scalar/vadds-f16/compare.py b/test/vpto/cases/micro-op/vec-scalar/vadds-f16/compare.py deleted file mode 100644 index 1b47ca433..000000000 --- a/test/vpto/cases/micro-op/vec-scalar/vadds-f16/compare.py +++ /dev/null @@ -1,37 +0,0 @@ -#!/usr/bin/python3 -# Copyright (c) 2026 Huawei Technologies Co., Ltd. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. 
- - -import os -import sys -import numpy as np - - -def compare_bin(golden_path, output_path, dtype, eps): - if not os.path.exists(golden_path) or not os.path.exists(output_path): - return False - golden = np.fromfile(golden_path, dtype=dtype) - output = np.fromfile(output_path, dtype=dtype) - return golden.shape == output.shape and np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True) - - -def main(): - strict = os.getenv("COMPARE_STRICT", "1") != "0" - ok = compare_bin("golden_v2.bin", "v2.bin", np.float16, 5e-3) - if not ok: - if strict: - print("[ERROR] compare failed") - sys.exit(2) - print("[WARN] compare failed (non-gating)") - return - print("[INFO] compare passed") - - -if __name__ == "__main__": - main() diff --git a/test/vpto/cases/micro-op/vec-scalar/vadds-f16/golden.py b/test/vpto/cases/micro-op/vec-scalar/vadds-f16/golden.py deleted file mode 100644 index 019cf6980..000000000 --- a/test/vpto/cases/micro-op/vec-scalar/vadds-f16/golden.py +++ /dev/null @@ -1,44 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) 2026 Huawei Technologies Co., Ltd. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. 
- -# coding=utf-8 - -import argparse -from pathlib import Path - -import numpy as np - - -ELEMS = 1024 -SEED = 19 -SCALE = np.float16(1.5) - - -def generate(output_dir: Path, seed: int) -> None: - rng = np.random.default_rng(seed) - v1 = rng.uniform(-4.0, 4.0, size=ELEMS).astype(np.float16) - v2 = np.zeros(ELEMS, dtype=np.float16) - golden_v2 = (v1.astype(np.float32) + np.float32(SCALE)).astype(np.float16) - - output_dir.mkdir(parents=True, exist_ok=True) - v1.tofile(output_dir / "v1.bin") - v2.tofile(output_dir / "v2.bin") - golden_v2.tofile(output_dir / "golden_v2.bin") - - -def main() -> None: - parser = argparse.ArgumentParser() - parser.add_argument("--output-dir", type=Path, default=Path(".")) - parser.add_argument("--seed", type=int, default=SEED) - args = parser.parse_args() - generate(args.output_dir, args.seed) - - -if __name__ == "__main__": - main() diff --git a/test/vpto/cases/micro-op/vec-scalar/vadds-f16/kernel.pto b/test/vpto/cases/micro-op/vec-scalar/vadds-f16/kernel.pto deleted file mode 100644 index 798b9e92a..000000000 --- a/test/vpto/cases/micro-op/vec-scalar/vadds-f16/kernel.pto +++ /dev/null @@ -1,42 +0,0 @@ -module attributes {pto.target_arch = "a5", pto.kernel_kind = #pto.kernel_kind} { - func.func @vadds_f16_kernel(%arg0: !pto.ptr, - %arg1: !pto.ptr) attributes {pto.kernel} { - %c0 = arith.constant 0 : index - %c128 = arith.constant 128 : index - %c1024 = arith.constant 1024 : index - %c0_i64 = arith.constant 0 : i64 - %c1_i64 = arith.constant 1 : i64 - %c32_i64 = arith.constant 32 : i64 - %c64_i64 = arith.constant 64 : i64 - %c2048_i64 = arith.constant 2048 : i64 - %cst = arith.constant 1.500000e+00 : f16 - - %ub_in = pto.castptr %c0_i64 : i64 -> !pto.ptr - %ub_out = pto.castptr %c2048_i64 : i64 -> !pto.ptr - - %false = arith.constant false - pto.mte_gm_ub %arg0, %ub_in, %c0_i64, %c64_i64 - nburst(%c32_i64, %c64_i64, %c64_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 - - pto.set_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] - 
pto.wait_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] - - pto.vecscope { - %mask = pto.pset_b16 "PAT_ALL" : !pto.mask - scf.for %offset = %c0 to %c1024 step %c128 { - %vec = pto.vlds %ub_in[%offset] : !pto.ptr -> !pto.vreg<128xf16> - %sum = pto.vadds %vec, %cst, %mask : !pto.vreg<128xf16>, f16, !pto.mask -> !pto.vreg<128xf16> - pto.vsts %sum, %ub_out[%offset], %mask : !pto.vreg<128xf16>, !pto.ptr, !pto.mask - } - } - - pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.mte_ub_gm %ub_out, %arg1, %c64_i64 - nburst(%c32_i64, %c64_i64, %c64_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64 - pto.barrier #pto.pipe - return - } -} diff --git a/test/vpto/cases/micro-op/vec-scalar/vadds-f16/launch.cpp b/test/vpto/cases/micro-op/vec-scalar/vadds-f16/launch.cpp deleted file mode 100644 index e964f1539..000000000 --- a/test/vpto/cases/micro-op/vec-scalar/vadds-f16/launch.cpp +++ /dev/null @@ -1,44 +0,0 @@ -// Copyright (c) 2026 Huawei Technologies Co., Ltd. -// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). -// Please refer to the License for details. You may not use this file except in compliance with the License. -// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -// See LICENSE in the root of the software repository for the full text of the License. 
- -#ifndef __VEC_SCOPE__ -#define __VEC_SCOPE__ -#endif - -#if defined(__CCE_AICORE__) && defined(__NPU_ARCH__) && (__NPU_ARCH__ == 2201) -typedef struct { unsigned char v; } hifloat8_t; -typedef struct { unsigned char v; } float8_e4m3_t; -typedef struct { unsigned char v; } float8_e5m2_t; -typedef struct { unsigned char v; } float8_e8m0_t; -typedef struct { unsigned char v; } float4_e1m2x2_t; -typedef struct { unsigned char v; } float4_e2m1x2_t; -#endif -#include -#if defined(__CCE_AICORE__) && defined(PTOAS_ENABLE_CCE_PRINT) -#include -#endif - -#if !defined(__CCE_AICORE__) && !defined(TMRGSORT_HPP) -struct MrgSortExecutedNumList { - uint16_t mrgSortList0; - uint16_t mrgSortList1; - uint16_t mrgSortList2; - uint16_t mrgSortList3; -}; -#endif -#ifndef __CPU_SIM -#include "acl/acl.h" -#endif - -extern "C" __global__ [aicore] void vadds_f16_kernel(__gm__ half *v1, - __gm__ half *v2); - -void LaunchVadds_f16_kernel(uint16_t *v1, uint16_t *v2, void *stream) { - vadds_f16_kernel<<<1, nullptr, stream>>>((__gm__ half *)v1, - (__gm__ half *)v2); -} diff --git a/test/vpto/cases/micro-op/vec-scalar/vadds-f16/main.cpp b/test/vpto/cases/micro-op/vec-scalar/vadds-f16/main.cpp deleted file mode 100644 index 0e9c0076c..000000000 --- a/test/vpto/cases/micro-op/vec-scalar/vadds-f16/main.cpp +++ /dev/null @@ -1,84 +0,0 @@ -// Copyright (c) 2026 Huawei Technologies Co., Ltd. -// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). -// Please refer to the License for details. You may not use this file except in compliance with the License. -// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -// See LICENSE in the root of the software repository for the full text of the License. 
- -/** -Copyright (c) 2025 Huawei Technologies Co., Ltd. -*/ - -#include "test_common.h" -#include "acl/acl.h" -#include -#include -#include - -using namespace PtoTestCommon; - -#define ACL_CHECK(expr) \ - do { \ - const aclError _ret = (expr); \ - if (_ret != ACL_SUCCESS) { \ - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", #expr, \ - (int)_ret, __FILE__, __LINE__); \ - rc = 1; \ - goto cleanup; \ - } \ - } while (0) - -void LaunchVadds_f16_kernel(uint16_t *v1, uint16_t *v2, void *stream); - -int main() { - size_t elemCount_v1 = 1024; - size_t fileSize_v1 = elemCount_v1 * sizeof(uint16_t); - size_t elemCount_v2 = 1024; - size_t fileSize_v2 = elemCount_v2 * sizeof(uint16_t); - uint16_t *v1Host = nullptr; - uint16_t *v1Device = nullptr; - uint16_t *v2Host = nullptr; - uint16_t *v2Device = nullptr; - int rc = 0; - bool aclInited = false; - bool deviceSet = false; - int deviceId = 0; - aclrtStream stream = nullptr; - - ACL_CHECK(aclInit(nullptr)); - aclInited = true; - if (const char *envDevice = std::getenv("ACL_DEVICE_ID")) - deviceId = std::atoi(envDevice); - ACL_CHECK(aclrtSetDevice(deviceId)); - deviceSet = true; - ACL_CHECK(aclrtCreateStream(&stream)); - ACL_CHECK(aclrtMallocHost((void **)(&v1Host), fileSize_v1)); - ACL_CHECK(aclrtMallocHost((void **)(&v2Host), fileSize_v2)); - ACL_CHECK(aclrtMalloc((void **)&v1Device, fileSize_v1, ACL_MEM_MALLOC_HUGE_FIRST)); - ACL_CHECK(aclrtMalloc((void **)&v2Device, fileSize_v2, ACL_MEM_MALLOC_HUGE_FIRST)); - ReadFile("./v1.bin", fileSize_v1, v1Host, fileSize_v1); - ReadFile("./v2.bin", fileSize_v2, v2Host, fileSize_v2); - ACL_CHECK(aclrtMemcpy(v1Device, fileSize_v1, v1Host, fileSize_v1, - ACL_MEMCPY_HOST_TO_DEVICE)); - ACL_CHECK(aclrtMemcpy(v2Device, fileSize_v2, v2Host, fileSize_v2, - ACL_MEMCPY_HOST_TO_DEVICE)); - LaunchVadds_f16_kernel(v1Device, v2Device, stream); - ACL_CHECK(aclrtSynchronizeStream(stream)); - ACL_CHECK(aclrtMemcpy(v2Host, fileSize_v2, v2Device, fileSize_v2, - ACL_MEMCPY_DEVICE_TO_HOST)); - 
WriteFile("./v2.bin", v2Host, fileSize_v2); - -cleanup: - aclrtFree(v1Device); - aclrtFree(v2Device); - aclrtFreeHost(v1Host); - aclrtFreeHost(v2Host); - if (stream != nullptr) - aclrtDestroyStream(stream); - if (deviceSet) - aclrtResetDevice(deviceId); - if (aclInited) - aclFinalize(); - return rc; -} diff --git a/test/vpto/cases/micro-op/vec-scalar/vadds-f32-exceptional/compare.py b/test/vpto/cases/micro-op/vec-scalar/vadds-f32-exceptional/compare.py deleted file mode 100644 index 15b793fac..000000000 --- a/test/vpto/cases/micro-op/vec-scalar/vadds-f32-exceptional/compare.py +++ /dev/null @@ -1,37 +0,0 @@ -#!/usr/bin/python3 -# Copyright (c) 2026 Huawei Technologies Co., Ltd. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. 
- - -import os -import sys -import numpy as np - - -def compare_bin(golden_path, output_path, dtype, eps): - if not os.path.exists(golden_path) or not os.path.exists(output_path): - return False - golden = np.fromfile(golden_path, dtype=dtype) - output = np.fromfile(output_path, dtype=dtype) - return golden.shape == output.shape and np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True) - - -def main(): - strict = os.getenv("COMPARE_STRICT", "1") != "0" - ok = compare_bin("golden_v2.bin", "v2.bin", np.float32, 1e-4) - if not ok: - if strict: - print("[ERROR] compare failed") - sys.exit(2) - print("[WARN] compare failed (non-gating)") - return - print("[INFO] compare passed") - - -if __name__ == "__main__": - main() diff --git a/test/vpto/cases/micro-op/vec-scalar/vadds-f32-exceptional/golden.py b/test/vpto/cases/micro-op/vec-scalar/vadds-f32-exceptional/golden.py deleted file mode 100644 index c101038fb..000000000 --- a/test/vpto/cases/micro-op/vec-scalar/vadds-f32-exceptional/golden.py +++ /dev/null @@ -1,49 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) 2026 Huawei Technologies Co., Ltd. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. 
- -# coding=utf-8 - -import argparse -from pathlib import Path - -import numpy as np - - -ROWS = 32 -COLS = 32 -SEED = 19 -SCALE = np.float32(0.5) - - -def generate(output_dir: Path, seed: int) -> None: - del seed - specials = np.array( - [-np.inf, -1.0, -0.0, 0.0, 1.0, np.inf, np.nan, 3.5], - dtype=np.float32, - ) - v1 = np.resize(specials, ROWS * COLS).reshape(ROWS, COLS).astype(np.float32) - v2 = np.zeros((ROWS, COLS), dtype=np.float32) - golden_v2 = (v1 + SCALE).astype(np.float32, copy=False) - - output_dir.mkdir(parents=True, exist_ok=True) - v1.reshape(-1).tofile(output_dir / "v1.bin") - v2.reshape(-1).tofile(output_dir / "v2.bin") - golden_v2.reshape(-1).tofile(output_dir / "golden_v2.bin") - - -def main() -> None: - parser = argparse.ArgumentParser() - parser.add_argument("--output-dir", type=Path, default=Path(".")) - parser.add_argument("--seed", type=int, default=SEED) - args = parser.parse_args() - generate(args.output_dir, args.seed) - - -if __name__ == "__main__": - main() diff --git a/test/vpto/cases/micro-op/vec-scalar/vadds-f32-exceptional/kernel.pto b/test/vpto/cases/micro-op/vec-scalar/vadds-f32-exceptional/kernel.pto deleted file mode 100644 index 7c074f34e..000000000 --- a/test/vpto/cases/micro-op/vec-scalar/vadds-f32-exceptional/kernel.pto +++ /dev/null @@ -1,44 +0,0 @@ -module attributes {pto.target_arch = "a5", pto.kernel_kind = #pto.kernel_kind} { - func.func @vadds_f32_exceptional_kernel_2d(%arg0: !pto.ptr, - %arg1: !pto.ptr) attributes {pto.kernel} { - %c0 = arith.constant 0 : index - %c64 = arith.constant 64 : index - %c1024 = arith.constant 1024 : index - %c0_i64 = arith.constant 0 : i64 - %c1_i64 = arith.constant 1 : i64 - %c32_i64 = arith.constant 32 : i64 - %c128_i64 = arith.constant 128 : i64 - %c4096_i64 = arith.constant 4096 : i64 - %c1024_i32 = arith.constant 1024 : i32 - %cst = arith.constant 5.000000e-01 : f32 - - %ub_in = pto.castptr %c0_i64 : i64 -> !pto.ptr - %ub_out = pto.castptr %c4096_i64 : i64 -> !pto.ptr - - %false = 
arith.constant false - pto.mte_gm_ub %arg0, %ub_in, %c0_i64, %c128_i64 - nburst(%c32_i64, %c128_i64, %c128_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 - - pto.set_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] - pto.wait_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] - - pto.vecscope { - %_:1 = scf.for %offset = %c0 to %c1024 step %c64 iter_args(%remaining = %c1024_i32) -> (i32) { - %mask, %next_remaining = pto.plt_b32 %remaining : i32 -> !pto.mask, i32 - %vec = pto.vlds %ub_in[%offset] : !pto.ptr -> !pto.vreg<64xf32> - %sum = pto.vadds %vec, %cst, %mask : !pto.vreg<64xf32>, f32, !pto.mask -> !pto.vreg<64xf32> - pto.vsts %sum, %ub_out[%offset], %mask : !pto.vreg<64xf32>, !pto.ptr, !pto.mask - scf.yield %next_remaining : i32 - } - } - - pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.mte_ub_gm %ub_out, %arg1, %c128_i64 - nburst(%c32_i64, %c128_i64, %c128_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64 - pto.barrier #pto.pipe - return - } -} diff --git a/test/vpto/cases/micro-op/vec-scalar/vadds-f32-exceptional/launch.cpp b/test/vpto/cases/micro-op/vec-scalar/vadds-f32-exceptional/launch.cpp deleted file mode 100644 index 915da93e1..000000000 --- a/test/vpto/cases/micro-op/vec-scalar/vadds-f32-exceptional/launch.cpp +++ /dev/null @@ -1,44 +0,0 @@ -// Copyright (c) 2026 Huawei Technologies Co., Ltd. -// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). -// Please refer to the License for details. You may not use this file except in compliance with the License. -// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -// See LICENSE in the root of the software repository for the full text of the License. 
- -#ifndef __VEC_SCOPE__ -#define __VEC_SCOPE__ -#endif - -#if defined(__CCE_AICORE__) && defined(__NPU_ARCH__) && (__NPU_ARCH__ == 2201) -typedef struct { unsigned char v; } hifloat8_t; -typedef struct { unsigned char v; } float8_e4m3_t; -typedef struct { unsigned char v; } float8_e5m2_t; -typedef struct { unsigned char v; } float8_e8m0_t; -typedef struct { unsigned char v; } float4_e1m2x2_t; -typedef struct { unsigned char v; } float4_e2m1x2_t; -#endif -#include -#if defined(__CCE_AICORE__) && defined(PTOAS_ENABLE_CCE_PRINT) -#include -#endif - -#if !defined(__CCE_AICORE__) && !defined(TMRGSORT_HPP) -struct MrgSortExecutedNumList { - uint16_t mrgSortList0; - uint16_t mrgSortList1; - uint16_t mrgSortList2; - uint16_t mrgSortList3; -}; -#endif -#ifndef __CPU_SIM -#include "acl/acl.h" -#endif - -extern "C" __global__ [aicore] void vadds_f32_exceptional_kernel_2d(__gm__ float *v1, - __gm__ float *v2); - -void LaunchVadds_f32_exceptional_kernel_2d(float *v1, float *v2, void *stream) { - vadds_f32_exceptional_kernel_2d<<<1, nullptr, stream>>>((__gm__ float *)v1, - (__gm__ float *)v2); -} diff --git a/test/vpto/cases/micro-op/vec-scalar/vadds-f32-exceptional/main.cpp b/test/vpto/cases/micro-op/vec-scalar/vadds-f32-exceptional/main.cpp deleted file mode 100644 index 9ba910e1c..000000000 --- a/test/vpto/cases/micro-op/vec-scalar/vadds-f32-exceptional/main.cpp +++ /dev/null @@ -1,83 +0,0 @@ -// Copyright (c) 2026 Huawei Technologies Co., Ltd. -// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). -// Please refer to the License for details. You may not use this file except in compliance with the License. -// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. 
-// See LICENSE in the root of the software repository for the full text of the License. - -/** -Copyright (c) 2025 Huawei Technologies Co., Ltd. -*/ - -#include "test_common.h" -#include "acl/acl.h" -#include -#include - -using namespace PtoTestCommon; - -#define ACL_CHECK(expr) \ - do { \ - const aclError _ret = (expr); \ - if (_ret != ACL_SUCCESS) { \ - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", #expr, \ - (int)_ret, __FILE__, __LINE__); \ - rc = 1; \ - goto cleanup; \ - } \ - } while (0) - -void LaunchVadds_f32_exceptional_kernel_2d(float *v1, float *v2, void *stream); - -int main() { - size_t elemCount_v1 = 1024; - size_t fileSize_v1 = elemCount_v1 * sizeof(float); - size_t elemCount_v2 = 1024; - size_t fileSize_v2 = elemCount_v2 * sizeof(float); - float *v1Host = nullptr; - float *v1Device = nullptr; - float *v2Host = nullptr; - float *v2Device = nullptr; - int rc = 0; - bool aclInited = false; - bool deviceSet = false; - int deviceId = 0; - aclrtStream stream = nullptr; - - ACL_CHECK(aclInit(nullptr)); - aclInited = true; - if (const char *envDevice = std::getenv("ACL_DEVICE_ID")) - deviceId = std::atoi(envDevice); - ACL_CHECK(aclrtSetDevice(deviceId)); - deviceSet = true; - ACL_CHECK(aclrtCreateStream(&stream)); - ACL_CHECK(aclrtMallocHost((void **)(&v1Host), fileSize_v1)); - ACL_CHECK(aclrtMallocHost((void **)(&v2Host), fileSize_v2)); - ACL_CHECK(aclrtMalloc((void **)&v1Device, fileSize_v1, ACL_MEM_MALLOC_HUGE_FIRST)); - ACL_CHECK(aclrtMalloc((void **)&v2Device, fileSize_v2, ACL_MEM_MALLOC_HUGE_FIRST)); - ReadFile("./v1.bin", fileSize_v1, v1Host, fileSize_v1); - ReadFile("./v2.bin", fileSize_v2, v2Host, fileSize_v2); - ACL_CHECK(aclrtMemcpy(v1Device, fileSize_v1, v1Host, fileSize_v1, - ACL_MEMCPY_HOST_TO_DEVICE)); - ACL_CHECK(aclrtMemcpy(v2Device, fileSize_v2, v2Host, fileSize_v2, - ACL_MEMCPY_HOST_TO_DEVICE)); - LaunchVadds_f32_exceptional_kernel_2d(v1Device, v2Device, stream); - ACL_CHECK(aclrtSynchronizeStream(stream)); - 
ACL_CHECK(aclrtMemcpy(v2Host, fileSize_v2, v2Device, fileSize_v2, - ACL_MEMCPY_DEVICE_TO_HOST)); - WriteFile("./v2.bin", v2Host, fileSize_v2); - -cleanup: - aclrtFree(v1Device); - aclrtFree(v2Device); - aclrtFreeHost(v1Host); - aclrtFreeHost(v2Host); - if (stream != nullptr) - aclrtDestroyStream(stream); - if (deviceSet) - aclrtResetDevice(deviceId); - if (aclInited) - aclFinalize(); - return rc; -} diff --git a/test/vpto/cases/micro-op/vec-scalar/vadds-i16-signed-overflow/compare.py b/test/vpto/cases/micro-op/vec-scalar/vadds-i16-signed-overflow/compare.py deleted file mode 100644 index 3402d0a6d..000000000 --- a/test/vpto/cases/micro-op/vec-scalar/vadds-i16-signed-overflow/compare.py +++ /dev/null @@ -1,42 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) 2026 Huawei Technologies Co., Ltd. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. 
- -# case: micro-op/vec-scalar/vadds-i16-signed-overflow -# family: vec-scalar -# target_ops: pto.vadds -# scenarios: core-i16-signed, full-mask, scalar-operand, integer-overflow - -import os -import sys - -import numpy as np - - -def compare_bin(golden_path: str, output_path: str, dtype) -> bool: - if not os.path.exists(golden_path) or not os.path.exists(output_path): - return False - golden = np.fromfile(golden_path, dtype=dtype) - output = np.fromfile(output_path, dtype=dtype) - return golden.shape == output.shape and np.array_equal(golden, output) - - -def main() -> None: - strict = os.getenv("COMPARE_STRICT", "1") != "0" - ok = compare_bin("golden_v2.bin", "v2.bin", np.int16) - if not ok: - if strict: - print("[ERROR] compare failed") - sys.exit(2) - print("[WARN] compare failed (non-gating)") - return - print("[INFO] compare passed") - - -if __name__ == "__main__": - main() diff --git a/test/vpto/cases/micro-op/vec-scalar/vadds-i16-signed-overflow/golden.py b/test/vpto/cases/micro-op/vec-scalar/vadds-i16-signed-overflow/golden.py deleted file mode 100644 index fcf4c5afb..000000000 --- a/test/vpto/cases/micro-op/vec-scalar/vadds-i16-signed-overflow/golden.py +++ /dev/null @@ -1,64 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) 2026 Huawei Technologies Co., Ltd. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. 
- -# case: micro-op/vec-scalar/vadds-i16-signed-overflow -# family: vec-scalar -# target_ops: pto.vadds -# scenarios: core-i16-signed, full-mask, scalar-operand, integer-overflow - -import argparse -from pathlib import Path - -import numpy as np - - -ELEMS = 1024 -SEED = 19 -SCALAR = np.int16(1024) - - -def generate(output_dir: Path, seed: int) -> None: - rng = np.random.default_rng(seed) - v1 = rng.integers(-16000, 16000, size=ELEMS, dtype=np.int16) - v1[:12] = np.array( - [ - 32767, - 32766, - 32760, - 32000, - 0, - 1, - -1, - -32768, - -32767, - -32000, - 12345, - -12345, - ], - dtype=np.int16, - ) - v2 = np.zeros(ELEMS, dtype=np.int16) - golden_v2 = (v1.astype(np.int32) + int(SCALAR)).astype(np.int16) - - output_dir.mkdir(parents=True, exist_ok=True) - v1.tofile(output_dir / "v1.bin") - v2.tofile(output_dir / "v2.bin") - golden_v2.tofile(output_dir / "golden_v2.bin") - - -def main() -> None: - parser = argparse.ArgumentParser() - parser.add_argument("--output-dir", type=Path, default=Path(".")) - parser.add_argument("--seed", type=int, default=SEED) - args = parser.parse_args() - generate(args.output_dir, args.seed) - - -if __name__ == "__main__": - main() diff --git a/test/vpto/cases/micro-op/vec-scalar/vadds-i16-signed-overflow/kernel.pto b/test/vpto/cases/micro-op/vec-scalar/vadds-i16-signed-overflow/kernel.pto deleted file mode 100644 index 1fa3b62d1..000000000 --- a/test/vpto/cases/micro-op/vec-scalar/vadds-i16-signed-overflow/kernel.pto +++ /dev/null @@ -1,48 +0,0 @@ -// ----------------------------------------------------------------------------- -// case: micro-op/vec-scalar/vadds-i16-signed-overflow -// family: vec-scalar -// target_ops: pto.vadds -// scenarios: core-i16-signed, full-mask, scalar-operand, integer-overflow -// ----------------------------------------------------------------------------- -module attributes {pto.target_arch = "a5", pto.kernel_kind = #pto.kernel_kind} { - func.func @vadds_i16_signed_overflow_kernel(%arg0: !pto.ptr, - 
%arg1: !pto.ptr) attributes {pto.kernel} { - %c0 = arith.constant 0 : index - %c128 = arith.constant 128 : index - %c1024 = arith.constant 1024 : index - %c0_i64 = arith.constant 0 : i64 - %c1_i64 = arith.constant 1 : i64 - %c32_i64 = arith.constant 32 : i64 - %c64_i64 = arith.constant 64 : i64 - %c2048_i64 = arith.constant 2048 : i64 - %scalar = arith.constant 1024 : i16 - - %ub_in = pto.castptr %c0_i64 : i64 -> !pto.ptr - %ub_out = pto.castptr %c2048_i64 : i64 -> !pto.ptr - - %false = arith.constant false - pto.mte_gm_ub %arg0, %ub_in, %c0_i64, %c64_i64 - nburst(%c32_i64, %c64_i64, %c64_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 - - pto.set_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] - pto.wait_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] - - pto.vecscope { - %mask = pto.pset_b16 "PAT_ALL" : !pto.mask - scf.for %offset = %c0 to %c1024 step %c128 { - %vec = pto.vlds %ub_in[%offset] : !pto.ptr -> !pto.vreg<128xi16> - %sum = pto.vadds %vec, %scalar, %mask : !pto.vreg<128xi16>, i16, !pto.mask -> !pto.vreg<128xi16> - pto.vsts %sum, %ub_out[%offset], %mask : !pto.vreg<128xi16>, !pto.ptr, !pto.mask - } - } - - pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.mte_ub_gm %ub_out, %arg1, %c64_i64 - nburst(%c32_i64, %c64_i64, %c64_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64 - pto.barrier #pto.pipe - return - } -} diff --git a/test/vpto/cases/micro-op/vec-scalar/vadds-i16-signed-overflow/launch.cpp b/test/vpto/cases/micro-op/vec-scalar/vadds-i16-signed-overflow/launch.cpp deleted file mode 100644 index 4ac3e8a59..000000000 --- a/test/vpto/cases/micro-op/vec-scalar/vadds-i16-signed-overflow/launch.cpp +++ /dev/null @@ -1,51 +0,0 @@ -// Copyright (c) 2026 Huawei Technologies Co., Ltd. -// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). -// Please refer to the License for details. 
You may not use this file except in compliance with the License. -// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -// See LICENSE in the root of the software repository for the full text of the License. - -// ----------------------------------------------------------------------------- -// case: micro-op/vec-scalar/vadds-i16-signed-overflow -// family: vec-scalar -// target_ops: pto.vadds -// scenarios: core-i16-signed, full-mask, scalar-operand, integer-overflow -// ----------------------------------------------------------------------------- -#ifndef __VEC_SCOPE__ -#define __VEC_SCOPE__ -#endif - -#if defined(__CCE_AICORE__) && defined(__NPU_ARCH__) && (__NPU_ARCH__ == 2201) -typedef struct { unsigned char v; } hifloat8_t; -typedef struct { unsigned char v; } float8_e4m3_t; -typedef struct { unsigned char v; } float8_e5m2_t; -typedef struct { unsigned char v; } float8_e8m0_t; -typedef struct { unsigned char v; } float4_e1m2x2_t; -typedef struct { unsigned char v; } float4_e2m1x2_t; -#endif -#include -#if defined(__CCE_AICORE__) && defined(PTOAS_ENABLE_CCE_PRINT) -#include -#endif - -#if !defined(__CCE_AICORE__) && !defined(TMRGSORT_HPP) -struct MrgSortExecutedNumList { - uint16_t mrgSortList0; - uint16_t mrgSortList1; - uint16_t mrgSortList2; - uint16_t mrgSortList3; -}; -#endif -#ifndef __CPU_SIM -#include "acl/acl.h" -#endif - -extern "C" __global__ [aicore] void -vadds_i16_signed_overflow_kernel(__gm__ int16_t *v1, __gm__ int16_t *v2); - -void LaunchVadds_i16_signed_overflow_kernel(int16_t *v1, int16_t *v2, - void *stream) { - vadds_i16_signed_overflow_kernel<<<1, nullptr, stream>>>((__gm__ int16_t *)v1, - (__gm__ int16_t *)v2); -} diff --git a/test/vpto/cases/micro-op/vec-scalar/vadds-i16-signed-overflow/main.cpp b/test/vpto/cases/micro-op/vec-scalar/vadds-i16-signed-overflow/main.cpp deleted 
file mode 100644 index b8c63d7a8..000000000 --- a/test/vpto/cases/micro-op/vec-scalar/vadds-i16-signed-overflow/main.cpp +++ /dev/null @@ -1,81 +0,0 @@ -// Copyright (c) 2026 Huawei Technologies Co., Ltd. -// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). -// Please refer to the License for details. You may not use this file except in compliance with the License. -// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -// See LICENSE in the root of the software repository for the full text of the License. - -#include "test_common.h" -#include "acl/acl.h" -#include -#include -#include - -using namespace PtoTestCommon; - -#define ACL_CHECK(expr) \ - do { \ - const aclError _ret = (expr); \ - if (_ret != ACL_SUCCESS) { \ - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", #expr, \ - (int)_ret, __FILE__, __LINE__); \ - rc = 1; \ - goto cleanup; \ - } \ - } while (0) - -void LaunchVadds_i16_signed_overflow_kernel(int16_t *v1, int16_t *v2, - void *stream); - -int main() { - size_t elemCount_v1 = 1024; - size_t fileSize_v1 = elemCount_v1 * sizeof(int16_t); - size_t elemCount_v2 = 1024; - size_t fileSize_v2 = elemCount_v2 * sizeof(int16_t); - int16_t *v1Host = nullptr; - int16_t *v1Device = nullptr; - int16_t *v2Host = nullptr; - int16_t *v2Device = nullptr; - int rc = 0; - bool aclInited = false; - bool deviceSet = false; - int deviceId = 0; - aclrtStream stream = nullptr; - - ACL_CHECK(aclInit(nullptr)); - aclInited = true; - if (const char *envDevice = std::getenv("ACL_DEVICE_ID")) - deviceId = std::atoi(envDevice); - ACL_CHECK(aclrtSetDevice(deviceId)); - deviceSet = true; - ACL_CHECK(aclrtCreateStream(&stream)); - ACL_CHECK(aclrtMallocHost((void **)(&v1Host), fileSize_v1)); - 
ACL_CHECK(aclrtMallocHost((void **)(&v2Host), fileSize_v2)); - ACL_CHECK(aclrtMalloc((void **)&v1Device, fileSize_v1, ACL_MEM_MALLOC_HUGE_FIRST)); - ACL_CHECK(aclrtMalloc((void **)&v2Device, fileSize_v2, ACL_MEM_MALLOC_HUGE_FIRST)); - ReadFile("./v1.bin", fileSize_v1, v1Host, fileSize_v1); - ReadFile("./v2.bin", fileSize_v2, v2Host, fileSize_v2); - ACL_CHECK(aclrtMemcpy(v1Device, fileSize_v1, v1Host, fileSize_v1, - ACL_MEMCPY_HOST_TO_DEVICE)); - ACL_CHECK(aclrtMemcpy(v2Device, fileSize_v2, v2Host, fileSize_v2, - ACL_MEMCPY_HOST_TO_DEVICE)); - LaunchVadds_i16_signed_overflow_kernel(v1Device, v2Device, stream); - ACL_CHECK(aclrtSynchronizeStream(stream)); - ACL_CHECK(aclrtMemcpy(v2Host, fileSize_v2, v2Device, fileSize_v2, - ACL_MEMCPY_DEVICE_TO_HOST)); - WriteFile("./v2.bin", v2Host, fileSize_v2); - -cleanup: - aclrtFree(v1Device); - aclrtFree(v2Device); - aclrtFreeHost(v1Host); - aclrtFreeHost(v2Host); - if (stream != nullptr) - aclrtDestroyStream(stream); - if (deviceSet) - aclrtResetDevice(deviceId); - if (aclInited) - aclFinalize(); - return rc; -} diff --git a/test/vpto/cases/micro-op/vec-scalar/vadds-i16-signed/compare.py b/test/vpto/cases/micro-op/vec-scalar/vadds-i16-signed/compare.py deleted file mode 100644 index 421ac84f5..000000000 --- a/test/vpto/cases/micro-op/vec-scalar/vadds-i16-signed/compare.py +++ /dev/null @@ -1,42 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) 2026 Huawei Technologies Co., Ltd. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. 
-# See LICENSE in the root of the software repository for the full text of the License. - -# case: micro-op/vec-scalar/vadds-i16-signed -# family: vec-scalar -# target_ops: pto.vadds -# scenarios: core-i16-signed, full-mask, scalar-operand - -import os -import sys - -import numpy as np - - -def compare_bin(golden_path: str, output_path: str, dtype) -> bool: - if not os.path.exists(golden_path) or not os.path.exists(output_path): - return False - golden = np.fromfile(golden_path, dtype=dtype) - output = np.fromfile(output_path, dtype=dtype) - return golden.shape == output.shape and np.array_equal(golden, output) - - -def main() -> None: - strict = os.getenv("COMPARE_STRICT", "1") != "0" - ok = compare_bin("golden_v2.bin", "v2.bin", np.int16) - if not ok: - if strict: - print("[ERROR] compare failed") - sys.exit(2) - print("[WARN] compare failed (non-gating)") - return - print("[INFO] compare passed") - - -if __name__ == "__main__": - main() diff --git a/test/vpto/cases/micro-op/vec-scalar/vadds-i16-signed/golden.py b/test/vpto/cases/micro-op/vec-scalar/vadds-i16-signed/golden.py deleted file mode 100644 index 23d6855f0..000000000 --- a/test/vpto/cases/micro-op/vec-scalar/vadds-i16-signed/golden.py +++ /dev/null @@ -1,47 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) 2026 Huawei Technologies Co., Ltd. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. 
- -# case: micro-op/vec-scalar/vadds-i16-signed -# family: vec-scalar -# target_ops: pto.vadds -# scenarios: core-i16-signed, full-mask, scalar-operand - -import argparse -from pathlib import Path - -import numpy as np - - -ELEMS = 1024 -SEED = 19 -SCALAR = np.int16(37) - - -def generate(output_dir: Path, seed: int) -> None: - rng = np.random.default_rng(seed) - v1 = rng.integers(-12000, 12000, size=ELEMS, dtype=np.int16) - v2 = np.zeros(ELEMS, dtype=np.int16) - golden_v2 = (v1.astype(np.int32) + int(SCALAR)).astype(np.int16) - - output_dir.mkdir(parents=True, exist_ok=True) - v1.tofile(output_dir / "v1.bin") - v2.tofile(output_dir / "v2.bin") - golden_v2.tofile(output_dir / "golden_v2.bin") - - -def main() -> None: - parser = argparse.ArgumentParser() - parser.add_argument("--output-dir", type=Path, default=Path(".")) - parser.add_argument("--seed", type=int, default=SEED) - args = parser.parse_args() - generate(args.output_dir, args.seed) - - -if __name__ == "__main__": - main() diff --git a/test/vpto/cases/micro-op/vec-scalar/vadds-i16-signed/kernel.pto b/test/vpto/cases/micro-op/vec-scalar/vadds-i16-signed/kernel.pto deleted file mode 100644 index 9ebf5fa9f..000000000 --- a/test/vpto/cases/micro-op/vec-scalar/vadds-i16-signed/kernel.pto +++ /dev/null @@ -1,48 +0,0 @@ -// ----------------------------------------------------------------------------- -// case: micro-op/vec-scalar/vadds-i16-signed -// family: vec-scalar -// target_ops: pto.vadds -// scenarios: core-i16-signed, full-mask, scalar-operand -// ----------------------------------------------------------------------------- -module attributes {pto.target_arch = "a5", pto.kernel_kind = #pto.kernel_kind} { - func.func @vadds_i16_signed_kernel(%arg0: !pto.ptr, %arg1: !pto.ptr) attributes {pto.kernel} { - %c0 = arith.constant 0 : index - %c128 = arith.constant 128 : index - %c1024 = arith.constant 1024 : index - %c0_i64 = arith.constant 0 : i64 - %c1_i64 = arith.constant 1 : i64 - %c32_i64 = arith.constant 32 
: i64 - %c64_i64 = arith.constant 64 : i64 - %c2048_i64 = arith.constant 2048 : i64 - %c1024_i32 = arith.constant 1024 : i32 - %scalar = arith.constant 37 : i16 - - %ub_in = pto.castptr %c0_i64 : i64 -> !pto.ptr - %ub_out = pto.castptr %c2048_i64 : i64 -> !pto.ptr - - %false = arith.constant false - pto.mte_gm_ub %arg0, %ub_in, %c0_i64, %c64_i64 - nburst(%c32_i64, %c64_i64, %c64_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 - - pto.set_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] - pto.wait_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] - - pto.vecscope { - %mask = pto.pset_b16 "PAT_ALL" : !pto.mask - scf.for %offset = %c0 to %c1024 step %c128 { - %vec = pto.vlds %ub_in[%offset] : !pto.ptr -> !pto.vreg<128xi16> - %sum = pto.vadds %vec, %scalar, %mask : !pto.vreg<128xi16>, i16, !pto.mask -> !pto.vreg<128xi16> - pto.vsts %sum, %ub_out[%offset], %mask : !pto.vreg<128xi16>, !pto.ptr, !pto.mask - } - } - - pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.mte_ub_gm %ub_out, %arg1, %c64_i64 - nburst(%c32_i64, %c64_i64, %c64_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64 - pto.barrier #pto.pipe - return - } -} diff --git a/test/vpto/cases/micro-op/vec-scalar/vadds-i16-signed/launch.cpp b/test/vpto/cases/micro-op/vec-scalar/vadds-i16-signed/launch.cpp deleted file mode 100644 index fd275ede0..000000000 --- a/test/vpto/cases/micro-op/vec-scalar/vadds-i16-signed/launch.cpp +++ /dev/null @@ -1,44 +0,0 @@ -// Copyright (c) 2026 Huawei Technologies Co., Ltd. -// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). -// Please refer to the License for details. You may not use this file except in compliance with the License. 
-// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -// See LICENSE in the root of the software repository for the full text of the License. - -#ifndef __VEC_SCOPE__ -#define __VEC_SCOPE__ -#endif - -#if defined(__CCE_AICORE__) && defined(__NPU_ARCH__) && (__NPU_ARCH__ == 2201) -typedef struct { unsigned char v; } hifloat8_t; -typedef struct { unsigned char v; } float8_e4m3_t; -typedef struct { unsigned char v; } float8_e5m2_t; -typedef struct { unsigned char v; } float8_e8m0_t; -typedef struct { unsigned char v; } float4_e1m2x2_t; -typedef struct { unsigned char v; } float4_e2m1x2_t; -#endif -#include -#if defined(__CCE_AICORE__) && defined(PTOAS_ENABLE_CCE_PRINT) -#include -#endif - -#if !defined(__CCE_AICORE__) && !defined(TMRGSORT_HPP) -struct MrgSortExecutedNumList { - uint16_t mrgSortList0; - uint16_t mrgSortList1; - uint16_t mrgSortList2; - uint16_t mrgSortList3; -}; -#endif -#ifndef __CPU_SIM -#include "acl/acl.h" -#endif - -extern "C" __global__ [aicore] void vadds_i16_signed_kernel(__gm__ int16_t *v1, - __gm__ int16_t *v2); - -void LaunchVadds_i16_signed_kernel(int16_t *v1, int16_t *v2, void *stream) { - vadds_i16_signed_kernel<<<1, nullptr, stream>>>((__gm__ int16_t *)v1, - (__gm__ int16_t *)v2); -} diff --git a/test/vpto/cases/micro-op/vec-scalar/vadds-i16-signed/main.cpp b/test/vpto/cases/micro-op/vec-scalar/vadds-i16-signed/main.cpp deleted file mode 100644 index 3cbb6afba..000000000 --- a/test/vpto/cases/micro-op/vec-scalar/vadds-i16-signed/main.cpp +++ /dev/null @@ -1,80 +0,0 @@ -// Copyright (c) 2026 Huawei Technologies Co., Ltd. -// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). -// Please refer to the License for details. 
You may not use this file except in compliance with the License. -// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -// See LICENSE in the root of the software repository for the full text of the License. - -#include "test_common.h" -#include "acl/acl.h" -#include -#include -#include - -using namespace PtoTestCommon; - -#define ACL_CHECK(expr) \ - do { \ - const aclError _ret = (expr); \ - if (_ret != ACL_SUCCESS) { \ - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", #expr, \ - (int)_ret, __FILE__, __LINE__); \ - rc = 1; \ - goto cleanup; \ - } \ - } while (0) - -void LaunchVadds_i16_signed_kernel(int16_t *v1, int16_t *v2, void *stream); - -int main() { - size_t elemCount_v1 = 1024; - size_t fileSize_v1 = elemCount_v1 * sizeof(int16_t); - size_t elemCount_v2 = 1024; - size_t fileSize_v2 = elemCount_v2 * sizeof(int16_t); - int16_t *v1Host = nullptr; - int16_t *v1Device = nullptr; - int16_t *v2Host = nullptr; - int16_t *v2Device = nullptr; - int rc = 0; - bool aclInited = false; - bool deviceSet = false; - int deviceId = 0; - aclrtStream stream = nullptr; - - ACL_CHECK(aclInit(nullptr)); - aclInited = true; - if (const char *envDevice = std::getenv("ACL_DEVICE_ID")) - deviceId = std::atoi(envDevice); - ACL_CHECK(aclrtSetDevice(deviceId)); - deviceSet = true; - ACL_CHECK(aclrtCreateStream(&stream)); - ACL_CHECK(aclrtMallocHost((void **)(&v1Host), fileSize_v1)); - ACL_CHECK(aclrtMallocHost((void **)(&v2Host), fileSize_v2)); - ACL_CHECK(aclrtMalloc((void **)&v1Device, fileSize_v1, ACL_MEM_MALLOC_HUGE_FIRST)); - ACL_CHECK(aclrtMalloc((void **)&v2Device, fileSize_v2, ACL_MEM_MALLOC_HUGE_FIRST)); - ReadFile("./v1.bin", fileSize_v1, v1Host, fileSize_v1); - ReadFile("./v2.bin", fileSize_v2, v2Host, fileSize_v2); - ACL_CHECK(aclrtMemcpy(v1Device, fileSize_v1, v1Host, fileSize_v1, - 
ACL_MEMCPY_HOST_TO_DEVICE)); - ACL_CHECK(aclrtMemcpy(v2Device, fileSize_v2, v2Host, fileSize_v2, - ACL_MEMCPY_HOST_TO_DEVICE)); - LaunchVadds_i16_signed_kernel(v1Device, v2Device, stream); - ACL_CHECK(aclrtSynchronizeStream(stream)); - ACL_CHECK(aclrtMemcpy(v2Host, fileSize_v2, v2Device, fileSize_v2, - ACL_MEMCPY_DEVICE_TO_HOST)); - WriteFile("./v2.bin", v2Host, fileSize_v2); - -cleanup: - aclrtFree(v1Device); - aclrtFree(v2Device); - aclrtFreeHost(v1Host); - aclrtFreeHost(v2Host); - if (stream != nullptr) - aclrtDestroyStream(stream); - if (deviceSet) - aclrtResetDevice(deviceId); - if (aclInited) - aclFinalize(); - return rc; -} diff --git a/test/vpto/cases/micro-op/vec-scalar/vadds-i16-unsigned-overflow/compare.py b/test/vpto/cases/micro-op/vec-scalar/vadds-i16-unsigned-overflow/compare.py deleted file mode 100755 index a1b852540..000000000 --- a/test/vpto/cases/micro-op/vec-scalar/vadds-i16-unsigned-overflow/compare.py +++ /dev/null @@ -1,41 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) 2026 Huawei Technologies Co., Ltd. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. 
- -# case: micro-op/vec-scalar/vadds-i16-unsigned-overflow -# family: vec-scalar -# target_ops: pto.vadds -# scenarios: core-i16-unsigned, full-mask, scalar-operand, integer-overflow - -import os -import sys -import numpy as np - - -def compare_bin(golden_path, output_path, dtype): - if not os.path.exists(golden_path) or not os.path.exists(output_path): - return False - golden = np.fromfile(golden_path, dtype=dtype) - output = np.fromfile(output_path, dtype=dtype) - return golden.shape == output.shape and np.array_equal(golden, output) - - -def main(): - strict = os.getenv("COMPARE_STRICT", "1") != "0" - ok = compare_bin("golden_v2.bin", "v2.bin", np.uint16) - if not ok: - if strict: - print("[ERROR] compare failed") - sys.exit(2) - print("[WARN] compare failed (non-gating)") - return - print("[INFO] compare passed") - - -if __name__ == "__main__": - main() diff --git a/test/vpto/cases/micro-op/vec-scalar/vadds-i16-unsigned-overflow/golden.py b/test/vpto/cases/micro-op/vec-scalar/vadds-i16-unsigned-overflow/golden.py deleted file mode 100755 index 813ec0287..000000000 --- a/test/vpto/cases/micro-op/vec-scalar/vadds-i16-unsigned-overflow/golden.py +++ /dev/null @@ -1,64 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) 2026 Huawei Technologies Co., Ltd. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. 
- -# case: micro-op/vec-scalar/vadds-i16-unsigned-overflow -# family: vec-scalar -# target_ops: pto.vadds -# scenarios: core-i16-unsigned, full-mask, scalar-operand, integer-overflow - -import argparse -from pathlib import Path - -import numpy as np - - -ELEMS = 1024 -SEED = 19 -SCALAR = np.uint16(4096) - - -def generate(output_dir: Path, seed: int) -> None: - rng = np.random.default_rng(seed) - v1 = rng.integers(0, 65535, size=ELEMS, dtype=np.uint16) - v1[:12] = np.array( - [ - 65535, - 65534, - 65500, - 65000, - 4096, - 2048, - 1024, - 1, - 0, - 32768, - 12345, - 54321, - ], - dtype=np.uint16, - ) - v2 = np.zeros(ELEMS, dtype=np.uint16) - golden_v2 = (v1.astype(np.uint32) + int(SCALAR)).astype(np.uint16) - - output_dir.mkdir(parents=True, exist_ok=True) - v1.tofile(output_dir / "v1.bin") - v2.tofile(output_dir / "v2.bin") - golden_v2.tofile(output_dir / "golden_v2.bin") - - -def main() -> None: - parser = argparse.ArgumentParser() - parser.add_argument("--output-dir", type=Path, default=Path(".")) - parser.add_argument("--seed", type=int, default=SEED) - args = parser.parse_args() - generate(args.output_dir, args.seed) - - -if __name__ == "__main__": - main() diff --git a/test/vpto/cases/micro-op/vec-scalar/vadds-i16-unsigned-overflow/kernel.pto b/test/vpto/cases/micro-op/vec-scalar/vadds-i16-unsigned-overflow/kernel.pto deleted file mode 100644 index 5918a0212..000000000 --- a/test/vpto/cases/micro-op/vec-scalar/vadds-i16-unsigned-overflow/kernel.pto +++ /dev/null @@ -1,47 +0,0 @@ -// ----------------------------------------------------------------------------- -// case: micro-op/vec-scalar/vadds-i16-unsigned-overflow -// family: vec-scalar -// target_ops: pto.vadds -// scenarios: core-i16-unsigned, full-mask, scalar-operand, integer-overflow -// ----------------------------------------------------------------------------- -module attributes {pto.target_arch = "a5", pto.kernel_kind = #pto.kernel_kind} { - func.func @vadds_i16_unsigned_overflow_kernel(%arg0: 
!pto.ptr, - %arg1: !pto.ptr) attributes {pto.kernel} { - %c0 = arith.constant 0 : index - %c128 = arith.constant 128 : index - %c1024 = arith.constant 1024 : index - %c0_i64 = arith.constant 0 : i64 - %c1_i64 = arith.constant 1 : i64 - %c32_i64 = arith.constant 32 : i64 - %c64_i64 = arith.constant 64 : i64 - %c2048_i64 = arith.constant 2048 : i64 - %scalar = arith.constant 4096 : i16 - %ub_in = pto.castptr %c0_i64 : i64 -> !pto.ptr - %ub_out = pto.castptr %c2048_i64 : i64 -> !pto.ptr - - %false = arith.constant false - pto.mte_gm_ub %arg0, %ub_in, %c0_i64, %c64_i64 - nburst(%c32_i64, %c64_i64, %c64_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 - - pto.set_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] - pto.wait_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] - - pto.vecscope { - %mask = pto.pset_b16 "PAT_ALL" : !pto.mask - scf.for %offset = %c0 to %c1024 step %c128 { - %vec = pto.vlds %ub_in[%offset] : !pto.ptr -> !pto.vreg<128xui16> - %sum = pto.vadds %vec, %scalar, %mask : !pto.vreg<128xui16>, i16, !pto.mask -> !pto.vreg<128xui16> - pto.vsts %sum, %ub_out[%offset], %mask : !pto.vreg<128xui16>, !pto.ptr, !pto.mask - } - } - - pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.mte_ub_gm %ub_out, %arg1, %c64_i64 - nburst(%c32_i64, %c64_i64, %c64_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64 - pto.barrier #pto.pipe - return - } -} diff --git a/test/vpto/cases/micro-op/vec-scalar/vadds-i16-unsigned-overflow/launch.cpp b/test/vpto/cases/micro-op/vec-scalar/vadds-i16-unsigned-overflow/launch.cpp deleted file mode 100644 index 003c7556b..000000000 --- a/test/vpto/cases/micro-op/vec-scalar/vadds-i16-unsigned-overflow/launch.cpp +++ /dev/null @@ -1,51 +0,0 @@ -// Copyright (c) 2026 Huawei Technologies Co., Ltd. -// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). 
-// Please refer to the License for details. You may not use this file except in compliance with the License. -// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -// See LICENSE in the root of the software repository for the full text of the License. - -// ----------------------------------------------------------------------------- -// case: micro-op/vec-scalar/vadds-i16-unsigned-overflow -// family: vec-scalar -// target_ops: pto.vadds -// scenarios: core-i16-unsigned, full-mask, scalar-operand, integer-overflow -// ----------------------------------------------------------------------------- -#ifndef __VEC_SCOPE__ -#define __VEC_SCOPE__ -#endif - -#if defined(__CCE_AICORE__) && defined(__NPU_ARCH__) && (__NPU_ARCH__ == 2201) -typedef struct { unsigned char v; } hifloat8_t; -typedef struct { unsigned char v; } float8_e4m3_t; -typedef struct { unsigned char v; } float8_e5m2_t; -typedef struct { unsigned char v; } float8_e8m0_t; -typedef struct { unsigned char v; } float4_e1m2x2_t; -typedef struct { unsigned char v; } float4_e2m1x2_t; -#endif -#include -#if defined(__CCE_AICORE__) && defined(PTOAS_ENABLE_CCE_PRINT) -#include -#endif - -#if !defined(__CCE_AICORE__) && !defined(TMRGSORT_HPP) -struct MrgSortExecutedNumList { - uint16_t mrgSortList0; - uint16_t mrgSortList1; - uint16_t mrgSortList2; - uint16_t mrgSortList3; -}; -#endif -#ifndef __CPU_SIM -#include "acl/acl.h" -#endif - -extern "C" __global__ [aicore] void -vadds_i16_unsigned_overflow_kernel(__gm__ uint16_t *v1, __gm__ uint16_t *v2); - -void LaunchVadds_i16_unsigned_overflow_kernel(uint16_t *v1, uint16_t *v2, - void *stream) { - vadds_i16_unsigned_overflow_kernel<<<1, nullptr, stream>>>( - (__gm__ uint16_t *)v1, (__gm__ uint16_t *)v2); -} diff --git a/test/vpto/cases/micro-op/vec-scalar/vadds-i16-unsigned-overflow/main.cpp 
b/test/vpto/cases/micro-op/vec-scalar/vadds-i16-unsigned-overflow/main.cpp deleted file mode 100644 index 8f8b2ebe6..000000000 --- a/test/vpto/cases/micro-op/vec-scalar/vadds-i16-unsigned-overflow/main.cpp +++ /dev/null @@ -1,81 +0,0 @@ -// Copyright (c) 2026 Huawei Technologies Co., Ltd. -// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). -// Please refer to the License for details. You may not use this file except in compliance with the License. -// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -// See LICENSE in the root of the software repository for the full text of the License. - -#include "test_common.h" -#include "acl/acl.h" -#include -#include -#include - -using namespace PtoTestCommon; - -#define ACL_CHECK(expr) \ - do { \ - const aclError _ret = (expr); \ - if (_ret != ACL_SUCCESS) { \ - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", #expr, \ - (int)_ret, __FILE__, __LINE__); \ - rc = 1; \ - goto cleanup; \ - } \ - } while (0) - -void LaunchVadds_i16_unsigned_overflow_kernel(uint16_t *v1, uint16_t *v2, - void *stream); - -int main() { - size_t elemCount_v1 = 1024; - size_t fileSize_v1 = elemCount_v1 * sizeof(uint16_t); - size_t elemCount_v2 = 1024; - size_t fileSize_v2 = elemCount_v2 * sizeof(uint16_t); - uint16_t *v1Host = nullptr; - uint16_t *v1Device = nullptr; - uint16_t *v2Host = nullptr; - uint16_t *v2Device = nullptr; - int rc = 0; - bool aclInited = false; - bool deviceSet = false; - int deviceId = 0; - aclrtStream stream = nullptr; - - ACL_CHECK(aclInit(nullptr)); - aclInited = true; - if (const char *envDevice = std::getenv("ACL_DEVICE_ID")) - deviceId = std::atoi(envDevice); - ACL_CHECK(aclrtSetDevice(deviceId)); - deviceSet = true; - 
ACL_CHECK(aclrtCreateStream(&stream)); - ACL_CHECK(aclrtMallocHost((void **)(&v1Host), fileSize_v1)); - ACL_CHECK(aclrtMallocHost((void **)(&v2Host), fileSize_v2)); - ACL_CHECK(aclrtMalloc((void **)&v1Device, fileSize_v1, ACL_MEM_MALLOC_HUGE_FIRST)); - ACL_CHECK(aclrtMalloc((void **)&v2Device, fileSize_v2, ACL_MEM_MALLOC_HUGE_FIRST)); - ReadFile("./v1.bin", fileSize_v1, v1Host, fileSize_v1); - ReadFile("./v2.bin", fileSize_v2, v2Host, fileSize_v2); - ACL_CHECK(aclrtMemcpy(v1Device, fileSize_v1, v1Host, fileSize_v1, - ACL_MEMCPY_HOST_TO_DEVICE)); - ACL_CHECK(aclrtMemcpy(v2Device, fileSize_v2, v2Host, fileSize_v2, - ACL_MEMCPY_HOST_TO_DEVICE)); - LaunchVadds_i16_unsigned_overflow_kernel(v1Device, v2Device, stream); - ACL_CHECK(aclrtSynchronizeStream(stream)); - ACL_CHECK(aclrtMemcpy(v2Host, fileSize_v2, v2Device, fileSize_v2, - ACL_MEMCPY_DEVICE_TO_HOST)); - WriteFile("./v2.bin", v2Host, fileSize_v2); - -cleanup: - aclrtFree(v1Device); - aclrtFree(v2Device); - aclrtFreeHost(v1Host); - aclrtFreeHost(v2Host); - if (stream != nullptr) - aclrtDestroyStream(stream); - if (deviceSet) - aclrtResetDevice(deviceId); - if (aclInited) - aclFinalize(); - return rc; -} diff --git a/test/vpto/cases/micro-op/vec-scalar/vadds-i16-unsigned/compare.py b/test/vpto/cases/micro-op/vec-scalar/vadds-i16-unsigned/compare.py deleted file mode 100755 index 437f48ad7..000000000 --- a/test/vpto/cases/micro-op/vec-scalar/vadds-i16-unsigned/compare.py +++ /dev/null @@ -1,41 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) 2026 Huawei Technologies Co., Ltd. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. 
-# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. - -# case: micro-op/vec-scalar/vadds-i16-unsigned -# family: vec-scalar -# target_ops: pto.vadds -# scenarios: core-i16-unsigned, full-mask, scalar-operand - -import os -import sys -import numpy as np - - -def compare_bin(golden_path, output_path, dtype): - if not os.path.exists(golden_path) or not os.path.exists(output_path): - return False - golden = np.fromfile(golden_path, dtype=dtype) - output = np.fromfile(output_path, dtype=dtype) - return golden.shape == output.shape and np.array_equal(golden, output) - - -def main(): - strict = os.getenv("COMPARE_STRICT", "1") != "0" - ok = compare_bin("golden_v2.bin", "v2.bin", np.uint16) - if not ok: - if strict: - print("[ERROR] compare failed") - sys.exit(2) - print("[WARN] compare failed (non-gating)") - return - print("[INFO] compare passed") - - -if __name__ == "__main__": - main() diff --git a/test/vpto/cases/micro-op/vec-scalar/vadds-i16-unsigned/golden.py b/test/vpto/cases/micro-op/vec-scalar/vadds-i16-unsigned/golden.py deleted file mode 100755 index df317a729..000000000 --- a/test/vpto/cases/micro-op/vec-scalar/vadds-i16-unsigned/golden.py +++ /dev/null @@ -1,47 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) 2026 Huawei Technologies Co., Ltd. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. 
-# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. - -# case: micro-op/vec-scalar/vadds-i16-unsigned -# family: vec-scalar -# target_ops: pto.vadds -# scenarios: core-i16-unsigned, full-mask, scalar-operand - -import argparse -from pathlib import Path - -import numpy as np - - -ELEMS = 1024 -SEED = 19 -SCALAR = np.uint16(37) - - -def generate(output_dir: Path, seed: int) -> None: - rng = np.random.default_rng(seed) - v1 = rng.integers(0, 60000, size=ELEMS, dtype=np.uint16) - v2 = np.zeros(ELEMS, dtype=np.uint16) - golden_v2 = (v1.astype(np.uint32) + int(SCALAR)).astype(np.uint16) - - output_dir.mkdir(parents=True, exist_ok=True) - v1.tofile(output_dir / "v1.bin") - v2.tofile(output_dir / "v2.bin") - golden_v2.tofile(output_dir / "golden_v2.bin") - - -def main() -> None: - parser = argparse.ArgumentParser() - parser.add_argument("--output-dir", type=Path, default=Path(".")) - parser.add_argument("--seed", type=int, default=SEED) - args = parser.parse_args() - generate(args.output_dir, args.seed) - - -if __name__ == "__main__": - main() diff --git a/test/vpto/cases/micro-op/vec-scalar/vadds-i16-unsigned/kernel.pto b/test/vpto/cases/micro-op/vec-scalar/vadds-i16-unsigned/kernel.pto deleted file mode 100644 index 232b318d6..000000000 --- a/test/vpto/cases/micro-op/vec-scalar/vadds-i16-unsigned/kernel.pto +++ /dev/null @@ -1,47 +0,0 @@ -// ----------------------------------------------------------------------------- -// case: micro-op/vec-scalar/vadds-i16-unsigned -// family: vec-scalar -// target_ops: pto.vadds -// scenarios: core-i16-unsigned, full-mask, scalar-operand -// ----------------------------------------------------------------------------- -module attributes {pto.target_arch = "a5", pto.kernel_kind = 
#pto.kernel_kind} { - func.func @vadds_i16_unsigned_kernel(%arg0: !pto.ptr, - %arg1: !pto.ptr) attributes {pto.kernel} { - %c0 = arith.constant 0 : index - %c128 = arith.constant 128 : index - %c1024 = arith.constant 1024 : index - %c0_i64 = arith.constant 0 : i64 - %c1_i64 = arith.constant 1 : i64 - %c32_i64 = arith.constant 32 : i64 - %c64_i64 = arith.constant 64 : i64 - %c2048_i64 = arith.constant 2048 : i64 - %scalar = arith.constant 37 : i16 - %ub_in = pto.castptr %c0_i64 : i64 -> !pto.ptr - %ub_out = pto.castptr %c2048_i64 : i64 -> !pto.ptr - - %false = arith.constant false - pto.mte_gm_ub %arg0, %ub_in, %c0_i64, %c64_i64 - nburst(%c32_i64, %c64_i64, %c64_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 - - pto.set_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] - pto.wait_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] - - pto.vecscope { - %mask = pto.pset_b16 "PAT_ALL" : !pto.mask - scf.for %offset = %c0 to %c1024 step %c128 { - %vec = pto.vlds %ub_in[%offset] : !pto.ptr -> !pto.vreg<128xui16> - %sum = pto.vadds %vec, %scalar, %mask : !pto.vreg<128xui16>, i16, !pto.mask -> !pto.vreg<128xui16> - pto.vsts %sum, %ub_out[%offset], %mask : !pto.vreg<128xui16>, !pto.ptr, !pto.mask - } - } - - pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.mte_ub_gm %ub_out, %arg1, %c64_i64 - nburst(%c32_i64, %c64_i64, %c64_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64 - pto.barrier #pto.pipe - return - } -} diff --git a/test/vpto/cases/micro-op/vec-scalar/vadds-i16-unsigned/launch.cpp b/test/vpto/cases/micro-op/vec-scalar/vadds-i16-unsigned/launch.cpp deleted file mode 100644 index 61ff83045..000000000 --- a/test/vpto/cases/micro-op/vec-scalar/vadds-i16-unsigned/launch.cpp +++ /dev/null @@ -1,50 +0,0 @@ -// Copyright (c) 2026 Huawei Technologies Co., Ltd. 
-// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). -// Please refer to the License for details. You may not use this file except in compliance with the License. -// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -// See LICENSE in the root of the software repository for the full text of the License. - -// ----------------------------------------------------------------------------- -// case: micro-op/vec-scalar/vadds-i16-unsigned -// family: vec-scalar -// target_ops: pto.vadds -// scenarios: core-i16-unsigned, full-mask, scalar-operand -// ----------------------------------------------------------------------------- -#ifndef __VEC_SCOPE__ -#define __VEC_SCOPE__ -#endif - -#if defined(__CCE_AICORE__) && defined(__NPU_ARCH__) && (__NPU_ARCH__ == 2201) -typedef struct { unsigned char v; } hifloat8_t; -typedef struct { unsigned char v; } float8_e4m3_t; -typedef struct { unsigned char v; } float8_e5m2_t; -typedef struct { unsigned char v; } float8_e8m0_t; -typedef struct { unsigned char v; } float4_e1m2x2_t; -typedef struct { unsigned char v; } float4_e2m1x2_t; -#endif -#include -#if defined(__CCE_AICORE__) && defined(PTOAS_ENABLE_CCE_PRINT) -#include -#endif - -#if !defined(__CCE_AICORE__) && !defined(TMRGSORT_HPP) -struct MrgSortExecutedNumList { - uint16_t mrgSortList0; - uint16_t mrgSortList1; - uint16_t mrgSortList2; - uint16_t mrgSortList3; -}; -#endif -#ifndef __CPU_SIM -#include "acl/acl.h" -#endif - -extern "C" __global__ [aicore] void vadds_i16_unsigned_kernel(__gm__ uint16_t *v1, - __gm__ uint16_t *v2); - -void LaunchVadds_i16_unsigned_kernel(uint16_t *v1, uint16_t *v2, void *stream) { - vadds_i16_unsigned_kernel<<<1, nullptr, stream>>>((__gm__ uint16_t *)v1, - (__gm__ 
uint16_t *)v2); -} diff --git a/test/vpto/cases/micro-op/vec-scalar/vadds-i16-unsigned/main.cpp b/test/vpto/cases/micro-op/vec-scalar/vadds-i16-unsigned/main.cpp deleted file mode 100644 index de7c50e82..000000000 --- a/test/vpto/cases/micro-op/vec-scalar/vadds-i16-unsigned/main.cpp +++ /dev/null @@ -1,90 +0,0 @@ -// Copyright (c) 2026 Huawei Technologies Co., Ltd. -// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). -// Please refer to the License for details. You may not use this file except in compliance with the License. -// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -// See LICENSE in the root of the software repository for the full text of the License. - -// ----------------------------------------------------------------------------- -// case: micro-op/vec-scalar/vadds-i16-unsigned -// family: vec-scalar -// target_ops: pto.vadds -// scenarios: core-i16-unsigned, full-mask, scalar-operand -// ----------------------------------------------------------------------------- -/** -Copyright (c) 2025 Huawei Technologies Co., Ltd. 
-*/ - -#include "test_common.h" -#include "acl/acl.h" -#include -#include -#include - -using namespace PtoTestCommon; - -#define ACL_CHECK(expr) \ - do { \ - const aclError _ret = (expr); \ - if (_ret != ACL_SUCCESS) { \ - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", #expr, \ - (int)_ret, __FILE__, __LINE__); \ - rc = 1; \ - goto cleanup; \ - } \ - } while (0) - -void LaunchVadds_i16_unsigned_kernel(uint16_t *v1, uint16_t *v2, void *stream); - -int main() { - size_t elemCount_v1 = 1024; - size_t fileSize_v1 = elemCount_v1 * sizeof(uint16_t); - size_t elemCount_v2 = 1024; - size_t fileSize_v2 = elemCount_v2 * sizeof(uint16_t); - uint16_t *v1Host = nullptr; - uint16_t *v1Device = nullptr; - uint16_t *v2Host = nullptr; - uint16_t *v2Device = nullptr; - int rc = 0; - bool aclInited = false; - bool deviceSet = false; - int deviceId = 0; - aclrtStream stream = nullptr; - - ACL_CHECK(aclInit(nullptr)); - aclInited = true; - if (const char *envDevice = std::getenv("ACL_DEVICE_ID")) - deviceId = std::atoi(envDevice); - ACL_CHECK(aclrtSetDevice(deviceId)); - deviceSet = true; - ACL_CHECK(aclrtCreateStream(&stream)); - ACL_CHECK(aclrtMallocHost((void **)(&v1Host), fileSize_v1)); - ACL_CHECK(aclrtMallocHost((void **)(&v2Host), fileSize_v2)); - ACL_CHECK(aclrtMalloc((void **)&v1Device, fileSize_v1, ACL_MEM_MALLOC_HUGE_FIRST)); - ACL_CHECK(aclrtMalloc((void **)&v2Device, fileSize_v2, ACL_MEM_MALLOC_HUGE_FIRST)); - ReadFile("./v1.bin", fileSize_v1, v1Host, fileSize_v1); - ReadFile("./v2.bin", fileSize_v2, v2Host, fileSize_v2); - ACL_CHECK(aclrtMemcpy(v1Device, fileSize_v1, v1Host, fileSize_v1, - ACL_MEMCPY_HOST_TO_DEVICE)); - ACL_CHECK(aclrtMemcpy(v2Device, fileSize_v2, v2Host, fileSize_v2, - ACL_MEMCPY_HOST_TO_DEVICE)); - LaunchVadds_i16_unsigned_kernel(v1Device, v2Device, stream); - ACL_CHECK(aclrtSynchronizeStream(stream)); - ACL_CHECK(aclrtMemcpy(v2Host, fileSize_v2, v2Device, fileSize_v2, - ACL_MEMCPY_DEVICE_TO_HOST)); - WriteFile("./v2.bin", v2Host, fileSize_v2); 
- -cleanup: - aclrtFree(v1Device); - aclrtFree(v2Device); - aclrtFreeHost(v1Host); - aclrtFreeHost(v2Host); - if (stream != nullptr) - aclrtDestroyStream(stream); - if (deviceSet) - aclrtResetDevice(deviceId); - if (aclInited) - aclFinalize(); - return rc; -} diff --git a/test/vpto/cases/micro-op/vec-scalar/vadds-tail/compare.py b/test/vpto/cases/micro-op/vec-scalar/vadds-tail/compare.py deleted file mode 100644 index c13d79273..000000000 --- a/test/vpto/cases/micro-op/vec-scalar/vadds-tail/compare.py +++ /dev/null @@ -1,39 +0,0 @@ -#!/usr/bin/python3 -# Copyright (c) 2026 Huawei Technologies Co., Ltd. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. 
- - -import os -import sys -import numpy as np - - -def compare_bin_prefix(golden_path, output_path, dtype, eps, count): - if not os.path.exists(golden_path) or not os.path.exists(output_path): - return False - golden = np.fromfile(golden_path, dtype=dtype, count=count) - output = np.fromfile(output_path, dtype=dtype, count=count) - return golden.shape == output.shape and np.allclose( - golden, output, atol=eps, rtol=eps, equal_nan=True - ) - - -def main(): - strict = os.getenv("COMPARE_STRICT", "1") != "0" - ok = compare_bin_prefix("golden_v2.bin", "v2.bin", np.float32, 1e-4, 1000) - if not ok: - if strict: - print("[ERROR] compare failed") - sys.exit(2) - print("[WARN] compare failed (non-gating)") - return - print("[INFO] compare passed") - - -if __name__ == "__main__": - main() diff --git a/test/vpto/cases/micro-op/vec-scalar/vadds-tail/golden.py b/test/vpto/cases/micro-op/vec-scalar/vadds-tail/golden.py deleted file mode 100644 index 2f06c22fa..000000000 --- a/test/vpto/cases/micro-op/vec-scalar/vadds-tail/golden.py +++ /dev/null @@ -1,50 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) 2026 Huawei Technologies Co., Ltd. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. 
- -# coding=utf-8 - -import argparse -from pathlib import Path - -import numpy as np - - -ROWS = 32 -COLS = 32 -SEED = 19 -SCALE = np.float32(3.14) -LOGICAL_ELEMS = 1000 -OUT_SENTINEL = np.float32(-123.25) - - -def generate(output_dir: Path, seed: int) -> None: - rng = np.random.default_rng(seed) - v1 = rng.random((ROWS, COLS), dtype=np.float32) - v2 = np.full((ROWS, COLS), OUT_SENTINEL, dtype=np.float32) - golden_v2 = np.full((ROWS, COLS), OUT_SENTINEL, dtype=np.float32) - golden_v2.reshape(-1)[:LOGICAL_ELEMS] = ( - v1.reshape(-1)[:LOGICAL_ELEMS] + SCALE - ).astype(np.float32, copy=False) - - output_dir.mkdir(parents=True, exist_ok=True) - v1.reshape(-1).tofile(output_dir / "v1.bin") - v2.reshape(-1).tofile(output_dir / "v2.bin") - golden_v2.reshape(-1).tofile(output_dir / "golden_v2.bin") - - -def main() -> None: - parser = argparse.ArgumentParser() - parser.add_argument("--output-dir", type=Path, default=Path(".")) - parser.add_argument("--seed", type=int, default=SEED) - args = parser.parse_args() - generate(args.output_dir, args.seed) - - -if __name__ == "__main__": - main() diff --git a/test/vpto/cases/micro-op/vec-scalar/vadds-tail/kernel.pto b/test/vpto/cases/micro-op/vec-scalar/vadds-tail/kernel.pto deleted file mode 100644 index c6784e700..000000000 --- a/test/vpto/cases/micro-op/vec-scalar/vadds-tail/kernel.pto +++ /dev/null @@ -1,44 +0,0 @@ -module attributes {pto.target_arch = "a5", pto.kernel_kind = #pto.kernel_kind} { - func.func @vadds_tail_kernel_2d(%arg0: !pto.ptr, - %arg1: !pto.ptr) attributes {pto.kernel} { - %c0 = arith.constant 0 : index - %c64 = arith.constant 64 : index - %c1024 = arith.constant 1024 : index - %c0_i64 = arith.constant 0 : i64 - %c1_i64 = arith.constant 1 : i64 - %c32_i64 = arith.constant 32 : i64 - %c128_i64 = arith.constant 128 : i64 - %c4096_i64 = arith.constant 4096 : i64 - %c1000_i32 = arith.constant 1000 : i32 - %cst = arith.constant 3.140000e+00 : f32 - - %ub_in = pto.castptr %c0_i64 : i64 -> !pto.ptr - %ub_out = 
pto.castptr %c4096_i64 : i64 -> !pto.ptr - - %false = arith.constant false - pto.mte_gm_ub %arg0, %ub_in, %c0_i64, %c128_i64 - nburst(%c32_i64, %c128_i64, %c128_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 - - pto.set_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] - pto.wait_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] - - pto.vecscope { - %_:1 = scf.for %offset = %c0 to %c1024 step %c64 iter_args(%remaining = %c1000_i32) -> (i32) { - %mask, %next_remaining = pto.plt_b32 %remaining : i32 -> !pto.mask, i32 - %vec = pto.vlds %ub_in[%offset] : !pto.ptr -> !pto.vreg<64xf32> - %sum = pto.vadds %vec, %cst, %mask : !pto.vreg<64xf32>, f32, !pto.mask -> !pto.vreg<64xf32> - pto.vsts %sum, %ub_out[%offset], %mask : !pto.vreg<64xf32>, !pto.ptr, !pto.mask - scf.yield %next_remaining : i32 - } - } - - pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.mte_ub_gm %ub_out, %arg1, %c128_i64 - nburst(%c32_i64, %c128_i64, %c128_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64 - pto.barrier #pto.pipe - return - } -} diff --git a/test/vpto/cases/micro-op/vec-scalar/vadds-tail/launch.cpp b/test/vpto/cases/micro-op/vec-scalar/vadds-tail/launch.cpp deleted file mode 100644 index b4cd46470..000000000 --- a/test/vpto/cases/micro-op/vec-scalar/vadds-tail/launch.cpp +++ /dev/null @@ -1,44 +0,0 @@ -// Copyright (c) 2026 Huawei Technologies Co., Ltd. -// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). -// Please refer to the License for details. You may not use this file except in compliance with the License. -// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -// See LICENSE in the root of the software repository for the full text of the License. 
- -#ifndef __VEC_SCOPE__ -#define __VEC_SCOPE__ -#endif - -#if defined(__CCE_AICORE__) && defined(__NPU_ARCH__) && (__NPU_ARCH__ == 2201) -typedef struct { unsigned char v; } hifloat8_t; -typedef struct { unsigned char v; } float8_e4m3_t; -typedef struct { unsigned char v; } float8_e5m2_t; -typedef struct { unsigned char v; } float8_e8m0_t; -typedef struct { unsigned char v; } float4_e1m2x2_t; -typedef struct { unsigned char v; } float4_e2m1x2_t; -#endif -#include -#if defined(__CCE_AICORE__) && defined(PTOAS_ENABLE_CCE_PRINT) -#include -#endif - -#if !defined(__CCE_AICORE__) && !defined(TMRGSORT_HPP) -struct MrgSortExecutedNumList { - uint16_t mrgSortList0; - uint16_t mrgSortList1; - uint16_t mrgSortList2; - uint16_t mrgSortList3; -}; -#endif -#ifndef __CPU_SIM -#include "acl/acl.h" -#endif - -extern "C" __global__ [aicore] void vadds_tail_kernel_2d(__gm__ float *v1, - __gm__ float *v2); - -void LaunchVadds_tail_kernel_2d(float *v1, float *v2, void *stream) { - vadds_tail_kernel_2d<<<1, nullptr, stream>>>((__gm__ float *)v1, - (__gm__ float *)v2); -} diff --git a/test/vpto/cases/micro-op/vec-scalar/vadds-tail/main.cpp b/test/vpto/cases/micro-op/vec-scalar/vadds-tail/main.cpp deleted file mode 100644 index ab77e6b1a..000000000 --- a/test/vpto/cases/micro-op/vec-scalar/vadds-tail/main.cpp +++ /dev/null @@ -1,83 +0,0 @@ -// Copyright (c) 2026 Huawei Technologies Co., Ltd. -// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). -// Please refer to the License for details. You may not use this file except in compliance with the License. -// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -// See LICENSE in the root of the software repository for the full text of the License. 
- -/** -Copyright (c) 2025 Huawei Technologies Co., Ltd. -*/ - -#include "test_common.h" -#include "acl/acl.h" -#include -#include - -using namespace PtoTestCommon; - -#define ACL_CHECK(expr) \ - do { \ - const aclError _ret = (expr); \ - if (_ret != ACL_SUCCESS) { \ - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", #expr, \ - (int)_ret, __FILE__, __LINE__); \ - rc = 1; \ - goto cleanup; \ - } \ - } while (0) - -void LaunchVadds_tail_kernel_2d(float *v1, float *v2, void *stream); - -int main() { - size_t elemCount_v1 = 1024; - size_t fileSize_v1 = elemCount_v1 * sizeof(float); - size_t elemCount_v2 = 1024; - size_t fileSize_v2 = elemCount_v2 * sizeof(float); - float *v1Host = nullptr; - float *v1Device = nullptr; - float *v2Host = nullptr; - float *v2Device = nullptr; - int rc = 0; - bool aclInited = false; - bool deviceSet = false; - int deviceId = 0; - aclrtStream stream = nullptr; - - ACL_CHECK(aclInit(nullptr)); - aclInited = true; - if (const char *envDevice = std::getenv("ACL_DEVICE_ID")) - deviceId = std::atoi(envDevice); - ACL_CHECK(aclrtSetDevice(deviceId)); - deviceSet = true; - ACL_CHECK(aclrtCreateStream(&stream)); - ACL_CHECK(aclrtMallocHost((void **)(&v1Host), fileSize_v1)); - ACL_CHECK(aclrtMallocHost((void **)(&v2Host), fileSize_v2)); - ACL_CHECK(aclrtMalloc((void **)&v1Device, fileSize_v1, ACL_MEM_MALLOC_HUGE_FIRST)); - ACL_CHECK(aclrtMalloc((void **)&v2Device, fileSize_v2, ACL_MEM_MALLOC_HUGE_FIRST)); - ReadFile("./v1.bin", fileSize_v1, v1Host, fileSize_v1); - ReadFile("./v2.bin", fileSize_v2, v2Host, fileSize_v2); - ACL_CHECK(aclrtMemcpy(v1Device, fileSize_v1, v1Host, fileSize_v1, - ACL_MEMCPY_HOST_TO_DEVICE)); - ACL_CHECK(aclrtMemcpy(v2Device, fileSize_v2, v2Host, fileSize_v2, - ACL_MEMCPY_HOST_TO_DEVICE)); - LaunchVadds_tail_kernel_2d(v1Device, v2Device, stream); - ACL_CHECK(aclrtSynchronizeStream(stream)); - ACL_CHECK(aclrtMemcpy(v2Host, fileSize_v2, v2Device, fileSize_v2, - ACL_MEMCPY_DEVICE_TO_HOST)); - WriteFile("./v2.bin", v2Host, 
fileSize_v2); - -cleanup: - aclrtFree(v1Device); - aclrtFree(v2Device); - aclrtFreeHost(v1Host); - aclrtFreeHost(v2Host); - if (stream != nullptr) - aclrtDestroyStream(stream); - if (deviceSet) - aclrtResetDevice(deviceId); - if (aclInited) - aclFinalize(); - return rc; -} diff --git a/test/vpto/cases/micro-op/vec-scalar/vadds/kernel.pto b/test/vpto/cases/micro-op/vec-scalar/vadds/kernel.pto index 672c20e84..f36635aac 100644 --- a/test/vpto/cases/micro-op/vec-scalar/vadds/kernel.pto +++ b/test/vpto/cases/micro-op/vec-scalar/vadds/kernel.pto @@ -1,45 +1,379 @@ module attributes {pto.target_arch = "a5", pto.kernel_kind = #pto.kernel_kind} { - func.func @vec_add_scalar_kernel_2d(%arg0: !pto.ptr, - %arg1: !pto.ptr) attributes {pto.kernel} { - %c0 = arith.constant 0 : index - %c1 = arith.constant 1 : index - %c64 = arith.constant 64 : index - %c1024 = arith.constant 1024 : index - %c0_i64 = arith.constant 0 : i64 - %c1_i64 = arith.constant 1 : i64 - %c32_i64 = arith.constant 32 : i64 - %c128_i64 = arith.constant 128 : i64 - %c4096_i64 = arith.constant 4096 : i64 - %c1024_i32 = arith.constant 1024 : i32 - %cst = arith.constant 3.140000e+00 : f32 - - %ub_in = pto.castptr %c0_i64 : i64 -> !pto.ptr - %ub_out = pto.castptr %c4096_i64 : i64 -> !pto.ptr - - %false = arith.constant false - pto.mte_gm_ub %arg0, %ub_in, %c0_i64, %c128_i64 - nburst(%c32_i64, %c128_i64, %c128_i64) + func.func @vadds_deep_merged_kernel(%arg0: !pto.ptr, %arg1: !pto.ptr, %arg2: !pto.ptr, %arg3: !pto.ptr, %arg4: !pto.ptr, %arg5: !pto.ptr, %arg6: !pto.ptr, %arg7: !pto.ptr, %arg8: !pto.ptr, %arg9: !pto.ptr, %arg10: !pto.ptr, %arg11: !pto.ptr, %arg12: !pto.ptr, %arg13: !pto.ptr, %arg14: !pto.ptr, %arg15: !pto.ptr, %arg16: !pto.ptr, %arg17: !pto.ptr) attributes {pto.kernel} { + %__deep_merge_guard = arith.constant false + // inactive merged from vadds_bf16_kernel + scf.if %__deep_merge_guard { + + %c0_m0 = arith.constant 0 : index + %c128_m0 = arith.constant 128 : index + %c1024_m0 = arith.constant 1024 : 
index + %c0_i64_m0 = arith.constant 0 : i64 + %c1_i64_m0 = arith.constant 1 : i64 + %c32_i64_m0 = arith.constant 32 : i64 + %c64_i64_m0 = arith.constant 64 : i64 + %c2048_i64_m0 = arith.constant 2048 : i64 + %cst_m0 = arith.constant 1.500000e+00 : bf16 + + %ub_in_m0 = pto.castptr %c0_i64_m0 : i64 -> !pto.ptr + %ub_out_m0 = pto.castptr %c2048_i64_m0 : i64 -> !pto.ptr + + %false_m0 = arith.constant false + pto.mte_gm_ub %arg0, %ub_in_m0, %c0_i64_m0, %c64_i64_m0 + nburst(%c32_i64_m0, %c64_i64_m0, %c64_i64_m0) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 + + pto.set_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] + pto.wait_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] + + pto.vecscope { + %mask_m0 = pto.pset_b16 "PAT_ALL" : !pto.mask + scf.for %offset_m0 = %c0_m0 to %c1024_m0 step %c128_m0 { + %vec_m0 = pto.vlds %ub_in_m0[%offset_m0] : !pto.ptr -> !pto.vreg<128xbf16> + %sum_m0 = pto.vadds %vec_m0, %cst_m0, %mask_m0 : !pto.vreg<128xbf16>, bf16, !pto.mask -> !pto.vreg<128xbf16> + pto.vsts %sum_m0, %ub_out_m0[%offset_m0], %mask_m0 : !pto.vreg<128xbf16>, !pto.ptr, !pto.mask + } + } + + pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] + pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] + pto.mte_ub_gm %ub_out_m0, %arg1, %c64_i64_m0 + nburst(%c32_i64_m0, %c64_i64_m0, %c64_i64_m0) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64 + pto.barrier #pto.pipe + + } + // inactive merged from vadds_f16_kernel + scf.if %__deep_merge_guard { + + %c0_m1 = arith.constant 0 : index + %c128_m1 = arith.constant 128 : index + %c1024_m1 = arith.constant 1024 : index + %c0_i64_m1 = arith.constant 0 : i64 + %c1_i64_m1 = arith.constant 1 : i64 + %c32_i64_m1 = arith.constant 32 : i64 + %c64_i64_m1 = arith.constant 64 : i64 + %c2048_i64_m1 = arith.constant 2048 : i64 + %cst_m1 = arith.constant 1.500000e+00 : f16 + + %ub_in_m1 = pto.castptr %c0_i64_m1 : i64 -> !pto.ptr + %ub_out_m1 = pto.castptr %c2048_i64_m1 : i64 -> !pto.ptr + + %false_m1 = arith.constant false + pto.mte_gm_ub %arg2, %ub_in_m1, %c0_i64_m1, 
%c64_i64_m1 + nburst(%c32_i64_m1, %c64_i64_m1, %c64_i64_m1) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 + + pto.set_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] + pto.wait_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] + + pto.vecscope { + %mask_m1 = pto.pset_b16 "PAT_ALL" : !pto.mask + scf.for %offset_m1 = %c0_m1 to %c1024_m1 step %c128_m1 { + %vec_m1 = pto.vlds %ub_in_m1[%offset_m1] : !pto.ptr -> !pto.vreg<128xf16> + %sum_m1 = pto.vadds %vec_m1, %cst_m1, %mask_m1 : !pto.vreg<128xf16>, f16, !pto.mask -> !pto.vreg<128xf16> + pto.vsts %sum_m1, %ub_out_m1[%offset_m1], %mask_m1 : !pto.vreg<128xf16>, !pto.ptr, !pto.mask + } + } + + pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] + pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] + pto.mte_ub_gm %ub_out_m1, %arg3, %c64_i64_m1 + nburst(%c32_i64_m1, %c64_i64_m1, %c64_i64_m1) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64 + pto.barrier #pto.pipe + + } + // inactive merged from vadds_f32_exceptional_kernel_2d + scf.if %__deep_merge_guard { + + %c0_m2 = arith.constant 0 : index + %c64_m2 = arith.constant 64 : index + %c1024_m2 = arith.constant 1024 : index + %c0_i64_m2 = arith.constant 0 : i64 + %c1_i64_m2 = arith.constant 1 : i64 + %c32_i64_m2 = arith.constant 32 : i64 + %c128_i64_m2 = arith.constant 128 : i64 + %c4096_i64_m2 = arith.constant 4096 : i64 + %c1024_i32_m2 = arith.constant 1024 : i32 + %cst_m2 = arith.constant 5.000000e-01 : f32 + + %ub_in_m2 = pto.castptr %c0_i64_m2 : i64 -> !pto.ptr + %ub_out_m2 = pto.castptr %c4096_i64_m2 : i64 -> !pto.ptr + + %false_m2 = arith.constant false + pto.mte_gm_ub %arg4, %ub_in_m2, %c0_i64_m2, %c128_i64_m2 + nburst(%c32_i64_m2, %c128_i64_m2, %c128_i64_m2) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 + + pto.set_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] + pto.wait_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] + + pto.vecscope { + %__m2:1 = scf.for %offset_m2 = %c0_m2 to %c1024_m2 step %c64_m2 iter_args(%remaining_m2 = %c1024_i32_m2) -> (i32) { + %mask_m2, %next_remaining_m2 = pto.plt_b32 
%remaining_m2 : i32 -> !pto.mask, i32 + %vec_m2 = pto.vlds %ub_in_m2[%offset_m2] : !pto.ptr -> !pto.vreg<64xf32> + %sum_m2 = pto.vadds %vec_m2, %cst_m2, %mask_m2 : !pto.vreg<64xf32>, f32, !pto.mask -> !pto.vreg<64xf32> + pto.vsts %sum_m2, %ub_out_m2[%offset_m2], %mask_m2 : !pto.vreg<64xf32>, !pto.ptr, !pto.mask + scf.yield %next_remaining_m2 : i32 + } + } + + pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] + pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] + pto.mte_ub_gm %ub_out_m2, %arg5, %c128_i64_m2 + nburst(%c32_i64_m2, %c128_i64_m2, %c128_i64_m2) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64 + pto.barrier #pto.pipe + + } + // inactive merged from vadds_i16_signed_kernel + scf.if %__deep_merge_guard { + + %c0_m3 = arith.constant 0 : index + %c128_m3 = arith.constant 128 : index + %c1024_m3 = arith.constant 1024 : index + %c0_i64_m3 = arith.constant 0 : i64 + %c1_i64_m3 = arith.constant 1 : i64 + %c32_i64_m3 = arith.constant 32 : i64 + %c64_i64_m3 = arith.constant 64 : i64 + %c2048_i64_m3 = arith.constant 2048 : i64 + %c1024_i32_m3 = arith.constant 1024 : i32 + %scalar_m3 = arith.constant 37 : i16 + + %ub_in_m3 = pto.castptr %c0_i64_m3 : i64 -> !pto.ptr + %ub_out_m3 = pto.castptr %c2048_i64_m3 : i64 -> !pto.ptr + + %false_m3 = arith.constant false + pto.mte_gm_ub %arg6, %ub_in_m3, %c0_i64_m3, %c64_i64_m3 + nburst(%c32_i64_m3, %c64_i64_m3, %c64_i64_m3) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 + + pto.set_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] + pto.wait_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] + + pto.vecscope { + %mask_m3 = pto.pset_b16 "PAT_ALL" : !pto.mask + scf.for %offset_m3 = %c0_m3 to %c1024_m3 step %c128_m3 { + %vec_m3 = pto.vlds %ub_in_m3[%offset_m3] : !pto.ptr -> !pto.vreg<128xi16> + %sum_m3 = pto.vadds %vec_m3, %scalar_m3, %mask_m3 : !pto.vreg<128xi16>, i16, !pto.mask -> !pto.vreg<128xi16> + pto.vsts %sum_m3, %ub_out_m3[%offset_m3], %mask_m3 : !pto.vreg<128xi16>, !pto.ptr, !pto.mask + } + } + + pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] 
+ pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] + pto.mte_ub_gm %ub_out_m3, %arg7, %c64_i64_m3 + nburst(%c32_i64_m3, %c64_i64_m3, %c64_i64_m3) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64 + pto.barrier #pto.pipe + + } + // inactive merged from vadds_i16_signed_overflow_kernel + scf.if %__deep_merge_guard { + + %c0_m4 = arith.constant 0 : index + %c128_m4 = arith.constant 128 : index + %c1024_m4 = arith.constant 1024 : index + %c0_i64_m4 = arith.constant 0 : i64 + %c1_i64_m4 = arith.constant 1 : i64 + %c32_i64_m4 = arith.constant 32 : i64 + %c64_i64_m4 = arith.constant 64 : i64 + %c2048_i64_m4 = arith.constant 2048 : i64 + %scalar_m4 = arith.constant 1024 : i16 + + %ub_in_m4 = pto.castptr %c0_i64_m4 : i64 -> !pto.ptr + %ub_out_m4 = pto.castptr %c2048_i64_m4 : i64 -> !pto.ptr + + %false_m4 = arith.constant false + pto.mte_gm_ub %arg8, %ub_in_m4, %c0_i64_m4, %c64_i64_m4 + nburst(%c32_i64_m4, %c64_i64_m4, %c64_i64_m4) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 + + pto.set_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] + pto.wait_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] + + pto.vecscope { + %mask_m4 = pto.pset_b16 "PAT_ALL" : !pto.mask + scf.for %offset_m4 = %c0_m4 to %c1024_m4 step %c128_m4 { + %vec_m4 = pto.vlds %ub_in_m4[%offset_m4] : !pto.ptr -> !pto.vreg<128xi16> + %sum_m4 = pto.vadds %vec_m4, %scalar_m4, %mask_m4 : !pto.vreg<128xi16>, i16, !pto.mask -> !pto.vreg<128xi16> + pto.vsts %sum_m4, %ub_out_m4[%offset_m4], %mask_m4 : !pto.vreg<128xi16>, !pto.ptr, !pto.mask + } + } + + pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] + pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] + pto.mte_ub_gm %ub_out_m4, %arg9, %c64_i64_m4 + nburst(%c32_i64_m4, %c64_i64_m4, %c64_i64_m4) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64 + pto.barrier #pto.pipe + + } + // inactive merged from vadds_i16_unsigned_kernel + scf.if %__deep_merge_guard { + + %c0_m5 = arith.constant 0 : index + %c128_m5 = arith.constant 128 : index + %c1024_m5 = arith.constant 1024 : index + %c0_i64_m5 = 
arith.constant 0 : i64 + %c1_i64_m5 = arith.constant 1 : i64 + %c32_i64_m5 = arith.constant 32 : i64 + %c64_i64_m5 = arith.constant 64 : i64 + %c2048_i64_m5 = arith.constant 2048 : i64 + %scalar_m5 = arith.constant 37 : i16 + %ub_in_m5 = pto.castptr %c0_i64_m5 : i64 -> !pto.ptr + %ub_out_m5 = pto.castptr %c2048_i64_m5 : i64 -> !pto.ptr + + %false_m5 = arith.constant false + pto.mte_gm_ub %arg10, %ub_in_m5, %c0_i64_m5, %c64_i64_m5 + nburst(%c32_i64_m5, %c64_i64_m5, %c64_i64_m5) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 + + pto.set_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] + pto.wait_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] + + pto.vecscope { + %mask_m5 = pto.pset_b16 "PAT_ALL" : !pto.mask + scf.for %offset_m5 = %c0_m5 to %c1024_m5 step %c128_m5 { + %vec_m5 = pto.vlds %ub_in_m5[%offset_m5] : !pto.ptr -> !pto.vreg<128xui16> + %sum_m5 = pto.vadds %vec_m5, %scalar_m5, %mask_m5 : !pto.vreg<128xui16>, i16, !pto.mask -> !pto.vreg<128xui16> + pto.vsts %sum_m5, %ub_out_m5[%offset_m5], %mask_m5 : !pto.vreg<128xui16>, !pto.ptr, !pto.mask + } + } + + pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] + pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] + pto.mte_ub_gm %ub_out_m5, %arg11, %c64_i64_m5 + nburst(%c32_i64_m5, %c64_i64_m5, %c64_i64_m5) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64 + pto.barrier #pto.pipe + + } + // inactive merged from vadds_i16_unsigned_overflow_kernel + scf.if %__deep_merge_guard { + + %c0_m6 = arith.constant 0 : index + %c128_m6 = arith.constant 128 : index + %c1024_m6 = arith.constant 1024 : index + %c0_i64_m6 = arith.constant 0 : i64 + %c1_i64_m6 = arith.constant 1 : i64 + %c32_i64_m6 = arith.constant 32 : i64 + %c64_i64_m6 = arith.constant 64 : i64 + %c2048_i64_m6 = arith.constant 2048 : i64 + %scalar_m6 = arith.constant 4096 : i16 + %ub_in_m6 = pto.castptr %c0_i64_m6 : i64 -> !pto.ptr + %ub_out_m6 = pto.castptr %c2048_i64_m6 : i64 -> !pto.ptr + + %false_m6 = arith.constant false + pto.mte_gm_ub %arg12, %ub_in_m6, %c0_i64_m6, %c64_i64_m6 + 
nburst(%c32_i64_m6, %c64_i64_m6, %c64_i64_m6) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 + + pto.set_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] + pto.wait_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] + + pto.vecscope { + %mask_m6 = pto.pset_b16 "PAT_ALL" : !pto.mask + scf.for %offset_m6 = %c0_m6 to %c1024_m6 step %c128_m6 { + %vec_m6 = pto.vlds %ub_in_m6[%offset_m6] : !pto.ptr -> !pto.vreg<128xui16> + %sum_m6 = pto.vadds %vec_m6, %scalar_m6, %mask_m6 : !pto.vreg<128xui16>, i16, !pto.mask -> !pto.vreg<128xui16> + pto.vsts %sum_m6, %ub_out_m6[%offset_m6], %mask_m6 : !pto.vreg<128xui16>, !pto.ptr, !pto.mask + } + } + + pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] + pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] + pto.mte_ub_gm %ub_out_m6, %arg13, %c64_i64_m6 + nburst(%c32_i64_m6, %c64_i64_m6, %c64_i64_m6) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64 + pto.barrier #pto.pipe + + } + // inactive merged from vadds_tail_kernel_2d + scf.if %__deep_merge_guard { + + %c0_m7 = arith.constant 0 : index + %c64_m7 = arith.constant 64 : index + %c1024_m7 = arith.constant 1024 : index + %c0_i64_m7 = arith.constant 0 : i64 + %c1_i64_m7 = arith.constant 1 : i64 + %c32_i64_m7 = arith.constant 32 : i64 + %c128_i64_m7 = arith.constant 128 : i64 + %c4096_i64_m7 = arith.constant 4096 : i64 + %c1000_i32_m7 = arith.constant 1000 : i32 + %cst_m7 = arith.constant 3.140000e+00 : f32 + + %ub_in_m7 = pto.castptr %c0_i64_m7 : i64 -> !pto.ptr + %ub_out_m7 = pto.castptr %c4096_i64_m7 : i64 -> !pto.ptr + + %false_m7 = arith.constant false + pto.mte_gm_ub %arg14, %ub_in_m7, %c0_i64_m7, %c128_i64_m7 + nburst(%c32_i64_m7, %c128_i64_m7, %c128_i64_m7) : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 pto.set_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] pto.wait_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] pto.vecscope { - %_:1 = scf.for %offset = %c0 to %c1024 step %c64 iter_args(%remaining = %c1024_i32) -> (i32) { - %mask, %next_remaining = pto.plt_b32 %remaining : i32 -> !pto.mask, i32 - %vec = pto.vlds 
%ub_in[%offset] : !pto.ptr -> !pto.vreg<64xf32> - %sum = pto.vadds %vec, %cst, %mask : !pto.vreg<64xf32>, f32, !pto.mask -> !pto.vreg<64xf32> - pto.vsts %sum, %ub_out[%offset], %mask : !pto.vreg<64xf32>, !pto.ptr, !pto.mask - scf.yield %next_remaining : i32 + %__m7:1 = scf.for %offset_m7 = %c0_m7 to %c1024_m7 step %c64_m7 iter_args(%remaining_m7 = %c1000_i32_m7) -> (i32) { + %mask_m7, %next_remaining_m7 = pto.plt_b32 %remaining_m7 : i32 -> !pto.mask, i32 + %vec_m7 = pto.vlds %ub_in_m7[%offset_m7] : !pto.ptr -> !pto.vreg<64xf32> + %sum_m7 = pto.vadds %vec_m7, %cst_m7, %mask_m7 : !pto.vreg<64xf32>, f32, !pto.mask -> !pto.vreg<64xf32> + pto.vsts %sum_m7, %ub_out_m7[%offset_m7], %mask_m7 : !pto.vreg<64xf32>, !pto.ptr, !pto.mask + scf.yield %next_remaining_m7 : i32 } } pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.mte_ub_gm %ub_out, %arg1, %c128_i64 - nburst(%c32_i64, %c128_i64, %c128_i64) + pto.mte_ub_gm %ub_out_m7, %arg15, %c128_i64_m7 + nburst(%c32_i64_m7, %c128_i64_m7, %c128_i64_m7) : !pto.ptr, !pto.ptr, i64, i64, i64, i64 pto.barrier #pto.pipe + + } + // active merged from vec_add_scalar_kernel_2d + + %c0_m8 = arith.constant 0 : index + %c1_m8 = arith.constant 1 : index + %c64_m8 = arith.constant 64 : index + %c1024_m8 = arith.constant 1024 : index + %c0_i64_m8 = arith.constant 0 : i64 + %c1_i64_m8 = arith.constant 1 : i64 + %c32_i64_m8 = arith.constant 32 : i64 + %c128_i64_m8 = arith.constant 128 : i64 + %c4096_i64_m8 = arith.constant 4096 : i64 + %c1024_i32_m8 = arith.constant 1024 : i32 + %cst_m8 = arith.constant 3.140000e+00 : f32 + + %ub_in_m8 = pto.castptr %c0_i64_m8 : i64 -> !pto.ptr + %ub_out_m8 = pto.castptr %c4096_i64_m8 : i64 -> !pto.ptr + + %false_m8 = arith.constant false + pto.mte_gm_ub %arg16, %ub_in_m8, %c0_i64_m8, %c128_i64_m8 + nburst(%c32_i64_m8, %c128_i64_m8, %c128_i64_m8) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 + + pto.set_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] + 
pto.wait_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] + + pto.vecscope { + %__m8:1 = scf.for %offset_m8 = %c0_m8 to %c1024_m8 step %c64_m8 iter_args(%remaining_m8 = %c1024_i32_m8) -> (i32) { + %mask_m8, %next_remaining_m8 = pto.plt_b32 %remaining_m8 : i32 -> !pto.mask, i32 + %vec_m8 = pto.vlds %ub_in_m8[%offset_m8] : !pto.ptr -> !pto.vreg<64xf32> + %sum_m8 = pto.vadds %vec_m8, %cst_m8, %mask_m8 : !pto.vreg<64xf32>, f32, !pto.mask -> !pto.vreg<64xf32> + pto.vsts %sum_m8, %ub_out_m8[%offset_m8], %mask_m8 : !pto.vreg<64xf32>, !pto.ptr, !pto.mask + scf.yield %next_remaining_m8 : i32 + } + } + + pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] + pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] + pto.mte_ub_gm %ub_out_m8, %arg17, %c128_i64_m8 + nburst(%c32_i64_m8, %c128_i64_m8, %c128_i64_m8) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64 + pto.barrier #pto.pipe + return } } diff --git a/test/vpto/cases/micro-op/vec-scalar/vadds/launch.cpp b/test/vpto/cases/micro-op/vec-scalar/vadds/launch.cpp index 44c07c249..4bd3e9a97 100644 --- a/test/vpto/cases/micro-op/vec-scalar/vadds/launch.cpp +++ b/test/vpto/cases/micro-op/vec-scalar/vadds/launch.cpp @@ -5,11 +5,9 @@ // THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, // INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. // See LICENSE in the root of the software repository for the full text of the License. 
- #ifndef __VEC_SCOPE__ #define __VEC_SCOPE__ #endif - #if defined(__CCE_AICORE__) && defined(__NPU_ARCH__) && (__NPU_ARCH__ == 2201) typedef struct { unsigned char v; } hifloat8_t; typedef struct { unsigned char v; } float8_e4m3_t; @@ -22,23 +20,51 @@ typedef struct { unsigned char v; } float4_e2m1x2_t; #if defined(__CCE_AICORE__) && defined(PTOAS_ENABLE_CCE_PRINT) #include #endif - #if !defined(__CCE_AICORE__) && !defined(TMRGSORT_HPP) -struct MrgSortExecutedNumList { - uint16_t mrgSortList0; - uint16_t mrgSortList1; - uint16_t mrgSortList2; - uint16_t mrgSortList3; -}; +struct MrgSortExecutedNumList { uint16_t mrgSortList0,mrgSortList1,mrgSortList2,mrgSortList3; }; #endif #ifndef __CPU_SIM #include "acl/acl.h" #endif -extern "C" __global__ [aicore] void vec_add_scalar_kernel_2d(__gm__ float *v1, - __gm__ float *v2); +extern "C" __global__ [aicore] void vadds_deep_merged_kernel( + __gm__ bfloat16_t * arg0, + __gm__ bfloat16_t * arg1, + __gm__ half * arg2, + __gm__ half * arg3, + __gm__ float * arg4, + __gm__ float * arg5, + __gm__ int16_t * arg6, + __gm__ int16_t * arg7, + __gm__ int16_t * arg8, + __gm__ int16_t * arg9, + __gm__ uint16_t * arg10, + __gm__ uint16_t * arg11, + __gm__ uint16_t * arg12, + __gm__ uint16_t * arg13, + __gm__ float * arg14, + __gm__ float * arg15, + __gm__ float * arg16, + __gm__ float * arg17); -void LaunchVec_add_scalar_kernel_2d(float *v1, float *v2, void *stream) { - vec_add_scalar_kernel_2d<<<1, nullptr, stream>>>((__gm__ float *)v1, - (__gm__ float *)v2); +void LaunchVaddsDeepMerged(float * p0, float * p1, void *stream) { + vadds_deep_merged_kernel<<<1, nullptr, stream>>>( + (__gm__ bfloat16_t *)p0, + (__gm__ bfloat16_t *)p0, + (__gm__ half *)p0, + (__gm__ half *)p0, + (__gm__ float *)p0, + (__gm__ float *)p0, + (__gm__ int16_t *)p0, + (__gm__ int16_t *)p0, + (__gm__ int16_t *)p0, + (__gm__ int16_t *)p0, + (__gm__ uint16_t *)p0, + (__gm__ uint16_t *)p0, + (__gm__ uint16_t *)p0, + (__gm__ uint16_t *)p0, + (__gm__ float *)p0, + 
(__gm__ float *)p0, + (__gm__ float *)p0, + (__gm__ float *)p1); } diff --git a/test/vpto/cases/micro-op/vec-scalar/vadds/main.cpp b/test/vpto/cases/micro-op/vec-scalar/vadds/main.cpp index fcb42331f..93f4c6336 100644 --- a/test/vpto/cases/micro-op/vec-scalar/vadds/main.cpp +++ b/test/vpto/cases/micro-op/vec-scalar/vadds/main.cpp @@ -28,8 +28,8 @@ using namespace PtoTestCommon; } \ } while (0) -void LaunchVec_add_scalar_kernel_2d(float *v1, float *v2, void *stream); +void LaunchVaddsDeepMerged(float * p0, float * p1, void *stream); int main() { size_t elemCount_v1 = 1024; size_t fileSize_v1 = elemCount_v1 * sizeof(float); @@ -62,7 +62,7 @@ int main() { ACL_MEMCPY_HOST_TO_DEVICE)); ACL_CHECK(aclrtMemcpy(v2Device, fileSize_v2, v2Host, fileSize_v2, ACL_MEMCPY_HOST_TO_DEVICE)); - LaunchVec_add_scalar_kernel_2d(v1Device, v2Device, stream); + LaunchVaddsDeepMerged(v1Device, v2Device, stream); ACL_CHECK(aclrtSynchronizeStream(stream)); ACL_CHECK(aclrtMemcpy(v2Host, fileSize_v2, v2Device, fileSize_v2, ACL_MEMCPY_DEVICE_TO_HOST)); diff --git a/test/vpto/cases/micro-op/vec-scalar/vmaxs-tail/compare.py b/test/vpto/cases/micro-op/vec-scalar/vmaxs-tail/compare.py deleted file mode 100644 index c13d79273..000000000 --- a/test/vpto/cases/micro-op/vec-scalar/vmaxs-tail/compare.py +++ /dev/null @@ -1,39 +0,0 @@ -#!/usr/bin/python3 -# Copyright (c) 2026 Huawei Technologies Co., Ltd. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. 
- - -import os -import sys -import numpy as np - - -def compare_bin_prefix(golden_path, output_path, dtype, eps, count): - if not os.path.exists(golden_path) or not os.path.exists(output_path): - return False - golden = np.fromfile(golden_path, dtype=dtype, count=count) - output = np.fromfile(output_path, dtype=dtype, count=count) - return golden.shape == output.shape and np.allclose( - golden, output, atol=eps, rtol=eps, equal_nan=True - ) - - -def main(): - strict = os.getenv("COMPARE_STRICT", "1") != "0" - ok = compare_bin_prefix("golden_v2.bin", "v2.bin", np.float32, 1e-4, 1000) - if not ok: - if strict: - print("[ERROR] compare failed") - sys.exit(2) - print("[WARN] compare failed (non-gating)") - return - print("[INFO] compare passed") - - -if __name__ == "__main__": - main() diff --git a/test/vpto/cases/micro-op/vec-scalar/vmaxs-tail/golden.py b/test/vpto/cases/micro-op/vec-scalar/vmaxs-tail/golden.py deleted file mode 100644 index 0b08cbcab..000000000 --- a/test/vpto/cases/micro-op/vec-scalar/vmaxs-tail/golden.py +++ /dev/null @@ -1,50 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) 2026 Huawei Technologies Co., Ltd. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. 
- -# coding=utf-8 - -import argparse -from pathlib import Path - -import numpy as np - - -ROWS = 32 -COLS = 32 -SEED = 19 -SCALE = np.float32(3.14) -LOGICAL_ELEMS = 1000 -OUT_SENTINEL = np.float32(-123.25) - - -def generate(output_dir: Path, seed: int) -> None: - rng = np.random.default_rng(seed) - v1 = rng.random((ROWS, COLS), dtype=np.float32) - v2 = np.full((ROWS, COLS), OUT_SENTINEL, dtype=np.float32) - golden_v2 = np.full((ROWS, COLS), OUT_SENTINEL, dtype=np.float32) - golden_v2.reshape(-1)[:LOGICAL_ELEMS] = np.maximum( - v1.reshape(-1)[:LOGICAL_ELEMS], SCALE - ).astype(np.float32, copy=False) - - output_dir.mkdir(parents=True, exist_ok=True) - v1.reshape(-1).tofile(output_dir / "v1.bin") - v2.reshape(-1).tofile(output_dir / "v2.bin") - golden_v2.reshape(-1).tofile(output_dir / "golden_v2.bin") - - -def main() -> None: - parser = argparse.ArgumentParser() - parser.add_argument("--output-dir", type=Path, default=Path(".")) - parser.add_argument("--seed", type=int, default=SEED) - args = parser.parse_args() - generate(args.output_dir, args.seed) - - -if __name__ == "__main__": - main() diff --git a/test/vpto/cases/micro-op/vec-scalar/vmaxs-tail/kernel.pto b/test/vpto/cases/micro-op/vec-scalar/vmaxs-tail/kernel.pto deleted file mode 100644 index de40999f8..000000000 --- a/test/vpto/cases/micro-op/vec-scalar/vmaxs-tail/kernel.pto +++ /dev/null @@ -1,44 +0,0 @@ -module attributes {pto.target_arch = "a5", pto.kernel_kind = #pto.kernel_kind} { - func.func @vmaxs_tail_kernel_2d(%arg0: !pto.ptr, - %arg1: !pto.ptr) attributes {pto.kernel} { - %c0 = arith.constant 0 : index - %c64 = arith.constant 64 : index - %c1024 = arith.constant 1024 : index - %c0_i64 = arith.constant 0 : i64 - %c1_i64 = arith.constant 1 : i64 - %c32_i64 = arith.constant 32 : i64 - %c128_i64 = arith.constant 128 : i64 - %c4096_i64 = arith.constant 4096 : i64 - %c1000_i32 = arith.constant 1000 : i32 - %cst = arith.constant 3.140000e+00 : f32 - - %ub_in = pto.castptr %c0_i64 : i64 -> !pto.ptr - 
%ub_out = pto.castptr %c4096_i64 : i64 -> !pto.ptr - - %false = arith.constant false - pto.mte_gm_ub %arg0, %ub_in, %c0_i64, %c128_i64 - nburst(%c32_i64, %c128_i64, %c128_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 - - pto.set_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] - pto.wait_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] - - pto.vecscope { - %_:1 = scf.for %offset = %c0 to %c1024 step %c64 iter_args(%remaining = %c1000_i32) -> (i32) { - %mask, %next_remaining = pto.plt_b32 %remaining : i32 -> !pto.mask, i32 - %vec = pto.vlds %ub_in[%offset] : !pto.ptr -> !pto.vreg<64xf32> - %maxv = pto.vmaxs %vec, %cst, %mask : !pto.vreg<64xf32>, f32, !pto.mask -> !pto.vreg<64xf32> - pto.vsts %maxv, %ub_out[%offset], %mask : !pto.vreg<64xf32>, !pto.ptr, !pto.mask - scf.yield %next_remaining : i32 - } - } - - pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.mte_ub_gm %ub_out, %arg1, %c128_i64 - nburst(%c32_i64, %c128_i64, %c128_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64 - pto.barrier #pto.pipe - return - } -} diff --git a/test/vpto/cases/micro-op/vec-scalar/vmaxs-tail/launch.cpp b/test/vpto/cases/micro-op/vec-scalar/vmaxs-tail/launch.cpp deleted file mode 100644 index d5ae524ce..000000000 --- a/test/vpto/cases/micro-op/vec-scalar/vmaxs-tail/launch.cpp +++ /dev/null @@ -1,44 +0,0 @@ -// Copyright (c) 2026 Huawei Technologies Co., Ltd. -// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). -// Please refer to the License for details. You may not use this file except in compliance with the License. -// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. 
-// See LICENSE in the root of the software repository for the full text of the License. - -#ifndef __VEC_SCOPE__ -#define __VEC_SCOPE__ -#endif - -#if defined(__CCE_AICORE__) && defined(__NPU_ARCH__) && (__NPU_ARCH__ == 2201) -typedef struct { unsigned char v; } hifloat8_t; -typedef struct { unsigned char v; } float8_e4m3_t; -typedef struct { unsigned char v; } float8_e5m2_t; -typedef struct { unsigned char v; } float8_e8m0_t; -typedef struct { unsigned char v; } float4_e1m2x2_t; -typedef struct { unsigned char v; } float4_e2m1x2_t; -#endif -#include -#if defined(__CCE_AICORE__) && defined(PTOAS_ENABLE_CCE_PRINT) -#include -#endif - -#if !defined(__CCE_AICORE__) && !defined(TMRGSORT_HPP) -struct MrgSortExecutedNumList { - uint16_t mrgSortList0; - uint16_t mrgSortList1; - uint16_t mrgSortList2; - uint16_t mrgSortList3; -}; -#endif -#ifndef __CPU_SIM -#include "acl/acl.h" -#endif - -extern "C" __global__ [aicore] void vmaxs_tail_kernel_2d(__gm__ float *v1, - __gm__ float *v2); - -void LaunchVadds_tail_kernel_2d(float *v1, float *v2, void *stream) { - vmaxs_tail_kernel_2d<<<1, nullptr, stream>>>((__gm__ float *)v1, - (__gm__ float *)v2); -} diff --git a/test/vpto/cases/micro-op/vec-scalar/vmaxs-tail/main.cpp b/test/vpto/cases/micro-op/vec-scalar/vmaxs-tail/main.cpp deleted file mode 100644 index ab77e6b1a..000000000 --- a/test/vpto/cases/micro-op/vec-scalar/vmaxs-tail/main.cpp +++ /dev/null @@ -1,83 +0,0 @@ -// Copyright (c) 2026 Huawei Technologies Co., Ltd. -// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). -// Please refer to the License for details. You may not use this file except in compliance with the License. -// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. 
-// See LICENSE in the root of the software repository for the full text of the License. - -/** -Copyright (c) 2025 Huawei Technologies Co., Ltd. -*/ - -#include "test_common.h" -#include "acl/acl.h" -#include -#include - -using namespace PtoTestCommon; - -#define ACL_CHECK(expr) \ - do { \ - const aclError _ret = (expr); \ - if (_ret != ACL_SUCCESS) { \ - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", #expr, \ - (int)_ret, __FILE__, __LINE__); \ - rc = 1; \ - goto cleanup; \ - } \ - } while (0) - -void LaunchVadds_tail_kernel_2d(float *v1, float *v2, void *stream); - -int main() { - size_t elemCount_v1 = 1024; - size_t fileSize_v1 = elemCount_v1 * sizeof(float); - size_t elemCount_v2 = 1024; - size_t fileSize_v2 = elemCount_v2 * sizeof(float); - float *v1Host = nullptr; - float *v1Device = nullptr; - float *v2Host = nullptr; - float *v2Device = nullptr; - int rc = 0; - bool aclInited = false; - bool deviceSet = false; - int deviceId = 0; - aclrtStream stream = nullptr; - - ACL_CHECK(aclInit(nullptr)); - aclInited = true; - if (const char *envDevice = std::getenv("ACL_DEVICE_ID")) - deviceId = std::atoi(envDevice); - ACL_CHECK(aclrtSetDevice(deviceId)); - deviceSet = true; - ACL_CHECK(aclrtCreateStream(&stream)); - ACL_CHECK(aclrtMallocHost((void **)(&v1Host), fileSize_v1)); - ACL_CHECK(aclrtMallocHost((void **)(&v2Host), fileSize_v2)); - ACL_CHECK(aclrtMalloc((void **)&v1Device, fileSize_v1, ACL_MEM_MALLOC_HUGE_FIRST)); - ACL_CHECK(aclrtMalloc((void **)&v2Device, fileSize_v2, ACL_MEM_MALLOC_HUGE_FIRST)); - ReadFile("./v1.bin", fileSize_v1, v1Host, fileSize_v1); - ReadFile("./v2.bin", fileSize_v2, v2Host, fileSize_v2); - ACL_CHECK(aclrtMemcpy(v1Device, fileSize_v1, v1Host, fileSize_v1, - ACL_MEMCPY_HOST_TO_DEVICE)); - ACL_CHECK(aclrtMemcpy(v2Device, fileSize_v2, v2Host, fileSize_v2, - ACL_MEMCPY_HOST_TO_DEVICE)); - LaunchVadds_tail_kernel_2d(v1Device, v2Device, stream); - ACL_CHECK(aclrtSynchronizeStream(stream)); - ACL_CHECK(aclrtMemcpy(v2Host, 
fileSize_v2, v2Device, fileSize_v2, - ACL_MEMCPY_DEVICE_TO_HOST)); - WriteFile("./v2.bin", v2Host, fileSize_v2); - -cleanup: - aclrtFree(v1Device); - aclrtFree(v2Device); - aclrtFreeHost(v1Host); - aclrtFreeHost(v2Host); - if (stream != nullptr) - aclrtDestroyStream(stream); - if (deviceSet) - aclrtResetDevice(deviceId); - if (aclInited) - aclFinalize(); - return rc; -} diff --git a/test/vpto/cases/micro-op/vec-scalar/vmaxs/kernel.pto b/test/vpto/cases/micro-op/vec-scalar/vmaxs/kernel.pto index f57b56001..52b3b9691 100644 --- a/test/vpto/cases/micro-op/vec-scalar/vmaxs/kernel.pto +++ b/test/vpto/cases/micro-op/vec-scalar/vmaxs/kernel.pto @@ -1,45 +1,272 @@ module attributes {pto.target_arch = "a5", pto.kernel_kind = #pto.kernel_kind} { - func.func @vec_max_scalar_kernel_2d(%arg0: !pto.ptr, - %arg1: !pto.ptr) attributes {pto.kernel} { - %c0 = arith.constant 0 : index - %c1 = arith.constant 1 : index - %c64 = arith.constant 64 : index - %c1024 = arith.constant 1024 : index - %c0_i64 = arith.constant 0 : i64 - %c1_i64 = arith.constant 1 : i64 - %c32_i64 = arith.constant 32 : i64 - %c128_i64 = arith.constant 128 : i64 - %c4096_i64 = arith.constant 4096 : i64 - %c1024_i32 = arith.constant 1024 : i32 - %cst = arith.constant 3.140000e+00 : f32 - - %ub_in = pto.castptr %c0_i64 : i64 -> !pto.ptr - %ub_out = pto.castptr %c4096_i64 : i64 -> !pto.ptr - - %false = arith.constant false - pto.mte_gm_ub %arg0, %ub_in, %c0_i64, %c128_i64 - nburst(%c32_i64, %c128_i64, %c128_i64) + func.func @vmaxs_deep_merged_kernel(%arg0: !pto.ptr, %arg1: !pto.ptr, %arg2: !pto.ptr, %arg3: !pto.ptr) attributes {pto.kernel} { + %__deep_merge_guard = arith.constant false + // inactive merged from vmaxs_tail_kernel_2d + scf.if %__deep_merge_guard { + + %c0_m0 = arith.constant 0 : index + %c64_m0 = arith.constant 64 : index + %c1024_m0 = arith.constant 1024 : index + %c0_i64_m0 = arith.constant 0 : i64 + %c1_i64_m0 = arith.constant 1 : i64 + %c32_i64_m0 = arith.constant 32 : i64 + %c128_i64_m0 = 
arith.constant 128 : i64 + %c4096_i64_m0 = arith.constant 4096 : i64 + %c1000_i32_m0 = arith.constant 1000 : i32 + %cst_m0 = arith.constant 3.140000e+00 : f32 + + %ub_in_m0 = pto.castptr %c0_i64_m0 : i64 -> !pto.ptr + %ub_out_m0 = pto.castptr %c4096_i64_m0 : i64 -> !pto.ptr + + %false_m0 = arith.constant false + pto.mte_gm_ub %arg0, %ub_in_m0, %c0_i64_m0, %c128_i64_m0 + nburst(%c32_i64_m0, %c128_i64_m0, %c128_i64_m0) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 + + pto.set_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] + pto.wait_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] + + pto.vecscope { + %__m0:1 = scf.for %offset_m0 = %c0_m0 to %c1024_m0 step %c64_m0 iter_args(%remaining_m0 = %c1000_i32_m0) -> (i32) { + %mask_m0, %next_remaining_m0 = pto.plt_b32 %remaining_m0 : i32 -> !pto.mask, i32 + %vec_m0 = pto.vlds %ub_in_m0[%offset_m0] : !pto.ptr -> !pto.vreg<64xf32> + %maxv_m0 = pto.vmaxs %vec_m0, %cst_m0, %mask_m0 : !pto.vreg<64xf32>, f32, !pto.mask -> !pto.vreg<64xf32> + pto.vsts %maxv_m0, %ub_out_m0[%offset_m0], %mask_m0 : !pto.vreg<64xf32>, !pto.ptr, !pto.mask + scf.yield %next_remaining_m0 : i32 + } + } + + pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] + pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] + pto.mte_ub_gm %ub_out_m0, %arg1, %c128_i64_m0 + nburst(%c32_i64_m0, %c128_i64_m0, %c128_i64_m0) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64 + pto.barrier #pto.pipe + + } + // active merged from vec_max_scalar_kernel_2d + + %c0_m1 = arith.constant 0 : index + %c1_m1 = arith.constant 1 : index + %c64_m1 = arith.constant 64 : index + %c1024_m1 = arith.constant 1024 : index + %c0_i64_m1 = arith.constant 0 : i64 + %c1_i64_m1 = arith.constant 1 : i64 + %c32_i64_m1 = arith.constant 32 : i64 + %c128_i64_m1 = arith.constant 128 : i64 + %c4096_i64_m1 = arith.constant 4096 : i64 + %c1024_i32_m1 = arith.constant 1024 : i32 + %cst_m1 = arith.constant 3.140000e+00 : f32 + + %ub_in_m1 = pto.castptr %c0_i64_m1 : i64 -> !pto.ptr + %ub_out_m1 = pto.castptr %c4096_i64_m1 : i64 -> 
!pto.ptr + + %false_m1 = arith.constant false + pto.mte_gm_ub %arg2, %ub_in_m1, %c0_i64_m1, %c128_i64_m1 + nburst(%c32_i64_m1, %c128_i64_m1, %c128_i64_m1) : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 pto.set_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] pto.wait_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] pto.vecscope { - %_:1 = scf.for %offset = %c0 to %c1024 step %c64 iter_args(%remaining = %c1024_i32) -> (i32) { - %mask, %next_remaining = pto.plt_b32 %remaining : i32 -> !pto.mask, i32 - %vec = pto.vlds %ub_in[%offset] : !pto.ptr -> !pto.vreg<64xf32> - %maxv = pto.vmaxs %vec, %cst, %mask : !pto.vreg<64xf32>, f32, !pto.mask -> !pto.vreg<64xf32> - pto.vsts %maxv, %ub_out[%offset], %mask : !pto.vreg<64xf32>, !pto.ptr, !pto.mask - scf.yield %next_remaining : i32 + %__m1:1 = scf.for %offset_m1 = %c0_m1 to %c1024_m1 step %c64_m1 iter_args(%remaining_m1 = %c1024_i32_m1) -> (i32) { + %mask_m1, %next_remaining_m1 = pto.plt_b32 %remaining_m1 : i32 -> !pto.mask, i32 + %vec_m1 = pto.vlds %ub_in_m1[%offset_m1] : !pto.ptr -> !pto.vreg<64xf32> + %maxv_m1 = pto.vmaxs %vec_m1, %cst_m1, %mask_m1 : !pto.vreg<64xf32>, f32, !pto.mask -> !pto.vreg<64xf32> + pto.vsts %maxv_m1, %ub_out_m1[%offset_m1], %mask_m1 : !pto.vreg<64xf32>, !pto.ptr, !pto.mask + scf.yield %next_remaining_m1 : i32 } } pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.mte_ub_gm %ub_out, %arg1, %c128_i64 - nburst(%c32_i64, %c128_i64, %c128_i64) + pto.mte_ub_gm %ub_out_m1, %arg3, %c128_i64_m1 + nburst(%c32_i64_m1, %c128_i64_m1, %c128_i64_m1) : !pto.ptr, !pto.ptr, i64, i64, i64, i64 pto.barrier #pto.pipe + %__case_merge_guard = arith.constant false + + // inactive cross-case merge from micro-op/vec-scalar/vmins + scf.if %__case_merge_guard { + + %__deep_merge_guard_cmg18_1 = arith.constant false + // inactive merged from vmins_tail_kernel_2d + scf.if %__deep_merge_guard_cmg18_1 { + + %c0_m0_cmg18_1 = arith.constant 0 : index + %c64_m0_cmg18_1 = arith.constant 64 : index 
+ %c1024_m0_cmg18_1 = arith.constant 1024 : index + %c0_i64_m0_cmg18_1 = arith.constant 0 : i64 + %c1_i64_m0_cmg18_1 = arith.constant 1 : i64 + %c32_i64_m0_cmg18_1 = arith.constant 32 : i64 + %c128_i64_m0_cmg18_1 = arith.constant 128 : i64 + %c4096_i64_m0_cmg18_1 = arith.constant 4096 : i64 + %c1000_i32_m0_cmg18_1 = arith.constant 1000 : i32 + %cst_m0_cmg18_1 = arith.constant 3.140000e+00 : f32 + + %ub_in_m0_cmg18_1 = pto.castptr %c0_i64_m0_cmg18_1 : i64 -> !pto.ptr + %ub_out_m0_cmg18_1 = pto.castptr %c4096_i64_m0_cmg18_1 : i64 -> !pto.ptr + + %false_m0_cmg18_1 = arith.constant false + pto.mte_gm_ub %arg0, %ub_in_m0_cmg18_1, %c0_i64_m0_cmg18_1, %c128_i64_m0_cmg18_1 + nburst(%c32_i64_m0_cmg18_1, %c128_i64_m0_cmg18_1, %c128_i64_m0_cmg18_1) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 + + pto.set_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] + pto.wait_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] + + pto.vecscope { + %__m0_cmg18_1:1 = scf.for %offset_m0_cmg18_1 = %c0_m0_cmg18_1 to %c1024_m0_cmg18_1 step %c64_m0_cmg18_1 iter_args(%remaining_m0_cmg18_1 = %c1000_i32_m0_cmg18_1) -> (i32) { + %mask_m0_cmg18_1, %next_remaining_m0_cmg18_1 = pto.plt_b32 %remaining_m0_cmg18_1 : i32 -> !pto.mask, i32 + %vec_m0_cmg18_1 = pto.vlds %ub_in_m0_cmg18_1[%offset_m0_cmg18_1] : !pto.ptr -> !pto.vreg<64xf32> + %minv_m0_cmg18_1 = pto.vmins %vec_m0_cmg18_1, %cst_m0_cmg18_1, %mask_m0_cmg18_1 : !pto.vreg<64xf32>, f32, !pto.mask -> !pto.vreg<64xf32> + pto.vsts %minv_m0_cmg18_1, %ub_out_m0_cmg18_1[%offset_m0_cmg18_1], %mask_m0_cmg18_1 : !pto.vreg<64xf32>, !pto.ptr, !pto.mask + scf.yield %next_remaining_m0_cmg18_1 : i32 + } + } + + pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] + pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] + pto.mte_ub_gm %ub_out_m0_cmg18_1, %arg1, %c128_i64_m0_cmg18_1 + nburst(%c32_i64_m0_cmg18_1, %c128_i64_m0_cmg18_1, %c128_i64_m0_cmg18_1) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64 + pto.barrier #pto.pipe + + } + // active merged from vec_min_scalar_kernel_2d + + 
%c0_m1_cmg18_1 = arith.constant 0 : index + %c1_m1_cmg18_1 = arith.constant 1 : index + %c64_m1_cmg18_1 = arith.constant 64 : index + %c1024_m1_cmg18_1 = arith.constant 1024 : index + %c0_i64_m1_cmg18_1 = arith.constant 0 : i64 + %c1_i64_m1_cmg18_1 = arith.constant 1 : i64 + %c32_i64_m1_cmg18_1 = arith.constant 32 : i64 + %c128_i64_m1_cmg18_1 = arith.constant 128 : i64 + %c4096_i64_m1_cmg18_1 = arith.constant 4096 : i64 + %c1024_i32_m1_cmg18_1 = arith.constant 1024 : i32 + %cst_m1_cmg18_1 = arith.constant 3.140000e+00 : f32 + + %ub_in_m1_cmg18_1 = pto.castptr %c0_i64_m1_cmg18_1 : i64 -> !pto.ptr + %ub_out_m1_cmg18_1 = pto.castptr %c4096_i64_m1_cmg18_1 : i64 -> !pto.ptr + + %false_m1_cmg18_1 = arith.constant false + pto.mte_gm_ub %arg2, %ub_in_m1_cmg18_1, %c0_i64_m1_cmg18_1, %c128_i64_m1_cmg18_1 + nburst(%c32_i64_m1_cmg18_1, %c128_i64_m1_cmg18_1, %c128_i64_m1_cmg18_1) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 + + pto.set_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] + pto.wait_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] + + pto.vecscope { + %__m1_cmg18_1:1 = scf.for %offset_m1_cmg18_1 = %c0_m1_cmg18_1 to %c1024_m1_cmg18_1 step %c64_m1_cmg18_1 iter_args(%remaining_m1_cmg18_1 = %c1024_i32_m1_cmg18_1) -> (i32) { + %mask_m1_cmg18_1, %next_remaining_m1_cmg18_1 = pto.plt_b32 %remaining_m1_cmg18_1 : i32 -> !pto.mask, i32 + %vec_m1_cmg18_1 = pto.vlds %ub_in_m1_cmg18_1[%offset_m1_cmg18_1] : !pto.ptr -> !pto.vreg<64xf32> + %minv_m1_cmg18_1 = pto.vmins %vec_m1_cmg18_1, %cst_m1_cmg18_1, %mask_m1_cmg18_1 : !pto.vreg<64xf32>, f32, !pto.mask -> !pto.vreg<64xf32> + pto.vsts %minv_m1_cmg18_1, %ub_out_m1_cmg18_1[%offset_m1_cmg18_1], %mask_m1_cmg18_1 : !pto.vreg<64xf32>, !pto.ptr, !pto.mask + scf.yield %next_remaining_m1_cmg18_1 : i32 + } + } + + pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] + pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] + pto.mte_ub_gm %ub_out_m1_cmg18_1, %arg3, %c128_i64_m1_cmg18_1 + nburst(%c32_i64_m1_cmg18_1, %c128_i64_m1_cmg18_1, %c128_i64_m1_cmg18_1) + : 
!pto.ptr, !pto.ptr, i64, i64, i64, i64 + pto.barrier #pto.pipe + } + + // inactive cross-case merge from micro-op/vec-scalar/vmuls + scf.if %__case_merge_guard { + + %__deep_merge_guard_cmg18_2 = arith.constant false + // inactive merged from vmuls_tail_kernel_2d + scf.if %__deep_merge_guard_cmg18_2 { + + %c0_m0_cmg18_2 = arith.constant 0 : index + %c64_m0_cmg18_2 = arith.constant 64 : index + %c1024_m0_cmg18_2 = arith.constant 1024 : index + %c0_i64_m0_cmg18_2 = arith.constant 0 : i64 + %c1_i64_m0_cmg18_2 = arith.constant 1 : i64 + %c32_i64_m0_cmg18_2 = arith.constant 32 : i64 + %c128_i64_m0_cmg18_2 = arith.constant 128 : i64 + %c4096_i64_m0_cmg18_2 = arith.constant 4096 : i64 + %c1000_i32_m0_cmg18_2 = arith.constant 1000 : i32 + %cst_m0_cmg18_2 = arith.constant 3.140000e+00 : f32 + + %ub_in_m0_cmg18_2 = pto.castptr %c0_i64_m0_cmg18_2 : i64 -> !pto.ptr + %ub_out_m0_cmg18_2 = pto.castptr %c4096_i64_m0_cmg18_2 : i64 -> !pto.ptr + + %false_m0_cmg18_2 = arith.constant false + pto.mte_gm_ub %arg0, %ub_in_m0_cmg18_2, %c0_i64_m0_cmg18_2, %c128_i64_m0_cmg18_2 + nburst(%c32_i64_m0_cmg18_2, %c128_i64_m0_cmg18_2, %c128_i64_m0_cmg18_2) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 + + pto.set_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] + pto.wait_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] + + pto.vecscope { + %__m0_cmg18_2:1 = scf.for %offset_m0_cmg18_2 = %c0_m0_cmg18_2 to %c1024_m0_cmg18_2 step %c64_m0_cmg18_2 iter_args(%remaining_m0_cmg18_2 = %c1000_i32_m0_cmg18_2) -> (i32) { + %mask_m0_cmg18_2, %next_remaining_m0_cmg18_2 = pto.plt_b32 %remaining_m0_cmg18_2 : i32 -> !pto.mask, i32 + %vec_m0_cmg18_2 = pto.vlds %ub_in_m0_cmg18_2[%offset_m0_cmg18_2] : !pto.ptr -> !pto.vreg<64xf32> + %prod_m0_cmg18_2 = pto.vmuls %vec_m0_cmg18_2, %cst_m0_cmg18_2, %mask_m0_cmg18_2 : !pto.vreg<64xf32>, f32, !pto.mask -> !pto.vreg<64xf32> + pto.vsts %prod_m0_cmg18_2, %ub_out_m0_cmg18_2[%offset_m0_cmg18_2], %mask_m0_cmg18_2 : !pto.vreg<64xf32>, !pto.ptr, !pto.mask + scf.yield %next_remaining_m0_cmg18_2 
: i32 + } + } + + pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] + pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] + pto.mte_ub_gm %ub_out_m0_cmg18_2, %arg1, %c128_i64_m0_cmg18_2 + nburst(%c32_i64_m0_cmg18_2, %c128_i64_m0_cmg18_2, %c128_i64_m0_cmg18_2) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64 + pto.barrier #pto.pipe + + } + // active merged from vec_mul_scalar_kernel_2d + + %c0_m1_cmg18_2 = arith.constant 0 : index + %c1_m1_cmg18_2 = arith.constant 1 : index + %c64_m1_cmg18_2 = arith.constant 64 : index + %c1024_m1_cmg18_2 = arith.constant 1024 : index + %c0_i64_m1_cmg18_2 = arith.constant 0 : i64 + %c1_i64_m1_cmg18_2 = arith.constant 1 : i64 + %c32_i64_m1_cmg18_2 = arith.constant 32 : i64 + %c128_i64_m1_cmg18_2 = arith.constant 128 : i64 + %c4096_i64_m1_cmg18_2 = arith.constant 4096 : i64 + %c1024_i32_m1_cmg18_2 = arith.constant 1024 : i32 + %cst_m1_cmg18_2 = arith.constant 3.140000e+00 : f32 + + %ub_in_m1_cmg18_2 = pto.castptr %c0_i64_m1_cmg18_2 : i64 -> !pto.ptr + %ub_out_m1_cmg18_2 = pto.castptr %c4096_i64_m1_cmg18_2 : i64 -> !pto.ptr + + %false_m1_cmg18_2 = arith.constant false + pto.mte_gm_ub %arg2, %ub_in_m1_cmg18_2, %c0_i64_m1_cmg18_2, %c128_i64_m1_cmg18_2 + nburst(%c32_i64_m1_cmg18_2, %c128_i64_m1_cmg18_2, %c128_i64_m1_cmg18_2) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 + + pto.set_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] + pto.wait_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] + + pto.vecscope { + %__m1_cmg18_2:1 = scf.for %offset_m1_cmg18_2 = %c0_m1_cmg18_2 to %c1024_m1_cmg18_2 step %c64_m1_cmg18_2 iter_args(%remaining_m1_cmg18_2 = %c1024_i32_m1_cmg18_2) -> (i32) { + %mask_m1_cmg18_2, %next_remaining_m1_cmg18_2 = pto.plt_b32 %remaining_m1_cmg18_2 : i32 -> !pto.mask, i32 + %vec_m1_cmg18_2 = pto.vlds %ub_in_m1_cmg18_2[%offset_m1_cmg18_2] : !pto.ptr -> !pto.vreg<64xf32> + %prod_m1_cmg18_2 = pto.vmuls %vec_m1_cmg18_2, %cst_m1_cmg18_2, %mask_m1_cmg18_2 : !pto.vreg<64xf32>, f32, !pto.mask -> !pto.vreg<64xf32> + pto.vsts %prod_m1_cmg18_2, 
%ub_out_m1_cmg18_2[%offset_m1_cmg18_2], %mask_m1_cmg18_2 : !pto.vreg<64xf32>, !pto.ptr, !pto.mask + scf.yield %next_remaining_m1_cmg18_2 : i32 + } + } + + pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] + pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] + pto.mte_ub_gm %ub_out_m1_cmg18_2, %arg3, %c128_i64_m1_cmg18_2 + nburst(%c32_i64_m1_cmg18_2, %c128_i64_m1_cmg18_2, %c128_i64_m1_cmg18_2) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64 + pto.barrier #pto.pipe + } + return } } diff --git a/test/vpto/cases/micro-op/vec-scalar/vmaxs/launch.cpp b/test/vpto/cases/micro-op/vec-scalar/vmaxs/launch.cpp index a08848672..56d1a3371 100644 --- a/test/vpto/cases/micro-op/vec-scalar/vmaxs/launch.cpp +++ b/test/vpto/cases/micro-op/vec-scalar/vmaxs/launch.cpp @@ -5,11 +5,9 @@ // THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, // INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. // See LICENSE in the root of the software repository for the full text of the License. 
- #ifndef __VEC_SCOPE__ #define __VEC_SCOPE__ #endif - #if defined(__CCE_AICORE__) && defined(__NPU_ARCH__) && (__NPU_ARCH__ == 2201) typedef struct { unsigned char v; } hifloat8_t; typedef struct { unsigned char v; } float8_e4m3_t; @@ -22,23 +20,23 @@ typedef struct { unsigned char v; } float4_e2m1x2_t; #if defined(__CCE_AICORE__) && defined(PTOAS_ENABLE_CCE_PRINT) #include #endif - #if !defined(__CCE_AICORE__) && !defined(TMRGSORT_HPP) -struct MrgSortExecutedNumList { - uint16_t mrgSortList0; - uint16_t mrgSortList1; - uint16_t mrgSortList2; - uint16_t mrgSortList3; -}; +struct MrgSortExecutedNumList { uint16_t mrgSortList0,mrgSortList1,mrgSortList2,mrgSortList3; }; #endif #ifndef __CPU_SIM #include "acl/acl.h" #endif -extern "C" __global__ [aicore] void vec_max_scalar_kernel_2d(__gm__ float *v1, - __gm__ float *v2); +extern "C" __global__ [aicore] void vmaxs_deep_merged_kernel( + __gm__ float * arg0, + __gm__ float * arg1, + __gm__ float * arg2, + __gm__ float * arg3); -void LaunchVec_max_scalar_kernel_2d(float *v1, float *v2, void *stream) { - vec_max_scalar_kernel_2d<<<1, nullptr, stream>>>((__gm__ float *)v1, - (__gm__ float *)v2); +void LaunchVmaxsDeepMerged(float * p0, float * p1, void *stream) { + vmaxs_deep_merged_kernel<<<1, nullptr, stream>>>( + (__gm__ float *)p0, + (__gm__ float *)p0, + (__gm__ float *)p0, + (__gm__ float *)p1); } diff --git a/test/vpto/cases/micro-op/vec-scalar/vmaxs/main.cpp b/test/vpto/cases/micro-op/vec-scalar/vmaxs/main.cpp index 47ce3f58b..0cf9f093e 100644 --- a/test/vpto/cases/micro-op/vec-scalar/vmaxs/main.cpp +++ b/test/vpto/cases/micro-op/vec-scalar/vmaxs/main.cpp @@ -28,8 +28,8 @@ using namespace PtoTestCommon; } \ } while (0) -void LaunchVec_max_scalar_kernel_2d(float *v1, float *v2, void *stream); +void LaunchVmaxsDeepMerged(float * p0, float * p1, void *stream); int main() { size_t elemCount_v1 = 1024; size_t fileSize_v1 = elemCount_v1 * sizeof(float); @@ -62,7 +62,7 @@ int main() { ACL_MEMCPY_HOST_TO_DEVICE)); 
ACL_CHECK(aclrtMemcpy(v2Device, fileSize_v2, v2Host, fileSize_v2, ACL_MEMCPY_HOST_TO_DEVICE)); - LaunchVec_max_scalar_kernel_2d(v1Device, v2Device, stream); + LaunchVmaxsDeepMerged(v1Device, v2Device, stream); ACL_CHECK(aclrtSynchronizeStream(stream)); ACL_CHECK(aclrtMemcpy(v2Host, fileSize_v2, v2Device, fileSize_v2, ACL_MEMCPY_DEVICE_TO_HOST)); diff --git a/test/vpto/cases/micro-op/vec-scalar/vmins-tail/compare.py b/test/vpto/cases/micro-op/vec-scalar/vmins-tail/compare.py deleted file mode 100644 index c13d79273..000000000 --- a/test/vpto/cases/micro-op/vec-scalar/vmins-tail/compare.py +++ /dev/null @@ -1,39 +0,0 @@ -#!/usr/bin/python3 -# Copyright (c) 2026 Huawei Technologies Co., Ltd. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. 
- - -import os -import sys -import numpy as np - - -def compare_bin_prefix(golden_path, output_path, dtype, eps, count): - if not os.path.exists(golden_path) or not os.path.exists(output_path): - return False - golden = np.fromfile(golden_path, dtype=dtype, count=count) - output = np.fromfile(output_path, dtype=dtype, count=count) - return golden.shape == output.shape and np.allclose( - golden, output, atol=eps, rtol=eps, equal_nan=True - ) - - -def main(): - strict = os.getenv("COMPARE_STRICT", "1") != "0" - ok = compare_bin_prefix("golden_v2.bin", "v2.bin", np.float32, 1e-4, 1000) - if not ok: - if strict: - print("[ERROR] compare failed") - sys.exit(2) - print("[WARN] compare failed (non-gating)") - return - print("[INFO] compare passed") - - -if __name__ == "__main__": - main() diff --git a/test/vpto/cases/micro-op/vec-scalar/vmins-tail/golden.py b/test/vpto/cases/micro-op/vec-scalar/vmins-tail/golden.py deleted file mode 100644 index e4e63235a..000000000 --- a/test/vpto/cases/micro-op/vec-scalar/vmins-tail/golden.py +++ /dev/null @@ -1,50 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) 2026 Huawei Technologies Co., Ltd. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. 
- -# coding=utf-8 - -import argparse -from pathlib import Path - -import numpy as np - - -ROWS = 32 -COLS = 32 -SEED = 19 -SCALE = np.float32(3.14) -LOGICAL_ELEMS = 1000 -OUT_SENTINEL = np.float32(-123.25) - - -def generate(output_dir: Path, seed: int) -> None: - rng = np.random.default_rng(seed) - v1 = rng.random((ROWS, COLS), dtype=np.float32) - v2 = np.full((ROWS, COLS), OUT_SENTINEL, dtype=np.float32) - golden_v2 = np.full((ROWS, COLS), OUT_SENTINEL, dtype=np.float32) - golden_v2.reshape(-1)[:LOGICAL_ELEMS] = np.minimum( - v1.reshape(-1)[:LOGICAL_ELEMS], SCALE - ).astype(np.float32, copy=False) - - output_dir.mkdir(parents=True, exist_ok=True) - v1.reshape(-1).tofile(output_dir / "v1.bin") - v2.reshape(-1).tofile(output_dir / "v2.bin") - golden_v2.reshape(-1).tofile(output_dir / "golden_v2.bin") - - -def main() -> None: - parser = argparse.ArgumentParser() - parser.add_argument("--output-dir", type=Path, default=Path(".")) - parser.add_argument("--seed", type=int, default=SEED) - args = parser.parse_args() - generate(args.output_dir, args.seed) - - -if __name__ == "__main__": - main() diff --git a/test/vpto/cases/micro-op/vec-scalar/vmins-tail/kernel.pto b/test/vpto/cases/micro-op/vec-scalar/vmins-tail/kernel.pto deleted file mode 100644 index 9a6f30010..000000000 --- a/test/vpto/cases/micro-op/vec-scalar/vmins-tail/kernel.pto +++ /dev/null @@ -1,44 +0,0 @@ -module attributes {pto.target_arch = "a5", pto.kernel_kind = #pto.kernel_kind} { - func.func @vmins_tail_kernel_2d(%arg0: !pto.ptr, - %arg1: !pto.ptr) attributes {pto.kernel} { - %c0 = arith.constant 0 : index - %c64 = arith.constant 64 : index - %c1024 = arith.constant 1024 : index - %c0_i64 = arith.constant 0 : i64 - %c1_i64 = arith.constant 1 : i64 - %c32_i64 = arith.constant 32 : i64 - %c128_i64 = arith.constant 128 : i64 - %c4096_i64 = arith.constant 4096 : i64 - %c1000_i32 = arith.constant 1000 : i32 - %cst = arith.constant 3.140000e+00 : f32 - - %ub_in = pto.castptr %c0_i64 : i64 -> !pto.ptr - 
%ub_out = pto.castptr %c4096_i64 : i64 -> !pto.ptr - - %false = arith.constant false - pto.mte_gm_ub %arg0, %ub_in, %c0_i64, %c128_i64 - nburst(%c32_i64, %c128_i64, %c128_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 - - pto.set_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] - pto.wait_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] - - pto.vecscope { - %_:1 = scf.for %offset = %c0 to %c1024 step %c64 iter_args(%remaining = %c1000_i32) -> (i32) { - %mask, %next_remaining = pto.plt_b32 %remaining : i32 -> !pto.mask, i32 - %vec = pto.vlds %ub_in[%offset] : !pto.ptr -> !pto.vreg<64xf32> - %minv = pto.vmins %vec, %cst, %mask : !pto.vreg<64xf32>, f32, !pto.mask -> !pto.vreg<64xf32> - pto.vsts %minv, %ub_out[%offset], %mask : !pto.vreg<64xf32>, !pto.ptr, !pto.mask - scf.yield %next_remaining : i32 - } - } - - pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.mte_ub_gm %ub_out, %arg1, %c128_i64 - nburst(%c32_i64, %c128_i64, %c128_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64 - pto.barrier #pto.pipe - return - } -} diff --git a/test/vpto/cases/micro-op/vec-scalar/vmins-tail/launch.cpp b/test/vpto/cases/micro-op/vec-scalar/vmins-tail/launch.cpp deleted file mode 100644 index 2774c3f46..000000000 --- a/test/vpto/cases/micro-op/vec-scalar/vmins-tail/launch.cpp +++ /dev/null @@ -1,44 +0,0 @@ -// Copyright (c) 2026 Huawei Technologies Co., Ltd. -// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). -// Please refer to the License for details. You may not use this file except in compliance with the License. -// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. 
-// See LICENSE in the root of the software repository for the full text of the License. - -#ifndef __VEC_SCOPE__ -#define __VEC_SCOPE__ -#endif - -#if defined(__CCE_AICORE__) && defined(__NPU_ARCH__) && (__NPU_ARCH__ == 2201) -typedef struct { unsigned char v; } hifloat8_t; -typedef struct { unsigned char v; } float8_e4m3_t; -typedef struct { unsigned char v; } float8_e5m2_t; -typedef struct { unsigned char v; } float8_e8m0_t; -typedef struct { unsigned char v; } float4_e1m2x2_t; -typedef struct { unsigned char v; } float4_e2m1x2_t; -#endif -#include -#if defined(__CCE_AICORE__) && defined(PTOAS_ENABLE_CCE_PRINT) -#include -#endif - -#if !defined(__CCE_AICORE__) && !defined(TMRGSORT_HPP) -struct MrgSortExecutedNumList { - uint16_t mrgSortList0; - uint16_t mrgSortList1; - uint16_t mrgSortList2; - uint16_t mrgSortList3; -}; -#endif -#ifndef __CPU_SIM -#include "acl/acl.h" -#endif - -extern "C" __global__ [aicore] void vmins_tail_kernel_2d(__gm__ float *v1, - __gm__ float *v2); - -void LaunchVadds_tail_kernel_2d(float *v1, float *v2, void *stream) { - vmins_tail_kernel_2d<<<1, nullptr, stream>>>((__gm__ float *)v1, - (__gm__ float *)v2); -} diff --git a/test/vpto/cases/micro-op/vec-scalar/vmins-tail/main.cpp b/test/vpto/cases/micro-op/vec-scalar/vmins-tail/main.cpp deleted file mode 100644 index ab77e6b1a..000000000 --- a/test/vpto/cases/micro-op/vec-scalar/vmins-tail/main.cpp +++ /dev/null @@ -1,83 +0,0 @@ -// Copyright (c) 2026 Huawei Technologies Co., Ltd. -// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). -// Please refer to the License for details. You may not use this file except in compliance with the License. -// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. 
-// See LICENSE in the root of the software repository for the full text of the License. - -/** -Copyright (c) 2025 Huawei Technologies Co., Ltd. -*/ - -#include "test_common.h" -#include "acl/acl.h" -#include -#include - -using namespace PtoTestCommon; - -#define ACL_CHECK(expr) \ - do { \ - const aclError _ret = (expr); \ - if (_ret != ACL_SUCCESS) { \ - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", #expr, \ - (int)_ret, __FILE__, __LINE__); \ - rc = 1; \ - goto cleanup; \ - } \ - } while (0) - -void LaunchVadds_tail_kernel_2d(float *v1, float *v2, void *stream); - -int main() { - size_t elemCount_v1 = 1024; - size_t fileSize_v1 = elemCount_v1 * sizeof(float); - size_t elemCount_v2 = 1024; - size_t fileSize_v2 = elemCount_v2 * sizeof(float); - float *v1Host = nullptr; - float *v1Device = nullptr; - float *v2Host = nullptr; - float *v2Device = nullptr; - int rc = 0; - bool aclInited = false; - bool deviceSet = false; - int deviceId = 0; - aclrtStream stream = nullptr; - - ACL_CHECK(aclInit(nullptr)); - aclInited = true; - if (const char *envDevice = std::getenv("ACL_DEVICE_ID")) - deviceId = std::atoi(envDevice); - ACL_CHECK(aclrtSetDevice(deviceId)); - deviceSet = true; - ACL_CHECK(aclrtCreateStream(&stream)); - ACL_CHECK(aclrtMallocHost((void **)(&v1Host), fileSize_v1)); - ACL_CHECK(aclrtMallocHost((void **)(&v2Host), fileSize_v2)); - ACL_CHECK(aclrtMalloc((void **)&v1Device, fileSize_v1, ACL_MEM_MALLOC_HUGE_FIRST)); - ACL_CHECK(aclrtMalloc((void **)&v2Device, fileSize_v2, ACL_MEM_MALLOC_HUGE_FIRST)); - ReadFile("./v1.bin", fileSize_v1, v1Host, fileSize_v1); - ReadFile("./v2.bin", fileSize_v2, v2Host, fileSize_v2); - ACL_CHECK(aclrtMemcpy(v1Device, fileSize_v1, v1Host, fileSize_v1, - ACL_MEMCPY_HOST_TO_DEVICE)); - ACL_CHECK(aclrtMemcpy(v2Device, fileSize_v2, v2Host, fileSize_v2, - ACL_MEMCPY_HOST_TO_DEVICE)); - LaunchVadds_tail_kernel_2d(v1Device, v2Device, stream); - ACL_CHECK(aclrtSynchronizeStream(stream)); - ACL_CHECK(aclrtMemcpy(v2Host, 
fileSize_v2, v2Device, fileSize_v2, - ACL_MEMCPY_DEVICE_TO_HOST)); - WriteFile("./v2.bin", v2Host, fileSize_v2); - -cleanup: - aclrtFree(v1Device); - aclrtFree(v2Device); - aclrtFreeHost(v1Host); - aclrtFreeHost(v2Host); - if (stream != nullptr) - aclrtDestroyStream(stream); - if (deviceSet) - aclrtResetDevice(deviceId); - if (aclInited) - aclFinalize(); - return rc; -} diff --git a/test/vpto/cases/micro-op/vec-scalar/vmins/compare.py b/test/vpto/cases/micro-op/vec-scalar/vmins/compare.py deleted file mode 100644 index 15b793fac..000000000 --- a/test/vpto/cases/micro-op/vec-scalar/vmins/compare.py +++ /dev/null @@ -1,37 +0,0 @@ -#!/usr/bin/python3 -# Copyright (c) 2026 Huawei Technologies Co., Ltd. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. 
- - -import os -import sys -import numpy as np - - -def compare_bin(golden_path, output_path, dtype, eps): - if not os.path.exists(golden_path) or not os.path.exists(output_path): - return False - golden = np.fromfile(golden_path, dtype=dtype) - output = np.fromfile(output_path, dtype=dtype) - return golden.shape == output.shape and np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True) - - -def main(): - strict = os.getenv("COMPARE_STRICT", "1") != "0" - ok = compare_bin("golden_v2.bin", "v2.bin", np.float32, 1e-4) - if not ok: - if strict: - print("[ERROR] compare failed") - sys.exit(2) - print("[WARN] compare failed (non-gating)") - return - print("[INFO] compare passed") - - -if __name__ == "__main__": - main() diff --git a/test/vpto/cases/micro-op/vec-scalar/vmins/golden.py b/test/vpto/cases/micro-op/vec-scalar/vmins/golden.py deleted file mode 100644 index 7caa057f7..000000000 --- a/test/vpto/cases/micro-op/vec-scalar/vmins/golden.py +++ /dev/null @@ -1,45 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) 2026 Huawei Technologies Co., Ltd. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. 
- -# coding=utf-8 - -import argparse -from pathlib import Path - -import numpy as np - - -ROWS = 32 -COLS = 32 -SEED = 19 -SCALE = np.float32(3.14) - - -def generate(output_dir: Path, seed: int) -> None: - rng = np.random.default_rng(seed) - v1 = rng.uniform(-8.0, 8.0, size=(ROWS, COLS)).astype(np.float32) - v2 = np.zeros((ROWS, COLS), dtype=np.float32) - golden_v2 = np.minimum(v1, SCALE).astype(np.float32, copy=False) - - output_dir.mkdir(parents=True, exist_ok=True) - v1.reshape(-1).tofile(output_dir / "v1.bin") - v2.reshape(-1).tofile(output_dir / "v2.bin") - golden_v2.reshape(-1).tofile(output_dir / "golden_v2.bin") - - -def main() -> None: - parser = argparse.ArgumentParser() - parser.add_argument("--output-dir", type=Path, default=Path(".")) - parser.add_argument("--seed", type=int, default=SEED) - args = parser.parse_args() - generate(args.output_dir, args.seed) - - -if __name__ == "__main__": - main() diff --git a/test/vpto/cases/micro-op/vec-scalar/vmins/kernel.pto b/test/vpto/cases/micro-op/vec-scalar/vmins/kernel.pto deleted file mode 100644 index 5fa5b3613..000000000 --- a/test/vpto/cases/micro-op/vec-scalar/vmins/kernel.pto +++ /dev/null @@ -1,45 +0,0 @@ -module attributes {pto.target_arch = "a5", pto.kernel_kind = #pto.kernel_kind} { - func.func @vec_min_scalar_kernel_2d(%arg0: !pto.ptr, - %arg1: !pto.ptr) attributes {pto.kernel} { - %c0 = arith.constant 0 : index - %c1 = arith.constant 1 : index - %c64 = arith.constant 64 : index - %c1024 = arith.constant 1024 : index - %c0_i64 = arith.constant 0 : i64 - %c1_i64 = arith.constant 1 : i64 - %c32_i64 = arith.constant 32 : i64 - %c128_i64 = arith.constant 128 : i64 - %c4096_i64 = arith.constant 4096 : i64 - %c1024_i32 = arith.constant 1024 : i32 - %cst = arith.constant 3.140000e+00 : f32 - - %ub_in = pto.castptr %c0_i64 : i64 -> !pto.ptr - %ub_out = pto.castptr %c4096_i64 : i64 -> !pto.ptr - - %false = arith.constant false - pto.mte_gm_ub %arg0, %ub_in, %c0_i64, %c128_i64 - nburst(%c32_i64, %c128_i64, 
%c128_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 - - pto.set_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] - pto.wait_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] - - pto.vecscope { - %_:1 = scf.for %offset = %c0 to %c1024 step %c64 iter_args(%remaining = %c1024_i32) -> (i32) { - %mask, %next_remaining = pto.plt_b32 %remaining : i32 -> !pto.mask, i32 - %vec = pto.vlds %ub_in[%offset] : !pto.ptr -> !pto.vreg<64xf32> - %minv = pto.vmins %vec, %cst, %mask : !pto.vreg<64xf32>, f32, !pto.mask -> !pto.vreg<64xf32> - pto.vsts %minv, %ub_out[%offset], %mask : !pto.vreg<64xf32>, !pto.ptr, !pto.mask - scf.yield %next_remaining : i32 - } - } - - pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.mte_ub_gm %ub_out, %arg1, %c128_i64 - nburst(%c32_i64, %c128_i64, %c128_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64 - pto.barrier #pto.pipe - return - } -} diff --git a/test/vpto/cases/micro-op/vec-scalar/vmins/launch.cpp b/test/vpto/cases/micro-op/vec-scalar/vmins/launch.cpp deleted file mode 100644 index 23603d652..000000000 --- a/test/vpto/cases/micro-op/vec-scalar/vmins/launch.cpp +++ /dev/null @@ -1,44 +0,0 @@ -// Copyright (c) 2026 Huawei Technologies Co., Ltd. -// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). -// Please refer to the License for details. You may not use this file except in compliance with the License. -// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -// See LICENSE in the root of the software repository for the full text of the License. 
- -#ifndef __VEC_SCOPE__ -#define __VEC_SCOPE__ -#endif - -#if defined(__CCE_AICORE__) && defined(__NPU_ARCH__) && (__NPU_ARCH__ == 2201) -typedef struct { unsigned char v; } hifloat8_t; -typedef struct { unsigned char v; } float8_e4m3_t; -typedef struct { unsigned char v; } float8_e5m2_t; -typedef struct { unsigned char v; } float8_e8m0_t; -typedef struct { unsigned char v; } float4_e1m2x2_t; -typedef struct { unsigned char v; } float4_e2m1x2_t; -#endif -#include -#if defined(__CCE_AICORE__) && defined(PTOAS_ENABLE_CCE_PRINT) -#include -#endif - -#if !defined(__CCE_AICORE__) && !defined(TMRGSORT_HPP) -struct MrgSortExecutedNumList { - uint16_t mrgSortList0; - uint16_t mrgSortList1; - uint16_t mrgSortList2; - uint16_t mrgSortList3; -}; -#endif -#ifndef __CPU_SIM -#include "acl/acl.h" -#endif - -extern "C" __global__ [aicore] void vec_min_scalar_kernel_2d(__gm__ float *v1, - __gm__ float *v2); - -void LaunchVec_min_scalar_kernel_2d(float *v1, float *v2, void *stream) { - vec_min_scalar_kernel_2d<<<1, nullptr, stream>>>((__gm__ float *)v1, - (__gm__ float *)v2); -} diff --git a/test/vpto/cases/micro-op/vec-scalar/vmins/main.cpp b/test/vpto/cases/micro-op/vec-scalar/vmins/main.cpp deleted file mode 100644 index 888a58876..000000000 --- a/test/vpto/cases/micro-op/vec-scalar/vmins/main.cpp +++ /dev/null @@ -1,83 +0,0 @@ -// Copyright (c) 2026 Huawei Technologies Co., Ltd. -// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). -// Please refer to the License for details. You may not use this file except in compliance with the License. -// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -// See LICENSE in the root of the software repository for the full text of the License. 
- -/** -Copyright (c) 2025 Huawei Technologies Co., Ltd. -*/ - -#include "test_common.h" -#include "acl/acl.h" -#include -#include - -using namespace PtoTestCommon; - -#define ACL_CHECK(expr) \ - do { \ - const aclError _ret = (expr); \ - if (_ret != ACL_SUCCESS) { \ - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", #expr, \ - (int)_ret, __FILE__, __LINE__); \ - rc = 1; \ - goto cleanup; \ - } \ - } while (0) - -void LaunchVec_min_scalar_kernel_2d(float *v1, float *v2, void *stream); - -int main() { - size_t elemCount_v1 = 1024; - size_t fileSize_v1 = elemCount_v1 * sizeof(float); - size_t elemCount_v2 = 1024; - size_t fileSize_v2 = elemCount_v2 * sizeof(float); - float *v1Host = nullptr; - float *v1Device = nullptr; - float *v2Host = nullptr; - float *v2Device = nullptr; - int rc = 0; - bool aclInited = false; - bool deviceSet = false; - int deviceId = 0; - aclrtStream stream = nullptr; - - ACL_CHECK(aclInit(nullptr)); - aclInited = true; - if (const char *envDevice = std::getenv("ACL_DEVICE_ID")) - deviceId = std::atoi(envDevice); - ACL_CHECK(aclrtSetDevice(deviceId)); - deviceSet = true; - ACL_CHECK(aclrtCreateStream(&stream)); - ACL_CHECK(aclrtMallocHost((void **)(&v1Host), fileSize_v1)); - ACL_CHECK(aclrtMallocHost((void **)(&v2Host), fileSize_v2)); - ACL_CHECK(aclrtMalloc((void **)&v1Device, fileSize_v1, ACL_MEM_MALLOC_HUGE_FIRST)); - ACL_CHECK(aclrtMalloc((void **)&v2Device, fileSize_v2, ACL_MEM_MALLOC_HUGE_FIRST)); - ReadFile("./v1.bin", fileSize_v1, v1Host, fileSize_v1); - ReadFile("./v2.bin", fileSize_v2, v2Host, fileSize_v2); - ACL_CHECK(aclrtMemcpy(v1Device, fileSize_v1, v1Host, fileSize_v1, - ACL_MEMCPY_HOST_TO_DEVICE)); - ACL_CHECK(aclrtMemcpy(v2Device, fileSize_v2, v2Host, fileSize_v2, - ACL_MEMCPY_HOST_TO_DEVICE)); - LaunchVec_min_scalar_kernel_2d(v1Device, v2Device, stream); - ACL_CHECK(aclrtSynchronizeStream(stream)); - ACL_CHECK(aclrtMemcpy(v2Host, fileSize_v2, v2Device, fileSize_v2, - ACL_MEMCPY_DEVICE_TO_HOST)); - WriteFile("./v2.bin", 
v2Host, fileSize_v2); - -cleanup: - aclrtFree(v1Device); - aclrtFree(v2Device); - aclrtFreeHost(v1Host); - aclrtFreeHost(v2Host); - if (stream != nullptr) - aclrtDestroyStream(stream); - if (deviceSet) - aclrtResetDevice(deviceId); - if (aclInited) - aclFinalize(); - return rc; -} diff --git a/test/vpto/cases/micro-op/vec-scalar/vmuls-tail/compare.py b/test/vpto/cases/micro-op/vec-scalar/vmuls-tail/compare.py deleted file mode 100644 index c13d79273..000000000 --- a/test/vpto/cases/micro-op/vec-scalar/vmuls-tail/compare.py +++ /dev/null @@ -1,39 +0,0 @@ -#!/usr/bin/python3 -# Copyright (c) 2026 Huawei Technologies Co., Ltd. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. 
- - -import os -import sys -import numpy as np - - -def compare_bin_prefix(golden_path, output_path, dtype, eps, count): - if not os.path.exists(golden_path) or not os.path.exists(output_path): - return False - golden = np.fromfile(golden_path, dtype=dtype, count=count) - output = np.fromfile(output_path, dtype=dtype, count=count) - return golden.shape == output.shape and np.allclose( - golden, output, atol=eps, rtol=eps, equal_nan=True - ) - - -def main(): - strict = os.getenv("COMPARE_STRICT", "1") != "0" - ok = compare_bin_prefix("golden_v2.bin", "v2.bin", np.float32, 1e-4, 1000) - if not ok: - if strict: - print("[ERROR] compare failed") - sys.exit(2) - print("[WARN] compare failed (non-gating)") - return - print("[INFO] compare passed") - - -if __name__ == "__main__": - main() diff --git a/test/vpto/cases/micro-op/vec-scalar/vmuls-tail/golden.py b/test/vpto/cases/micro-op/vec-scalar/vmuls-tail/golden.py deleted file mode 100644 index fdfd56fd5..000000000 --- a/test/vpto/cases/micro-op/vec-scalar/vmuls-tail/golden.py +++ /dev/null @@ -1,50 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) 2026 Huawei Technologies Co., Ltd. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. 
- -# coding=utf-8 - -import argparse -from pathlib import Path - -import numpy as np - - -ROWS = 32 -COLS = 32 -SEED = 19 -SCALE = np.float32(3.14) -LOGICAL_ELEMS = 1000 -OUT_SENTINEL = np.float32(-123.25) - - -def generate(output_dir: Path, seed: int) -> None: - rng = np.random.default_rng(seed) - v1 = rng.random((ROWS, COLS), dtype=np.float32) - v2 = np.full((ROWS, COLS), OUT_SENTINEL, dtype=np.float32) - golden_v2 = np.full((ROWS, COLS), OUT_SENTINEL, dtype=np.float32) - golden_v2.reshape(-1)[:LOGICAL_ELEMS] = ( - v1.reshape(-1)[:LOGICAL_ELEMS] * SCALE - ).astype(np.float32, copy=False) - - output_dir.mkdir(parents=True, exist_ok=True) - v1.reshape(-1).tofile(output_dir / "v1.bin") - v2.reshape(-1).tofile(output_dir / "v2.bin") - golden_v2.reshape(-1).tofile(output_dir / "golden_v2.bin") - - -def main() -> None: - parser = argparse.ArgumentParser() - parser.add_argument("--output-dir", type=Path, default=Path(".")) - parser.add_argument("--seed", type=int, default=SEED) - args = parser.parse_args() - generate(args.output_dir, args.seed) - - -if __name__ == "__main__": - main() diff --git a/test/vpto/cases/micro-op/vec-scalar/vmuls-tail/kernel.pto b/test/vpto/cases/micro-op/vec-scalar/vmuls-tail/kernel.pto deleted file mode 100644 index 92efaba1b..000000000 --- a/test/vpto/cases/micro-op/vec-scalar/vmuls-tail/kernel.pto +++ /dev/null @@ -1,44 +0,0 @@ -module attributes {pto.target_arch = "a5", pto.kernel_kind = #pto.kernel_kind} { - func.func @vmuls_tail_kernel_2d(%arg0: !pto.ptr, - %arg1: !pto.ptr) attributes {pto.kernel} { - %c0 = arith.constant 0 : index - %c64 = arith.constant 64 : index - %c1024 = arith.constant 1024 : index - %c0_i64 = arith.constant 0 : i64 - %c1_i64 = arith.constant 1 : i64 - %c32_i64 = arith.constant 32 : i64 - %c128_i64 = arith.constant 128 : i64 - %c4096_i64 = arith.constant 4096 : i64 - %c1000_i32 = arith.constant 1000 : i32 - %cst = arith.constant 3.140000e+00 : f32 - - %ub_in = pto.castptr %c0_i64 : i64 -> !pto.ptr - %ub_out = 
pto.castptr %c4096_i64 : i64 -> !pto.ptr - - %false = arith.constant false - pto.mte_gm_ub %arg0, %ub_in, %c0_i64, %c128_i64 - nburst(%c32_i64, %c128_i64, %c128_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 - - pto.set_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] - pto.wait_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] - - pto.vecscope { - %_:1 = scf.for %offset = %c0 to %c1024 step %c64 iter_args(%remaining = %c1000_i32) -> (i32) { - %mask, %next_remaining = pto.plt_b32 %remaining : i32 -> !pto.mask, i32 - %vec = pto.vlds %ub_in[%offset] : !pto.ptr -> !pto.vreg<64xf32> - %prod = pto.vmuls %vec, %cst, %mask : !pto.vreg<64xf32>, f32, !pto.mask -> !pto.vreg<64xf32> - pto.vsts %prod, %ub_out[%offset], %mask : !pto.vreg<64xf32>, !pto.ptr, !pto.mask - scf.yield %next_remaining : i32 - } - } - - pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.mte_ub_gm %ub_out, %arg1, %c128_i64 - nburst(%c32_i64, %c128_i64, %c128_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64 - pto.barrier #pto.pipe - return - } -} diff --git a/test/vpto/cases/micro-op/vec-scalar/vmuls-tail/launch.cpp b/test/vpto/cases/micro-op/vec-scalar/vmuls-tail/launch.cpp deleted file mode 100644 index 65f00d71a..000000000 --- a/test/vpto/cases/micro-op/vec-scalar/vmuls-tail/launch.cpp +++ /dev/null @@ -1,44 +0,0 @@ -// Copyright (c) 2026 Huawei Technologies Co., Ltd. -// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). -// Please refer to the License for details. You may not use this file except in compliance with the License. -// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -// See LICENSE in the root of the software repository for the full text of the License. 
- -#ifndef __VEC_SCOPE__ -#define __VEC_SCOPE__ -#endif - -#if defined(__CCE_AICORE__) && defined(__NPU_ARCH__) && (__NPU_ARCH__ == 2201) -typedef struct { unsigned char v; } hifloat8_t; -typedef struct { unsigned char v; } float8_e4m3_t; -typedef struct { unsigned char v; } float8_e5m2_t; -typedef struct { unsigned char v; } float8_e8m0_t; -typedef struct { unsigned char v; } float4_e1m2x2_t; -typedef struct { unsigned char v; } float4_e2m1x2_t; -#endif -#include -#if defined(__CCE_AICORE__) && defined(PTOAS_ENABLE_CCE_PRINT) -#include -#endif - -#if !defined(__CCE_AICORE__) && !defined(TMRGSORT_HPP) -struct MrgSortExecutedNumList { - uint16_t mrgSortList0; - uint16_t mrgSortList1; - uint16_t mrgSortList2; - uint16_t mrgSortList3; -}; -#endif -#ifndef __CPU_SIM -#include "acl/acl.h" -#endif - -extern "C" __global__ [aicore] void vmuls_tail_kernel_2d(__gm__ float *v1, - __gm__ float *v2); - -void LaunchVadds_tail_kernel_2d(float *v1, float *v2, void *stream) { - vmuls_tail_kernel_2d<<<1, nullptr, stream>>>((__gm__ float *)v1, - (__gm__ float *)v2); -} diff --git a/test/vpto/cases/micro-op/vec-scalar/vmuls-tail/main.cpp b/test/vpto/cases/micro-op/vec-scalar/vmuls-tail/main.cpp deleted file mode 100644 index ab77e6b1a..000000000 --- a/test/vpto/cases/micro-op/vec-scalar/vmuls-tail/main.cpp +++ /dev/null @@ -1,83 +0,0 @@ -// Copyright (c) 2026 Huawei Technologies Co., Ltd. -// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). -// Please refer to the License for details. You may not use this file except in compliance with the License. -// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -// See LICENSE in the root of the software repository for the full text of the License. 
- -/** -Copyright (c) 2025 Huawei Technologies Co., Ltd. -*/ - -#include "test_common.h" -#include "acl/acl.h" -#include -#include - -using namespace PtoTestCommon; - -#define ACL_CHECK(expr) \ - do { \ - const aclError _ret = (expr); \ - if (_ret != ACL_SUCCESS) { \ - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", #expr, \ - (int)_ret, __FILE__, __LINE__); \ - rc = 1; \ - goto cleanup; \ - } \ - } while (0) - -void LaunchVadds_tail_kernel_2d(float *v1, float *v2, void *stream); - -int main() { - size_t elemCount_v1 = 1024; - size_t fileSize_v1 = elemCount_v1 * sizeof(float); - size_t elemCount_v2 = 1024; - size_t fileSize_v2 = elemCount_v2 * sizeof(float); - float *v1Host = nullptr; - float *v1Device = nullptr; - float *v2Host = nullptr; - float *v2Device = nullptr; - int rc = 0; - bool aclInited = false; - bool deviceSet = false; - int deviceId = 0; - aclrtStream stream = nullptr; - - ACL_CHECK(aclInit(nullptr)); - aclInited = true; - if (const char *envDevice = std::getenv("ACL_DEVICE_ID")) - deviceId = std::atoi(envDevice); - ACL_CHECK(aclrtSetDevice(deviceId)); - deviceSet = true; - ACL_CHECK(aclrtCreateStream(&stream)); - ACL_CHECK(aclrtMallocHost((void **)(&v1Host), fileSize_v1)); - ACL_CHECK(aclrtMallocHost((void **)(&v2Host), fileSize_v2)); - ACL_CHECK(aclrtMalloc((void **)&v1Device, fileSize_v1, ACL_MEM_MALLOC_HUGE_FIRST)); - ACL_CHECK(aclrtMalloc((void **)&v2Device, fileSize_v2, ACL_MEM_MALLOC_HUGE_FIRST)); - ReadFile("./v1.bin", fileSize_v1, v1Host, fileSize_v1); - ReadFile("./v2.bin", fileSize_v2, v2Host, fileSize_v2); - ACL_CHECK(aclrtMemcpy(v1Device, fileSize_v1, v1Host, fileSize_v1, - ACL_MEMCPY_HOST_TO_DEVICE)); - ACL_CHECK(aclrtMemcpy(v2Device, fileSize_v2, v2Host, fileSize_v2, - ACL_MEMCPY_HOST_TO_DEVICE)); - LaunchVadds_tail_kernel_2d(v1Device, v2Device, stream); - ACL_CHECK(aclrtSynchronizeStream(stream)); - ACL_CHECK(aclrtMemcpy(v2Host, fileSize_v2, v2Device, fileSize_v2, - ACL_MEMCPY_DEVICE_TO_HOST)); - WriteFile("./v2.bin", v2Host, 
fileSize_v2); - -cleanup: - aclrtFree(v1Device); - aclrtFree(v2Device); - aclrtFreeHost(v1Host); - aclrtFreeHost(v2Host); - if (stream != nullptr) - aclrtDestroyStream(stream); - if (deviceSet) - aclrtResetDevice(deviceId); - if (aclInited) - aclFinalize(); - return rc; -} diff --git a/test/vpto/cases/micro-op/vec-scalar/vmuls/compare.py b/test/vpto/cases/micro-op/vec-scalar/vmuls/compare.py deleted file mode 100644 index 15b793fac..000000000 --- a/test/vpto/cases/micro-op/vec-scalar/vmuls/compare.py +++ /dev/null @@ -1,37 +0,0 @@ -#!/usr/bin/python3 -# Copyright (c) 2026 Huawei Technologies Co., Ltd. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. 
- - -import os -import sys -import numpy as np - - -def compare_bin(golden_path, output_path, dtype, eps): - if not os.path.exists(golden_path) or not os.path.exists(output_path): - return False - golden = np.fromfile(golden_path, dtype=dtype) - output = np.fromfile(output_path, dtype=dtype) - return golden.shape == output.shape and np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True) - - -def main(): - strict = os.getenv("COMPARE_STRICT", "1") != "0" - ok = compare_bin("golden_v2.bin", "v2.bin", np.float32, 1e-4) - if not ok: - if strict: - print("[ERROR] compare failed") - sys.exit(2) - print("[WARN] compare failed (non-gating)") - return - print("[INFO] compare passed") - - -if __name__ == "__main__": - main() diff --git a/test/vpto/cases/micro-op/vec-scalar/vmuls/golden.py b/test/vpto/cases/micro-op/vec-scalar/vmuls/golden.py deleted file mode 100644 index 5233be0ed..000000000 --- a/test/vpto/cases/micro-op/vec-scalar/vmuls/golden.py +++ /dev/null @@ -1,45 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) 2026 Huawei Technologies Co., Ltd. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. 
- -# coding=utf-8 - -import argparse -from pathlib import Path - -import numpy as np - - -ROWS = 32 -COLS = 32 -SEED = 19 -SCALE = np.float32(3.14) - - -def generate(output_dir: Path, seed: int) -> None: - rng = np.random.default_rng(seed) - v1 = rng.uniform(-8.0, 8.0, size=(ROWS, COLS)).astype(np.float32) - v2 = np.zeros((ROWS, COLS), dtype=np.float32) - golden_v2 = (v1 * SCALE).astype(np.float32, copy=False) - - output_dir.mkdir(parents=True, exist_ok=True) - v1.reshape(-1).tofile(output_dir / "v1.bin") - v2.reshape(-1).tofile(output_dir / "v2.bin") - golden_v2.reshape(-1).tofile(output_dir / "golden_v2.bin") - - -def main() -> None: - parser = argparse.ArgumentParser() - parser.add_argument("--output-dir", type=Path, default=Path(".")) - parser.add_argument("--seed", type=int, default=SEED) - args = parser.parse_args() - generate(args.output_dir, args.seed) - - -if __name__ == "__main__": - main() diff --git a/test/vpto/cases/micro-op/vec-scalar/vmuls/kernel.pto b/test/vpto/cases/micro-op/vec-scalar/vmuls/kernel.pto deleted file mode 100644 index 4e6461f2f..000000000 --- a/test/vpto/cases/micro-op/vec-scalar/vmuls/kernel.pto +++ /dev/null @@ -1,45 +0,0 @@ -module attributes {pto.target_arch = "a5", pto.kernel_kind = #pto.kernel_kind} { - func.func @vec_mul_scalar_kernel_2d(%arg0: !pto.ptr, - %arg1: !pto.ptr) attributes {pto.kernel} { - %c0 = arith.constant 0 : index - %c1 = arith.constant 1 : index - %c64 = arith.constant 64 : index - %c1024 = arith.constant 1024 : index - %c0_i64 = arith.constant 0 : i64 - %c1_i64 = arith.constant 1 : i64 - %c32_i64 = arith.constant 32 : i64 - %c128_i64 = arith.constant 128 : i64 - %c4096_i64 = arith.constant 4096 : i64 - %c1024_i32 = arith.constant 1024 : i32 - %cst = arith.constant 3.140000e+00 : f32 - - %ub_in = pto.castptr %c0_i64 : i64 -> !pto.ptr - %ub_out = pto.castptr %c4096_i64 : i64 -> !pto.ptr - - %false = arith.constant false - pto.mte_gm_ub %arg0, %ub_in, %c0_i64, %c128_i64 - nburst(%c32_i64, %c128_i64, %c128_i64) 
- : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 - - pto.set_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] - pto.wait_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] - - pto.vecscope { - %_:1 = scf.for %offset = %c0 to %c1024 step %c64 iter_args(%remaining = %c1024_i32) -> (i32) { - %mask, %next_remaining = pto.plt_b32 %remaining : i32 -> !pto.mask, i32 - %vec = pto.vlds %ub_in[%offset] : !pto.ptr -> !pto.vreg<64xf32> - %prod = pto.vmuls %vec, %cst, %mask : !pto.vreg<64xf32>, f32, !pto.mask -> !pto.vreg<64xf32> - pto.vsts %prod, %ub_out[%offset], %mask : !pto.vreg<64xf32>, !pto.ptr, !pto.mask - scf.yield %next_remaining : i32 - } - } - - pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.mte_ub_gm %ub_out, %arg1, %c128_i64 - nburst(%c32_i64, %c128_i64, %c128_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64 - pto.barrier #pto.pipe - return - } -} diff --git a/test/vpto/cases/micro-op/vec-scalar/vmuls/launch.cpp b/test/vpto/cases/micro-op/vec-scalar/vmuls/launch.cpp deleted file mode 100644 index 0146a24b5..000000000 --- a/test/vpto/cases/micro-op/vec-scalar/vmuls/launch.cpp +++ /dev/null @@ -1,44 +0,0 @@ -// Copyright (c) 2026 Huawei Technologies Co., Ltd. -// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). -// Please refer to the License for details. You may not use this file except in compliance with the License. -// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -// See LICENSE in the root of the software repository for the full text of the License. 
- -#ifndef __VEC_SCOPE__ -#define __VEC_SCOPE__ -#endif - -#if defined(__CCE_AICORE__) && defined(__NPU_ARCH__) && (__NPU_ARCH__ == 2201) -typedef struct { unsigned char v; } hifloat8_t; -typedef struct { unsigned char v; } float8_e4m3_t; -typedef struct { unsigned char v; } float8_e5m2_t; -typedef struct { unsigned char v; } float8_e8m0_t; -typedef struct { unsigned char v; } float4_e1m2x2_t; -typedef struct { unsigned char v; } float4_e2m1x2_t; -#endif -#include -#if defined(__CCE_AICORE__) && defined(PTOAS_ENABLE_CCE_PRINT) -#include -#endif - -#if !defined(__CCE_AICORE__) && !defined(TMRGSORT_HPP) -struct MrgSortExecutedNumList { - uint16_t mrgSortList0; - uint16_t mrgSortList1; - uint16_t mrgSortList2; - uint16_t mrgSortList3; -}; -#endif -#ifndef __CPU_SIM -#include "acl/acl.h" -#endif - -extern "C" __global__ [aicore] void vec_mul_scalar_kernel_2d(__gm__ float *v1, - __gm__ float *v2); - -void LaunchVec_mul_scalar_kernel_2d(float *v1, float *v2, void *stream) { - vec_mul_scalar_kernel_2d<<<1, nullptr, stream>>>((__gm__ float *)v1, - (__gm__ float *)v2); -} diff --git a/test/vpto/cases/micro-op/vec-scalar/vmuls/main.cpp b/test/vpto/cases/micro-op/vec-scalar/vmuls/main.cpp deleted file mode 100644 index e99b6c097..000000000 --- a/test/vpto/cases/micro-op/vec-scalar/vmuls/main.cpp +++ /dev/null @@ -1,83 +0,0 @@ -// Copyright (c) 2026 Huawei Technologies Co., Ltd. -// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). -// Please refer to the License for details. You may not use this file except in compliance with the License. -// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -// See LICENSE in the root of the software repository for the full text of the License. 
- -/** -Copyright (c) 2025 Huawei Technologies Co., Ltd. -*/ - -#include "test_common.h" -#include "acl/acl.h" -#include -#include - -using namespace PtoTestCommon; - -#define ACL_CHECK(expr) \ - do { \ - const aclError _ret = (expr); \ - if (_ret != ACL_SUCCESS) { \ - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", #expr, \ - (int)_ret, __FILE__, __LINE__); \ - rc = 1; \ - goto cleanup; \ - } \ - } while (0) - -void LaunchVec_mul_scalar_kernel_2d(float *v1, float *v2, void *stream); - -int main() { - size_t elemCount_v1 = 1024; - size_t fileSize_v1 = elemCount_v1 * sizeof(float); - size_t elemCount_v2 = 1024; - size_t fileSize_v2 = elemCount_v2 * sizeof(float); - float *v1Host = nullptr; - float *v1Device = nullptr; - float *v2Host = nullptr; - float *v2Device = nullptr; - int rc = 0; - bool aclInited = false; - bool deviceSet = false; - int deviceId = 0; - aclrtStream stream = nullptr; - - ACL_CHECK(aclInit(nullptr)); - aclInited = true; - if (const char *envDevice = std::getenv("ACL_DEVICE_ID")) - deviceId = std::atoi(envDevice); - ACL_CHECK(aclrtSetDevice(deviceId)); - deviceSet = true; - ACL_CHECK(aclrtCreateStream(&stream)); - ACL_CHECK(aclrtMallocHost((void **)(&v1Host), fileSize_v1)); - ACL_CHECK(aclrtMallocHost((void **)(&v2Host), fileSize_v2)); - ACL_CHECK(aclrtMalloc((void **)&v1Device, fileSize_v1, ACL_MEM_MALLOC_HUGE_FIRST)); - ACL_CHECK(aclrtMalloc((void **)&v2Device, fileSize_v2, ACL_MEM_MALLOC_HUGE_FIRST)); - ReadFile("./v1.bin", fileSize_v1, v1Host, fileSize_v1); - ReadFile("./v2.bin", fileSize_v2, v2Host, fileSize_v2); - ACL_CHECK(aclrtMemcpy(v1Device, fileSize_v1, v1Host, fileSize_v1, - ACL_MEMCPY_HOST_TO_DEVICE)); - ACL_CHECK(aclrtMemcpy(v2Device, fileSize_v2, v2Host, fileSize_v2, - ACL_MEMCPY_HOST_TO_DEVICE)); - LaunchVec_mul_scalar_kernel_2d(v1Device, v2Device, stream); - ACL_CHECK(aclrtSynchronizeStream(stream)); - ACL_CHECK(aclrtMemcpy(v2Host, fileSize_v2, v2Device, fileSize_v2, - ACL_MEMCPY_DEVICE_TO_HOST)); - WriteFile("./v2.bin", 
v2Host, fileSize_v2); - -cleanup: - aclrtFree(v1Device); - aclrtFree(v2Device); - aclrtFreeHost(v1Host); - aclrtFreeHost(v2Host); - if (stream != nullptr) - aclrtDestroyStream(stream); - if (deviceSet) - aclrtResetDevice(deviceId); - if (aclInited) - aclFinalize(); - return rc; -} diff --git a/test/vpto/cases/micro-op/vec-scalar/vshls-shift-boundary/compare.py b/test/vpto/cases/micro-op/vec-scalar/vshls-shift-boundary/compare.py deleted file mode 100644 index f07dd1f4c..000000000 --- a/test/vpto/cases/micro-op/vec-scalar/vshls-shift-boundary/compare.py +++ /dev/null @@ -1,41 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) 2026 Huawei Technologies Co., Ltd. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. 
- -# case: micro-op/vec-scalar/vshls-shift-boundary -# family: vec-scalar -# target_ops: pto.vshls -# scenarios: core-i16-unsigned, full-mask, scalar-operand, shift-boundary - -import os -import sys -import numpy as np - - -def compare_bin(golden_path, output_path, dtype): - if not os.path.exists(golden_path) or not os.path.exists(output_path): - return False - golden = np.fromfile(golden_path, dtype=dtype) - output = np.fromfile(output_path, dtype=dtype) - return golden.shape == output.shape and np.array_equal(golden, output) - - -def main(): - strict = os.getenv("COMPARE_STRICT", "1") != "0" - ok = compare_bin("golden_v2.bin", "v2.bin", np.uint16) - if not ok: - if strict: - print("[ERROR] compare failed") - sys.exit(2) - print("[WARN] compare failed (non-gating)") - return - print("[INFO] compare passed") - - -if __name__ == "__main__": - main() diff --git a/test/vpto/cases/micro-op/vec-scalar/vshls-shift-boundary/golden.py b/test/vpto/cases/micro-op/vec-scalar/vshls-shift-boundary/golden.py deleted file mode 100644 index e0952743a..000000000 --- a/test/vpto/cases/micro-op/vec-scalar/vshls-shift-boundary/golden.py +++ /dev/null @@ -1,51 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) 2026 Huawei Technologies Co., Ltd. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. 
- -# case: micro-op/vec-scalar/vshls-shift-boundary -# family: vec-scalar -# target_ops: pto.vshls -# scenarios: core-i16-unsigned, full-mask, scalar-operand, shift-boundary - -import argparse -from pathlib import Path - -import numpy as np - - -ELEMS = 1024 -SHIFT = 15 -PATTERN = np.array( - [0x0000, 0x0001, 0x0002, 0x0003, 0x7FFF, 0x8000, 0x8001, 0xFFFF], - dtype=np.uint16, -) - - -def generate(output_dir: Path, seed: int) -> None: - del seed - repeats = ELEMS // PATTERN.size - v1 = np.tile(PATTERN, repeats) - v2 = np.zeros(ELEMS, dtype=np.uint16) - golden_v2 = np.left_shift(v1.astype(np.uint32), SHIFT).astype(np.uint16) - - output_dir.mkdir(parents=True, exist_ok=True) - v1.tofile(output_dir / "v1.bin") - v2.tofile(output_dir / "v2.bin") - golden_v2.tofile(output_dir / "golden_v2.bin") - - -def main() -> None: - parser = argparse.ArgumentParser() - parser.add_argument("--output-dir", type=Path, default=Path(".")) - parser.add_argument("--seed", type=int, default=19) - args = parser.parse_args() - generate(args.output_dir, args.seed) - - -if __name__ == "__main__": - main() diff --git a/test/vpto/cases/micro-op/vec-scalar/vshls-shift-boundary/kernel.pto b/test/vpto/cases/micro-op/vec-scalar/vshls-shift-boundary/kernel.pto deleted file mode 100644 index e25159f5e..000000000 --- a/test/vpto/cases/micro-op/vec-scalar/vshls-shift-boundary/kernel.pto +++ /dev/null @@ -1,47 +0,0 @@ -// ----------------------------------------------------------------------------- -// case: micro-op/vec-scalar/vshls-shift-boundary -// family: vec-scalar -// target_ops: pto.vshls -// scenarios: core-i16-unsigned, full-mask, scalar-operand, shift-boundary -// ----------------------------------------------------------------------------- -module attributes {pto.target_arch = "a5", pto.kernel_kind = #pto.kernel_kind} { - func.func @vshls_shift_boundary_kernel(%arg0: !pto.ptr, - %arg1: !pto.ptr) attributes {pto.kernel} { - %c0 = arith.constant 0 : index - %c128 = arith.constant 128 : index - 
%c1024 = arith.constant 1024 : index - %c0_i64 = arith.constant 0 : i64 - %c1_i64 = arith.constant 1 : i64 - %c32_i64 = arith.constant 32 : i64 - %c64_i64 = arith.constant 64 : i64 - %c2048_i64 = arith.constant 2048 : i64 - %scalar = arith.constant 15 : i16 - %ub_in = pto.castptr %c0_i64 : i64 -> !pto.ptr - %ub_out = pto.castptr %c2048_i64 : i64 -> !pto.ptr - - %false = arith.constant false - pto.mte_gm_ub %arg0, %ub_in, %c0_i64, %c64_i64 - nburst(%c32_i64, %c64_i64, %c64_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 - - pto.set_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] - pto.wait_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] - - pto.vecscope { - %mask = pto.pset_b16 "PAT_ALL" : !pto.mask - scf.for %offset = %c0 to %c1024 step %c128 { - %vec = pto.vlds %ub_in[%offset] : !pto.ptr -> !pto.vreg<128xui16> - %shifted = pto.vshls %vec, %scalar, %mask : !pto.vreg<128xui16>, i16, !pto.mask -> !pto.vreg<128xui16> - pto.vsts %shifted, %ub_out[%offset], %mask : !pto.vreg<128xui16>, !pto.ptr, !pto.mask - } - } - - pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.mte_ub_gm %ub_out, %arg1, %c64_i64 - nburst(%c32_i64, %c64_i64, %c64_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64 - pto.barrier #pto.pipe - return - } -} diff --git a/test/vpto/cases/micro-op/vec-scalar/vshls-shift-boundary/launch.cpp b/test/vpto/cases/micro-op/vec-scalar/vshls-shift-boundary/launch.cpp deleted file mode 100644 index ee7141d19..000000000 --- a/test/vpto/cases/micro-op/vec-scalar/vshls-shift-boundary/launch.cpp +++ /dev/null @@ -1,52 +0,0 @@ -// Copyright (c) 2026 Huawei Technologies Co., Ltd. -// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). -// Please refer to the License for details. You may not use this file except in compliance with the License. 
-// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -// See LICENSE in the root of the software repository for the full text of the License. - -// ----------------------------------------------------------------------------- -// case: micro-op/vec-scalar/vshls-shift-boundary -// family: vec-scalar -// target_ops: pto.vshls -// scenarios: core-i16-unsigned, full-mask, scalar-operand -// NOTE: bulk-generated coverage skeleton. Parser/verifier/lowering failure is -// still a valid test conclusion in the current coverage-first phase. -// ----------------------------------------------------------------------------- -#ifndef __VEC_SCOPE__ -#define __VEC_SCOPE__ -#endif - -#if defined(__CCE_AICORE__) && defined(__NPU_ARCH__) && (__NPU_ARCH__ == 2201) -typedef struct { unsigned char v; } hifloat8_t; -typedef struct { unsigned char v; } float8_e4m3_t; -typedef struct { unsigned char v; } float8_e5m2_t; -typedef struct { unsigned char v; } float8_e8m0_t; -typedef struct { unsigned char v; } float4_e1m2x2_t; -typedef struct { unsigned char v; } float4_e2m1x2_t; -#endif -#include -#if defined(__CCE_AICORE__) && defined(PTOAS_ENABLE_CCE_PRINT) -#include -#endif - -#if !defined(__CCE_AICORE__) && !defined(TMRGSORT_HPP) -struct MrgSortExecutedNumList { - uint16_t mrgSortList0; - uint16_t mrgSortList1; - uint16_t mrgSortList2; - uint16_t mrgSortList3; -}; -#endif -#ifndef __CPU_SIM -#include "acl/acl.h" -#endif - -extern "C" __global__ [aicore] void vshls_shift_boundary_kernel(__gm__ float *v1, - __gm__ float *v2); - -void LaunchVshls_shift_boundary_kernel(float *v1, float *v2, void *stream) { - vshls_shift_boundary_kernel<<<1, nullptr, stream>>>((__gm__ float *)v1, - (__gm__ float *)v2); -} diff --git a/test/vpto/cases/micro-op/vec-scalar/vshls-shift-boundary/main.cpp 
b/test/vpto/cases/micro-op/vec-scalar/vshls-shift-boundary/main.cpp deleted file mode 100644 index 3b51b0c33..000000000 --- a/test/vpto/cases/micro-op/vec-scalar/vshls-shift-boundary/main.cpp +++ /dev/null @@ -1,91 +0,0 @@ -// Copyright (c) 2026 Huawei Technologies Co., Ltd. -// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). -// Please refer to the License for details. You may not use this file except in compliance with the License. -// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -// See LICENSE in the root of the software repository for the full text of the License. - -// ----------------------------------------------------------------------------- -// case: micro-op/vec-scalar/vshls-shift-boundary -// family: vec-scalar -// target_ops: pto.vshls -// scenarios: core-i16-unsigned, full-mask, scalar-operand -// NOTE: bulk-generated coverage skeleton. Parser/verifier/lowering failure is -// still a valid test conclusion in the current coverage-first phase. -// ----------------------------------------------------------------------------- -/** -Copyright (c) 2025 Huawei Technologies Co., Ltd. 
-*/ - -#include "test_common.h" -#include "acl/acl.h" -#include -#include - -using namespace PtoTestCommon; - -#define ACL_CHECK(expr) \ - do { \ - const aclError _ret = (expr); \ - if (_ret != ACL_SUCCESS) { \ - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", #expr, \ - (int)_ret, __FILE__, __LINE__); \ - rc = 1; \ - goto cleanup; \ - } \ - } while (0) - -void LaunchVshls_shift_boundary_kernel(float *v1, float *v2, void *stream); - -int main() { - size_t elemCount_v1 = 1024; - size_t fileSize_v1 = elemCount_v1 * sizeof(float); - size_t elemCount_v2 = 1024; - size_t fileSize_v2 = elemCount_v2 * sizeof(float); - float *v1Host = nullptr; - float *v1Device = nullptr; - float *v2Host = nullptr; - float *v2Device = nullptr; - int rc = 0; - bool aclInited = false; - bool deviceSet = false; - int deviceId = 0; - aclrtStream stream = nullptr; - - ACL_CHECK(aclInit(nullptr)); - aclInited = true; - if (const char *envDevice = std::getenv("ACL_DEVICE_ID")) - deviceId = std::atoi(envDevice); - ACL_CHECK(aclrtSetDevice(deviceId)); - deviceSet = true; - ACL_CHECK(aclrtCreateStream(&stream)); - ACL_CHECK(aclrtMallocHost((void **)(&v1Host), fileSize_v1)); - ACL_CHECK(aclrtMallocHost((void **)(&v2Host), fileSize_v2)); - ACL_CHECK(aclrtMalloc((void **)&v1Device, fileSize_v1, ACL_MEM_MALLOC_HUGE_FIRST)); - ACL_CHECK(aclrtMalloc((void **)&v2Device, fileSize_v2, ACL_MEM_MALLOC_HUGE_FIRST)); - ReadFile("./v1.bin", fileSize_v1, v1Host, fileSize_v1); - ReadFile("./v2.bin", fileSize_v2, v2Host, fileSize_v2); - ACL_CHECK(aclrtMemcpy(v1Device, fileSize_v1, v1Host, fileSize_v1, - ACL_MEMCPY_HOST_TO_DEVICE)); - ACL_CHECK(aclrtMemcpy(v2Device, fileSize_v2, v2Host, fileSize_v2, - ACL_MEMCPY_HOST_TO_DEVICE)); - LaunchVshls_shift_boundary_kernel(v1Device, v2Device, stream); - ACL_CHECK(aclrtSynchronizeStream(stream)); - ACL_CHECK(aclrtMemcpy(v2Host, fileSize_v2, v2Device, fileSize_v2, - ACL_MEMCPY_DEVICE_TO_HOST)); - WriteFile("./v2.bin", v2Host, fileSize_v2); - -cleanup: - 
aclrtFree(v1Device); - aclrtFree(v2Device); - aclrtFreeHost(v1Host); - aclrtFreeHost(v2Host); - if (stream != nullptr) - aclrtDestroyStream(stream); - if (deviceSet) - aclrtResetDevice(deviceId); - if (aclInited) - aclFinalize(); - return rc; -} diff --git a/test/vpto/cases/micro-op/vec-scalar/vshls/kernel.pto b/test/vpto/cases/micro-op/vec-scalar/vshls/kernel.pto index dd3ff545d..12ee8a86e 100644 --- a/test/vpto/cases/micro-op/vec-scalar/vshls/kernel.pto +++ b/test/vpto/cases/micro-op/vec-scalar/vshls/kernel.pto @@ -1,47 +1,168 @@ -// ----------------------------------------------------------------------------- -// case: micro-op/vec-scalar/vshls -// family: vec-scalar -// target_ops: pto.vshls -// scenarios: core-i16-unsigned, full-mask, scalar-operand -// ----------------------------------------------------------------------------- module attributes {pto.target_arch = "a5", pto.kernel_kind = #pto.kernel_kind} { - func.func @vshls_kernel(%arg0: !pto.ptr, - %arg1: !pto.ptr) attributes {pto.kernel} { - %c0 = arith.constant 0 : index - %c128 = arith.constant 128 : index - %c1024 = arith.constant 1024 : index - %c0_i64 = arith.constant 0 : i64 - %c1_i64 = arith.constant 1 : i64 - %c32_i64 = arith.constant 32 : i64 - %c64_i64 = arith.constant 64 : i64 - %c2048_i64 = arith.constant 2048 : i64 - %scalar = arith.constant 3 : i16 - %ub_in = pto.castptr %c0_i64 : i64 -> !pto.ptr - %ub_out = pto.castptr %c2048_i64 : i64 -> !pto.ptr - - %false = arith.constant false - pto.mte_gm_ub %arg0, %ub_in, %c0_i64, %c64_i64 - nburst(%c32_i64, %c64_i64, %c64_i64) + func.func @vshls_deep_merged_kernel(%arg0: !pto.ptr, %arg1: !pto.ptr, %arg2: !pto.ptr, %arg3: !pto.ptr) attributes {pto.kernel} { + %__deep_merge_guard = arith.constant false + // inactive merged from vshls_shift_boundary_kernel + scf.if %__deep_merge_guard { + + %c0_m0 = arith.constant 0 : index + %c128_m0 = arith.constant 128 : index + %c1024_m0 = arith.constant 1024 : index + %c0_i64_m0 = arith.constant 0 : i64 + 
%c1_i64_m0 = arith.constant 1 : i64 + %c32_i64_m0 = arith.constant 32 : i64 + %c64_i64_m0 = arith.constant 64 : i64 + %c2048_i64_m0 = arith.constant 2048 : i64 + %scalar_m0 = arith.constant 15 : i16 + %ub_in_m0 = pto.castptr %c0_i64_m0 : i64 -> !pto.ptr + %ub_out_m0 = pto.castptr %c2048_i64_m0 : i64 -> !pto.ptr + + %false_m0 = arith.constant false + pto.mte_gm_ub %arg0, %ub_in_m0, %c0_i64_m0, %c64_i64_m0 + nburst(%c32_i64_m0, %c64_i64_m0, %c64_i64_m0) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 + + pto.set_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] + pto.wait_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] + + pto.vecscope { + %mask_m0 = pto.pset_b16 "PAT_ALL" : !pto.mask + scf.for %offset_m0 = %c0_m0 to %c1024_m0 step %c128_m0 { + %vec_m0 = pto.vlds %ub_in_m0[%offset_m0] : !pto.ptr -> !pto.vreg<128xui16> + %shifted_m0 = pto.vshls %vec_m0, %scalar_m0, %mask_m0 : !pto.vreg<128xui16>, i16, !pto.mask -> !pto.vreg<128xui16> + pto.vsts %shifted_m0, %ub_out_m0[%offset_m0], %mask_m0 : !pto.vreg<128xui16>, !pto.ptr, !pto.mask + } + } + + pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] + pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] + pto.mte_ub_gm %ub_out_m0, %arg1, %c64_i64_m0 + nburst(%c32_i64_m0, %c64_i64_m0, %c64_i64_m0) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64 + pto.barrier #pto.pipe + + } + // active merged from vshls_kernel + + %c0_m1 = arith.constant 0 : index + %c128_m1 = arith.constant 128 : index + %c1024_m1 = arith.constant 1024 : index + %c0_i64_m1 = arith.constant 0 : i64 + %c1_i64_m1 = arith.constant 1 : i64 + %c32_i64_m1 = arith.constant 32 : i64 + %c64_i64_m1 = arith.constant 64 : i64 + %c2048_i64_m1 = arith.constant 2048 : i64 + %scalar_m1 = arith.constant 3 : i16 + %ub_in_m1 = pto.castptr %c0_i64_m1 : i64 -> !pto.ptr + %ub_out_m1 = pto.castptr %c2048_i64_m1 : i64 -> !pto.ptr + + %false_m1 = arith.constant false + pto.mte_gm_ub %arg2, %ub_in_m1, %c0_i64_m1, %c64_i64_m1 + nburst(%c32_i64_m1, %c64_i64_m1, %c64_i64_m1) : !pto.ptr, !pto.ptr, i64, i64, i64, 
i64, i64 pto.set_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] pto.wait_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] pto.vecscope { - %mask = pto.pset_b16 "PAT_ALL" : !pto.mask - scf.for %offset = %c0 to %c1024 step %c128 { - %vec = pto.vlds %ub_in[%offset] : !pto.ptr -> !pto.vreg<128xui16> - %shifted = pto.vshls %vec, %scalar, %mask : !pto.vreg<128xui16>, i16, !pto.mask -> !pto.vreg<128xui16> - pto.vsts %shifted, %ub_out[%offset], %mask : !pto.vreg<128xui16>, !pto.ptr, !pto.mask + %mask_m1 = pto.pset_b16 "PAT_ALL" : !pto.mask + scf.for %offset_m1 = %c0_m1 to %c1024_m1 step %c128_m1 { + %vec_m1 = pto.vlds %ub_in_m1[%offset_m1] : !pto.ptr -> !pto.vreg<128xui16> + %shifted_m1 = pto.vshls %vec_m1, %scalar_m1, %mask_m1 : !pto.vreg<128xui16>, i16, !pto.mask -> !pto.vreg<128xui16> + pto.vsts %shifted_m1, %ub_out_m1[%offset_m1], %mask_m1 : !pto.vreg<128xui16>, !pto.ptr, !pto.mask } } pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.mte_ub_gm %ub_out, %arg1, %c64_i64 - nburst(%c32_i64, %c64_i64, %c64_i64) + pto.mte_ub_gm %ub_out_m1, %arg3, %c64_i64_m1 + nburst(%c32_i64_m1, %c64_i64_m1, %c64_i64_m1) : !pto.ptr, !pto.ptr, i64, i64, i64, i64 pto.barrier #pto.pipe + %__case_merge_guard = arith.constant false + + // inactive cross-case merge from micro-op/vec-scalar/vshrs + scf.if %__case_merge_guard { + + %__deep_merge_guard_cmg19_1 = arith.constant false + // inactive merged from vshrs_shift_boundary_kernel + scf.if %__deep_merge_guard_cmg19_1 { + + %c0_m0_cmg19_1 = arith.constant 0 : index + %c128_m0_cmg19_1 = arith.constant 128 : index + %c1024_m0_cmg19_1 = arith.constant 1024 : index + %c0_i64_m0_cmg19_1 = arith.constant 0 : i64 + %c1_i64_m0_cmg19_1 = arith.constant 1 : i64 + %c32_i64_m0_cmg19_1 = arith.constant 32 : i64 + %c64_i64_m0_cmg19_1 = arith.constant 64 : i64 + %c2048_i64_m0_cmg19_1 = arith.constant 2048 : i64 + %scalar_m0_cmg19_1 = arith.constant 15 : i16 + %ub_in_m0_cmg19_1 = pto.castptr %c0_i64_m0_cmg19_1 : i64 -> 
!pto.ptr + %ub_out_m0_cmg19_1 = pto.castptr %c2048_i64_m0_cmg19_1 : i64 -> !pto.ptr + + %false_m0_cmg19_1 = arith.constant false + pto.mte_gm_ub %arg0, %ub_in_m0_cmg19_1, %c0_i64_m0_cmg19_1, %c64_i64_m0_cmg19_1 + nburst(%c32_i64_m0_cmg19_1, %c64_i64_m0_cmg19_1, %c64_i64_m0_cmg19_1) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 + + pto.set_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] + pto.wait_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] + + pto.vecscope { + %mask_m0_cmg19_1 = pto.pset_b16 "PAT_ALL" : !pto.mask + scf.for %offset_m0_cmg19_1 = %c0_m0_cmg19_1 to %c1024_m0_cmg19_1 step %c128_m0_cmg19_1 { + %vec_m0_cmg19_1 = pto.vlds %ub_in_m0_cmg19_1[%offset_m0_cmg19_1] : !pto.ptr -> !pto.vreg<128xui16> + %shifted_m0_cmg19_1 = pto.vshrs %vec_m0_cmg19_1, %scalar_m0_cmg19_1, %mask_m0_cmg19_1 : !pto.vreg<128xui16>, i16, !pto.mask -> !pto.vreg<128xui16> + pto.vsts %shifted_m0_cmg19_1, %ub_out_m0_cmg19_1[%offset_m0_cmg19_1], %mask_m0_cmg19_1 : !pto.vreg<128xui16>, !pto.ptr, !pto.mask + } + } + + pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] + pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] + pto.mte_ub_gm %ub_out_m0_cmg19_1, %arg1, %c64_i64_m0_cmg19_1 + nburst(%c32_i64_m0_cmg19_1, %c64_i64_m0_cmg19_1, %c64_i64_m0_cmg19_1) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64 + pto.barrier #pto.pipe + + } + // active merged from vshrs_kernel + + %c0_m1_cmg19_1 = arith.constant 0 : index + %c128_m1_cmg19_1 = arith.constant 128 : index + %c1024_m1_cmg19_1 = arith.constant 1024 : index + %c0_i64_m1_cmg19_1 = arith.constant 0 : i64 + %c1_i64_m1_cmg19_1 = arith.constant 1 : i64 + %c32_i64_m1_cmg19_1 = arith.constant 32 : i64 + %c64_i64_m1_cmg19_1 = arith.constant 64 : i64 + %c2048_i64_m1_cmg19_1 = arith.constant 2048 : i64 + %scalar_m1_cmg19_1 = arith.constant 3 : i16 + %ub_in_m1_cmg19_1 = pto.castptr %c0_i64_m1_cmg19_1 : i64 -> !pto.ptr + %ub_out_m1_cmg19_1 = pto.castptr %c2048_i64_m1_cmg19_1 : i64 -> !pto.ptr + + %false_m1_cmg19_1 = arith.constant false + pto.mte_gm_ub %arg2, 
%ub_in_m1_cmg19_1, %c0_i64_m1_cmg19_1, %c64_i64_m1_cmg19_1 + nburst(%c32_i64_m1_cmg19_1, %c64_i64_m1_cmg19_1, %c64_i64_m1_cmg19_1) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 + + pto.set_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] + pto.wait_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] + + pto.vecscope { + %mask_m1_cmg19_1 = pto.pset_b16 "PAT_ALL" : !pto.mask + scf.for %offset_m1_cmg19_1 = %c0_m1_cmg19_1 to %c1024_m1_cmg19_1 step %c128_m1_cmg19_1 { + %vec_m1_cmg19_1 = pto.vlds %ub_in_m1_cmg19_1[%offset_m1_cmg19_1] : !pto.ptr -> !pto.vreg<128xui16> + %shifted_m1_cmg19_1 = pto.vshrs %vec_m1_cmg19_1, %scalar_m1_cmg19_1, %mask_m1_cmg19_1 : !pto.vreg<128xui16>, i16, !pto.mask -> !pto.vreg<128xui16> + pto.vsts %shifted_m1_cmg19_1, %ub_out_m1_cmg19_1[%offset_m1_cmg19_1], %mask_m1_cmg19_1 : !pto.vreg<128xui16>, !pto.ptr, !pto.mask + } + } + + pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] + pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] + pto.mte_ub_gm %ub_out_m1_cmg19_1, %arg3, %c64_i64_m1_cmg19_1 + nburst(%c32_i64_m1_cmg19_1, %c64_i64_m1_cmg19_1, %c64_i64_m1_cmg19_1) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64 + pto.barrier #pto.pipe + } + return } } diff --git a/test/vpto/cases/micro-op/vec-scalar/vshls/launch.cpp b/test/vpto/cases/micro-op/vec-scalar/vshls/launch.cpp index ed048246e..4427374e1 100644 --- a/test/vpto/cases/micro-op/vec-scalar/vshls/launch.cpp +++ b/test/vpto/cases/micro-op/vec-scalar/vshls/launch.cpp @@ -5,19 +5,9 @@ // THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, // INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. // See LICENSE in the root of the software repository for the full text of the License. 
- -// ----------------------------------------------------------------------------- -// case: micro-op/vec-scalar/vshls -// family: vec-scalar -// target_ops: pto.vshls -// scenarios: core-i16-unsigned, full-mask, scalar-operand -// NOTE: bulk-generated coverage skeleton. Parser/verifier/lowering failure is -// still a valid test conclusion in the current coverage-first phase. -// ----------------------------------------------------------------------------- #ifndef __VEC_SCOPE__ #define __VEC_SCOPE__ #endif - #if defined(__CCE_AICORE__) && defined(__NPU_ARCH__) && (__NPU_ARCH__ == 2201) typedef struct { unsigned char v; } hifloat8_t; typedef struct { unsigned char v; } float8_e4m3_t; @@ -30,22 +20,23 @@ typedef struct { unsigned char v; } float4_e2m1x2_t; #if defined(__CCE_AICORE__) && defined(PTOAS_ENABLE_CCE_PRINT) #include #endif - #if !defined(__CCE_AICORE__) && !defined(TMRGSORT_HPP) -struct MrgSortExecutedNumList { - uint16_t mrgSortList0; - uint16_t mrgSortList1; - uint16_t mrgSortList2; - uint16_t mrgSortList3; -}; +struct MrgSortExecutedNumList { uint16_t mrgSortList0,mrgSortList1,mrgSortList2,mrgSortList3; }; #endif #ifndef __CPU_SIM #include "acl/acl.h" #endif -extern "C" __global__ [aicore] void vshls_kernel(__gm__ float *v1, - __gm__ float *v2); +extern "C" __global__ [aicore] void vshls_deep_merged_kernel( + __gm__ uint16_t * arg0, + __gm__ uint16_t * arg1, + __gm__ uint16_t * arg2, + __gm__ uint16_t * arg3); -void LaunchVshls_kernel(float *v1, float *v2, void *stream) { - vshls_kernel<<<1, nullptr, stream>>>((__gm__ float *)v1, (__gm__ float *)v2); +void LaunchVshlsDeepMerged(float * p0, float * p1, void *stream) { + vshls_deep_merged_kernel<<<1, nullptr, stream>>>( + (__gm__ uint16_t *)p0, + (__gm__ uint16_t *)p0, + (__gm__ uint16_t *)p0, + (__gm__ uint16_t *)p1); } diff --git a/test/vpto/cases/micro-op/vec-scalar/vshls/main.cpp b/test/vpto/cases/micro-op/vec-scalar/vshls/main.cpp index f5cec4212..96a0d09d9 100644 --- 
a/test/vpto/cases/micro-op/vec-scalar/vshls/main.cpp +++ b/test/vpto/cases/micro-op/vec-scalar/vshls/main.cpp @@ -36,8 +36,8 @@ using namespace PtoTestCommon; } \ } while (0) -void LaunchVshls_kernel(float *v1, float *v2, void *stream); +void LaunchVshlsDeepMerged(float * p0, float * p1, void *stream); int main() { size_t elemCount_v1 = 1024; size_t fileSize_v1 = elemCount_v1 * sizeof(float); @@ -70,7 +70,7 @@ int main() { ACL_MEMCPY_HOST_TO_DEVICE)); ACL_CHECK(aclrtMemcpy(v2Device, fileSize_v2, v2Host, fileSize_v2, ACL_MEMCPY_HOST_TO_DEVICE)); - LaunchVshls_kernel(v1Device, v2Device, stream); + LaunchVshlsDeepMerged(v1Device, v2Device, stream); ACL_CHECK(aclrtSynchronizeStream(stream)); ACL_CHECK(aclrtMemcpy(v2Host, fileSize_v2, v2Device, fileSize_v2, ACL_MEMCPY_DEVICE_TO_HOST)); diff --git a/test/vpto/cases/micro-op/vec-scalar/vshrs-shift-boundary/compare.py b/test/vpto/cases/micro-op/vec-scalar/vshrs-shift-boundary/compare.py deleted file mode 100644 index 65d6e8920..000000000 --- a/test/vpto/cases/micro-op/vec-scalar/vshrs-shift-boundary/compare.py +++ /dev/null @@ -1,41 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) 2026 Huawei Technologies Co., Ltd. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. 
- -# case: micro-op/vec-scalar/vshrs-shift-boundary -# family: vec-scalar -# target_ops: pto.vshrs -# scenarios: core-i16-unsigned, full-mask, scalar-operand, shift-boundary - -import os -import sys -import numpy as np - - -def compare_bin(golden_path, output_path, dtype): - if not os.path.exists(golden_path) or not os.path.exists(output_path): - return False - golden = np.fromfile(golden_path, dtype=dtype) - output = np.fromfile(output_path, dtype=dtype) - return golden.shape == output.shape and np.array_equal(golden, output) - - -def main(): - strict = os.getenv("COMPARE_STRICT", "1") != "0" - ok = compare_bin("golden_v2.bin", "v2.bin", np.uint16) - if not ok: - if strict: - print("[ERROR] compare failed") - sys.exit(2) - print("[WARN] compare failed (non-gating)") - return - print("[INFO] compare passed") - - -if __name__ == "__main__": - main() diff --git a/test/vpto/cases/micro-op/vec-scalar/vshrs-shift-boundary/golden.py b/test/vpto/cases/micro-op/vec-scalar/vshrs-shift-boundary/golden.py deleted file mode 100644 index c1f36dae0..000000000 --- a/test/vpto/cases/micro-op/vec-scalar/vshrs-shift-boundary/golden.py +++ /dev/null @@ -1,51 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) 2026 Huawei Technologies Co., Ltd. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. 
- -# case: micro-op/vec-scalar/vshrs-shift-boundary -# family: vec-scalar -# target_ops: pto.vshrs -# scenarios: core-i16-unsigned, full-mask, scalar-operand, shift-boundary - -import argparse -from pathlib import Path - -import numpy as np - - -ELEMS = 1024 -SHIFT = 15 -PATTERN = np.array( - [0x0000, 0x0001, 0x0002, 0x0003, 0x7FFF, 0x8000, 0x8001, 0xFFFF], - dtype=np.uint16, -) - - -def generate(output_dir: Path, seed: int) -> None: - del seed - repeats = ELEMS // PATTERN.size - v1 = np.tile(PATTERN, repeats) - v2 = np.zeros(ELEMS, dtype=np.uint16) - golden_v2 = np.right_shift(v1, SHIFT) - - output_dir.mkdir(parents=True, exist_ok=True) - v1.tofile(output_dir / "v1.bin") - v2.tofile(output_dir / "v2.bin") - golden_v2.tofile(output_dir / "golden_v2.bin") - - -def main() -> None: - parser = argparse.ArgumentParser() - parser.add_argument("--output-dir", type=Path, default=Path(".")) - parser.add_argument("--seed", type=int, default=19) - args = parser.parse_args() - generate(args.output_dir, args.seed) - - -if __name__ == "__main__": - main() diff --git a/test/vpto/cases/micro-op/vec-scalar/vshrs-shift-boundary/kernel.pto b/test/vpto/cases/micro-op/vec-scalar/vshrs-shift-boundary/kernel.pto deleted file mode 100644 index c122287fc..000000000 --- a/test/vpto/cases/micro-op/vec-scalar/vshrs-shift-boundary/kernel.pto +++ /dev/null @@ -1,47 +0,0 @@ -// ----------------------------------------------------------------------------- -// case: micro-op/vec-scalar/vshrs-shift-boundary -// family: vec-scalar -// target_ops: pto.vshrs -// scenarios: core-i16-unsigned, full-mask, scalar-operand, shift-boundary -// ----------------------------------------------------------------------------- -module attributes {pto.target_arch = "a5", pto.kernel_kind = #pto.kernel_kind} { - func.func @vshrs_shift_boundary_kernel(%arg0: !pto.ptr, - %arg1: !pto.ptr) attributes {pto.kernel} { - %c0 = arith.constant 0 : index - %c128 = arith.constant 128 : index - %c1024 = arith.constant 1024 : index 
- %c0_i64 = arith.constant 0 : i64 - %c1_i64 = arith.constant 1 : i64 - %c32_i64 = arith.constant 32 : i64 - %c64_i64 = arith.constant 64 : i64 - %c2048_i64 = arith.constant 2048 : i64 - %scalar = arith.constant 15 : i16 - %ub_in = pto.castptr %c0_i64 : i64 -> !pto.ptr - %ub_out = pto.castptr %c2048_i64 : i64 -> !pto.ptr - - %false = arith.constant false - pto.mte_gm_ub %arg0, %ub_in, %c0_i64, %c64_i64 - nburst(%c32_i64, %c64_i64, %c64_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 - - pto.set_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] - pto.wait_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] - - pto.vecscope { - %mask = pto.pset_b16 "PAT_ALL" : !pto.mask - scf.for %offset = %c0 to %c1024 step %c128 { - %vec = pto.vlds %ub_in[%offset] : !pto.ptr -> !pto.vreg<128xui16> - %shifted = pto.vshrs %vec, %scalar, %mask : !pto.vreg<128xui16>, i16, !pto.mask -> !pto.vreg<128xui16> - pto.vsts %shifted, %ub_out[%offset], %mask : !pto.vreg<128xui16>, !pto.ptr, !pto.mask - } - } - - pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.mte_ub_gm %ub_out, %arg1, %c64_i64 - nburst(%c32_i64, %c64_i64, %c64_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64 - pto.barrier #pto.pipe - return - } -} diff --git a/test/vpto/cases/micro-op/vec-scalar/vshrs-shift-boundary/launch.cpp b/test/vpto/cases/micro-op/vec-scalar/vshrs-shift-boundary/launch.cpp deleted file mode 100644 index b108e4ba5..000000000 --- a/test/vpto/cases/micro-op/vec-scalar/vshrs-shift-boundary/launch.cpp +++ /dev/null @@ -1,52 +0,0 @@ -// Copyright (c) 2026 Huawei Technologies Co., Ltd. -// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). -// Please refer to the License for details. You may not use this file except in compliance with the License. 
-// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -// See LICENSE in the root of the software repository for the full text of the License. - -// ----------------------------------------------------------------------------- -// case: micro-op/vec-scalar/vshrs-shift-boundary -// family: vec-scalar -// target_ops: pto.vshrs -// scenarios: core-i16-unsigned, full-mask, scalar-operand -// NOTE: bulk-generated coverage skeleton. Parser/verifier/lowering failure is -// still a valid test conclusion in the current coverage-first phase. -// ----------------------------------------------------------------------------- -#ifndef __VEC_SCOPE__ -#define __VEC_SCOPE__ -#endif - -#if defined(__CCE_AICORE__) && defined(__NPU_ARCH__) && (__NPU_ARCH__ == 2201) -typedef struct { unsigned char v; } hifloat8_t; -typedef struct { unsigned char v; } float8_e4m3_t; -typedef struct { unsigned char v; } float8_e5m2_t; -typedef struct { unsigned char v; } float8_e8m0_t; -typedef struct { unsigned char v; } float4_e1m2x2_t; -typedef struct { unsigned char v; } float4_e2m1x2_t; -#endif -#include -#if defined(__CCE_AICORE__) && defined(PTOAS_ENABLE_CCE_PRINT) -#include -#endif - -#if !defined(__CCE_AICORE__) && !defined(TMRGSORT_HPP) -struct MrgSortExecutedNumList { - uint16_t mrgSortList0; - uint16_t mrgSortList1; - uint16_t mrgSortList2; - uint16_t mrgSortList3; -}; -#endif -#ifndef __CPU_SIM -#include "acl/acl.h" -#endif - -extern "C" __global__ [aicore] void vshrs_shift_boundary_kernel(__gm__ float *v1, - __gm__ float *v2); - -void LaunchVshrs_shift_boundary_kernel(float *v1, float *v2, void *stream) { - vshrs_shift_boundary_kernel<<<1, nullptr, stream>>>((__gm__ float *)v1, - (__gm__ float *)v2); -} diff --git a/test/vpto/cases/micro-op/vec-scalar/vshrs-shift-boundary/main.cpp 
b/test/vpto/cases/micro-op/vec-scalar/vshrs-shift-boundary/main.cpp deleted file mode 100644 index 4f5611378..000000000 --- a/test/vpto/cases/micro-op/vec-scalar/vshrs-shift-boundary/main.cpp +++ /dev/null @@ -1,91 +0,0 @@ -// Copyright (c) 2026 Huawei Technologies Co., Ltd. -// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). -// Please refer to the License for details. You may not use this file except in compliance with the License. -// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -// See LICENSE in the root of the software repository for the full text of the License. - -// ----------------------------------------------------------------------------- -// case: micro-op/vec-scalar/vshrs-shift-boundary -// family: vec-scalar -// target_ops: pto.vshrs -// scenarios: core-i16-unsigned, full-mask, scalar-operand -// NOTE: bulk-generated coverage skeleton. Parser/verifier/lowering failure is -// still a valid test conclusion in the current coverage-first phase. -// ----------------------------------------------------------------------------- -/** -Copyright (c) 2025 Huawei Technologies Co., Ltd. 
-*/ - -#include "test_common.h" -#include "acl/acl.h" -#include -#include - -using namespace PtoTestCommon; - -#define ACL_CHECK(expr) \ - do { \ - const aclError _ret = (expr); \ - if (_ret != ACL_SUCCESS) { \ - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", #expr, \ - (int)_ret, __FILE__, __LINE__); \ - rc = 1; \ - goto cleanup; \ - } \ - } while (0) - -void LaunchVshrs_shift_boundary_kernel(float *v1, float *v2, void *stream); - -int main() { - size_t elemCount_v1 = 1024; - size_t fileSize_v1 = elemCount_v1 * sizeof(float); - size_t elemCount_v2 = 1024; - size_t fileSize_v2 = elemCount_v2 * sizeof(float); - float *v1Host = nullptr; - float *v1Device = nullptr; - float *v2Host = nullptr; - float *v2Device = nullptr; - int rc = 0; - bool aclInited = false; - bool deviceSet = false; - int deviceId = 0; - aclrtStream stream = nullptr; - - ACL_CHECK(aclInit(nullptr)); - aclInited = true; - if (const char *envDevice = std::getenv("ACL_DEVICE_ID")) - deviceId = std::atoi(envDevice); - ACL_CHECK(aclrtSetDevice(deviceId)); - deviceSet = true; - ACL_CHECK(aclrtCreateStream(&stream)); - ACL_CHECK(aclrtMallocHost((void **)(&v1Host), fileSize_v1)); - ACL_CHECK(aclrtMallocHost((void **)(&v2Host), fileSize_v2)); - ACL_CHECK(aclrtMalloc((void **)&v1Device, fileSize_v1, ACL_MEM_MALLOC_HUGE_FIRST)); - ACL_CHECK(aclrtMalloc((void **)&v2Device, fileSize_v2, ACL_MEM_MALLOC_HUGE_FIRST)); - ReadFile("./v1.bin", fileSize_v1, v1Host, fileSize_v1); - ReadFile("./v2.bin", fileSize_v2, v2Host, fileSize_v2); - ACL_CHECK(aclrtMemcpy(v1Device, fileSize_v1, v1Host, fileSize_v1, - ACL_MEMCPY_HOST_TO_DEVICE)); - ACL_CHECK(aclrtMemcpy(v2Device, fileSize_v2, v2Host, fileSize_v2, - ACL_MEMCPY_HOST_TO_DEVICE)); - LaunchVshrs_shift_boundary_kernel(v1Device, v2Device, stream); - ACL_CHECK(aclrtSynchronizeStream(stream)); - ACL_CHECK(aclrtMemcpy(v2Host, fileSize_v2, v2Device, fileSize_v2, - ACL_MEMCPY_DEVICE_TO_HOST)); - WriteFile("./v2.bin", v2Host, fileSize_v2); - -cleanup: - 
aclrtFree(v1Device); - aclrtFree(v2Device); - aclrtFreeHost(v1Host); - aclrtFreeHost(v2Host); - if (stream != nullptr) - aclrtDestroyStream(stream); - if (deviceSet) - aclrtResetDevice(deviceId); - if (aclInited) - aclFinalize(); - return rc; -} diff --git a/test/vpto/cases/micro-op/vec-scalar/vshrs/compare.py b/test/vpto/cases/micro-op/vec-scalar/vshrs/compare.py deleted file mode 100644 index 3c2384aff..000000000 --- a/test/vpto/cases/micro-op/vec-scalar/vshrs/compare.py +++ /dev/null @@ -1,41 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) 2026 Huawei Technologies Co., Ltd. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. 
- -# case: micro-op/vec-scalar/vshrs -# family: vec-scalar -# target_ops: pto.vshrs -# scenarios: core-i16-unsigned, full-mask, scalar-operand - -import os -import sys -import numpy as np - - -def compare_bin(golden_path, output_path, dtype): - if not os.path.exists(golden_path) or not os.path.exists(output_path): - return False - golden = np.fromfile(golden_path, dtype=dtype) - output = np.fromfile(output_path, dtype=dtype) - return golden.shape == output.shape and np.array_equal(golden, output) - - -def main(): - strict = os.getenv("COMPARE_STRICT", "1") != "0" - ok = compare_bin("golden_v2.bin", "v2.bin", np.uint16) - if not ok: - if strict: - print("[ERROR] compare failed") - sys.exit(2) - print("[WARN] compare failed (non-gating)") - return - print("[INFO] compare passed") - - -if __name__ == "__main__": - main() diff --git a/test/vpto/cases/micro-op/vec-scalar/vshrs/golden.py b/test/vpto/cases/micro-op/vec-scalar/vshrs/golden.py deleted file mode 100644 index 82b2a9e07..000000000 --- a/test/vpto/cases/micro-op/vec-scalar/vshrs/golden.py +++ /dev/null @@ -1,47 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) 2026 Huawei Technologies Co., Ltd. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. 
- -# case: micro-op/vec-scalar/vshrs -# family: vec-scalar -# target_ops: pto.vshrs -# scenarios: core-i16-unsigned, full-mask, scalar-operand - -import argparse -from pathlib import Path - -import numpy as np - - -ELEMS = 1024 -SEED = 19 -SHIFT = 3 - - -def generate(output_dir: Path, seed: int) -> None: - rng = np.random.default_rng(seed) - v1 = rng.integers(0, np.iinfo(np.uint16).max + 1, size=ELEMS, dtype=np.uint16) - v2 = np.zeros(ELEMS, dtype=np.uint16) - golden_v2 = np.right_shift(v1, SHIFT) - - output_dir.mkdir(parents=True, exist_ok=True) - v1.tofile(output_dir / "v1.bin") - v2.tofile(output_dir / "v2.bin") - golden_v2.tofile(output_dir / "golden_v2.bin") - - -def main() -> None: - parser = argparse.ArgumentParser() - parser.add_argument("--output-dir", type=Path, default=Path(".")) - parser.add_argument("--seed", type=int, default=SEED) - args = parser.parse_args() - generate(args.output_dir, args.seed) - - -if __name__ == "__main__": - main() diff --git a/test/vpto/cases/micro-op/vec-scalar/vshrs/kernel.pto b/test/vpto/cases/micro-op/vec-scalar/vshrs/kernel.pto deleted file mode 100644 index 0c01f3047..000000000 --- a/test/vpto/cases/micro-op/vec-scalar/vshrs/kernel.pto +++ /dev/null @@ -1,47 +0,0 @@ -// ----------------------------------------------------------------------------- -// case: micro-op/vec-scalar/vshrs -// family: vec-scalar -// target_ops: pto.vshrs -// scenarios: core-i16-unsigned, full-mask, scalar-operand -// ----------------------------------------------------------------------------- -module attributes {pto.target_arch = "a5", pto.kernel_kind = #pto.kernel_kind} { - func.func @vshrs_kernel(%arg0: !pto.ptr, - %arg1: !pto.ptr) attributes {pto.kernel} { - %c0 = arith.constant 0 : index - %c128 = arith.constant 128 : index - %c1024 = arith.constant 1024 : index - %c0_i64 = arith.constant 0 : i64 - %c1_i64 = arith.constant 1 : i64 - %c32_i64 = arith.constant 32 : i64 - %c64_i64 = arith.constant 64 : i64 - %c2048_i64 = arith.constant 2048 : 
i64 - %scalar = arith.constant 3 : i16 - %ub_in = pto.castptr %c0_i64 : i64 -> !pto.ptr - %ub_out = pto.castptr %c2048_i64 : i64 -> !pto.ptr - - %false = arith.constant false - pto.mte_gm_ub %arg0, %ub_in, %c0_i64, %c64_i64 - nburst(%c32_i64, %c64_i64, %c64_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 - - pto.set_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] - pto.wait_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] - - pto.vecscope { - %mask = pto.pset_b16 "PAT_ALL" : !pto.mask - scf.for %offset = %c0 to %c1024 step %c128 { - %vec = pto.vlds %ub_in[%offset] : !pto.ptr -> !pto.vreg<128xui16> - %shifted = pto.vshrs %vec, %scalar, %mask : !pto.vreg<128xui16>, i16, !pto.mask -> !pto.vreg<128xui16> - pto.vsts %shifted, %ub_out[%offset], %mask : !pto.vreg<128xui16>, !pto.ptr, !pto.mask - } - } - - pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.mte_ub_gm %ub_out, %arg1, %c64_i64 - nburst(%c32_i64, %c64_i64, %c64_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64 - pto.barrier #pto.pipe - return - } -} diff --git a/test/vpto/cases/micro-op/vec-scalar/vshrs/launch.cpp b/test/vpto/cases/micro-op/vec-scalar/vshrs/launch.cpp deleted file mode 100644 index ebf9902d1..000000000 --- a/test/vpto/cases/micro-op/vec-scalar/vshrs/launch.cpp +++ /dev/null @@ -1,51 +0,0 @@ -// Copyright (c) 2026 Huawei Technologies Co., Ltd. -// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). -// Please refer to the License for details. You may not use this file except in compliance with the License. -// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -// See LICENSE in the root of the software repository for the full text of the License. 
- -// ----------------------------------------------------------------------------- -// case: micro-op/vec-scalar/vshrs -// family: vec-scalar -// target_ops: pto.vshrs -// scenarios: core-i16-unsigned, full-mask, scalar-operand -// NOTE: bulk-generated coverage skeleton. Parser/verifier/lowering failure is -// still a valid test conclusion in the current coverage-first phase. -// ----------------------------------------------------------------------------- -#ifndef __VEC_SCOPE__ -#define __VEC_SCOPE__ -#endif - -#if defined(__CCE_AICORE__) && defined(__NPU_ARCH__) && (__NPU_ARCH__ == 2201) -typedef struct { unsigned char v; } hifloat8_t; -typedef struct { unsigned char v; } float8_e4m3_t; -typedef struct { unsigned char v; } float8_e5m2_t; -typedef struct { unsigned char v; } float8_e8m0_t; -typedef struct { unsigned char v; } float4_e1m2x2_t; -typedef struct { unsigned char v; } float4_e2m1x2_t; -#endif -#include -#if defined(__CCE_AICORE__) && defined(PTOAS_ENABLE_CCE_PRINT) -#include -#endif - -#if !defined(__CCE_AICORE__) && !defined(TMRGSORT_HPP) -struct MrgSortExecutedNumList { - uint16_t mrgSortList0; - uint16_t mrgSortList1; - uint16_t mrgSortList2; - uint16_t mrgSortList3; -}; -#endif -#ifndef __CPU_SIM -#include "acl/acl.h" -#endif - -extern "C" __global__ [aicore] void vshrs_kernel(__gm__ float *v1, - __gm__ float *v2); - -void LaunchVshrs_kernel(float *v1, float *v2, void *stream) { - vshrs_kernel<<<1, nullptr, stream>>>((__gm__ float *)v1, (__gm__ float *)v2); -} diff --git a/test/vpto/cases/micro-op/vec-scalar/vshrs/main.cpp b/test/vpto/cases/micro-op/vec-scalar/vshrs/main.cpp deleted file mode 100644 index 81790da59..000000000 --- a/test/vpto/cases/micro-op/vec-scalar/vshrs/main.cpp +++ /dev/null @@ -1,91 +0,0 @@ -// Copyright (c) 2026 Huawei Technologies Co., Ltd. -// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). 
-// Please refer to the License for details. You may not use this file except in compliance with the License. -// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -// See LICENSE in the root of the software repository for the full text of the License. - -// ----------------------------------------------------------------------------- -// case: micro-op/vec-scalar/vshrs -// family: vec-scalar -// target_ops: pto.vshrs -// scenarios: core-i16-unsigned, full-mask, scalar-operand -// NOTE: bulk-generated coverage skeleton. Parser/verifier/lowering failure is -// still a valid test conclusion in the current coverage-first phase. -// ----------------------------------------------------------------------------- -/** -Copyright (c) 2025 Huawei Technologies Co., Ltd. -*/ - -#include "test_common.h" -#include "acl/acl.h" -#include -#include - -using namespace PtoTestCommon; - -#define ACL_CHECK(expr) \ - do { \ - const aclError _ret = (expr); \ - if (_ret != ACL_SUCCESS) { \ - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", #expr, \ - (int)_ret, __FILE__, __LINE__); \ - rc = 1; \ - goto cleanup; \ - } \ - } while (0) - -void LaunchVshrs_kernel(float *v1, float *v2, void *stream); - -int main() { - size_t elemCount_v1 = 1024; - size_t fileSize_v1 = elemCount_v1 * sizeof(float); - size_t elemCount_v2 = 1024; - size_t fileSize_v2 = elemCount_v2 * sizeof(float); - float *v1Host = nullptr; - float *v1Device = nullptr; - float *v2Host = nullptr; - float *v2Device = nullptr; - int rc = 0; - bool aclInited = false; - bool deviceSet = false; - int deviceId = 0; - aclrtStream stream = nullptr; - - ACL_CHECK(aclInit(nullptr)); - aclInited = true; - if (const char *envDevice = std::getenv("ACL_DEVICE_ID")) - deviceId = std::atoi(envDevice); - ACL_CHECK(aclrtSetDevice(deviceId)); - deviceSet = true; - 
ACL_CHECK(aclrtCreateStream(&stream)); - ACL_CHECK(aclrtMallocHost((void **)(&v1Host), fileSize_v1)); - ACL_CHECK(aclrtMallocHost((void **)(&v2Host), fileSize_v2)); - ACL_CHECK(aclrtMalloc((void **)&v1Device, fileSize_v1, ACL_MEM_MALLOC_HUGE_FIRST)); - ACL_CHECK(aclrtMalloc((void **)&v2Device, fileSize_v2, ACL_MEM_MALLOC_HUGE_FIRST)); - ReadFile("./v1.bin", fileSize_v1, v1Host, fileSize_v1); - ReadFile("./v2.bin", fileSize_v2, v2Host, fileSize_v2); - ACL_CHECK(aclrtMemcpy(v1Device, fileSize_v1, v1Host, fileSize_v1, - ACL_MEMCPY_HOST_TO_DEVICE)); - ACL_CHECK(aclrtMemcpy(v2Device, fileSize_v2, v2Host, fileSize_v2, - ACL_MEMCPY_HOST_TO_DEVICE)); - LaunchVshrs_kernel(v1Device, v2Device, stream); - ACL_CHECK(aclrtSynchronizeStream(stream)); - ACL_CHECK(aclrtMemcpy(v2Host, fileSize_v2, v2Device, fileSize_v2, - ACL_MEMCPY_DEVICE_TO_HOST)); - WriteFile("./v2.bin", v2Host, fileSize_v2); - -cleanup: - aclrtFree(v1Device); - aclrtFree(v2Device); - aclrtFreeHost(v1Host); - aclrtFreeHost(v2Host); - if (stream != nullptr) - aclrtDestroyStream(stream); - if (deviceSet) - aclrtResetDevice(deviceId); - if (aclInited) - aclFinalize(); - return rc; -} diff --git a/test/vpto/cases/micro-op/vec-scalar/vsubcs-borrow-boundary/compare.py b/test/vpto/cases/micro-op/vec-scalar/vsubcs-borrow-boundary/compare.py deleted file mode 100644 index 87847b721..000000000 --- a/test/vpto/cases/micro-op/vec-scalar/vsubcs-borrow-boundary/compare.py +++ /dev/null @@ -1,64 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) 2026 Huawei Technologies Co., Ltd. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. 
-# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. - -# case: micro-op/vec-scalar/vsubcs-borrow-boundary -# family: vec-scalar -# target_ops: pto.vsubcs -# scenarios: core-u32-unsigned, full-mask, carry-chain, integer-overflow - -import os -import sys -import numpy as np - - -REPEAT_BYTES = 256 -LOGICAL_ELEMS = 64 -SRC_ELEM_BYTES = 4 - - -def _ceil_div(x, y): - return (x + y - 1) // y - - -def _packed_pred_storage_bytes(logical_elems, src_elem_bytes): - repeat_elems = REPEAT_BYTES // src_elem_bytes - repeat_times = _ceil_div(logical_elems, repeat_elems) + 1 - loop_count = repeat_times // 2 - return loop_count * 16 - - -def compare_result(): - golden = np.fromfile("golden_v3.bin", dtype=np.uint32, count=64) - output = np.fromfile("v3.bin", dtype=np.uint32, count=64) - return golden.shape == output.shape and np.array_equal(golden, output) - - -def compare_borrow(): - prefix_bytes = _packed_pred_storage_bytes(LOGICAL_ELEMS, SRC_ELEM_BYTES) - golden = np.fromfile("golden_v4.bin", dtype=np.uint8) - output = np.fromfile("v4.bin", dtype=np.uint8) - if golden.size < prefix_bytes or output.size < prefix_bytes: - return False - return np.array_equal(golden[:prefix_bytes], output[:prefix_bytes]) - - -def main(): - strict = os.getenv("COMPARE_STRICT", "1") != "0" - ok = compare_result() and compare_borrow() - if not ok: - if strict: - print("[ERROR] compare failed") - sys.exit(2) - print("[WARN] compare failed (non-gating)") - return - print("[INFO] compare passed") - - -if __name__ == "__main__": - main() diff --git a/test/vpto/cases/micro-op/vec-scalar/vsubcs-borrow-boundary/golden.py b/test/vpto/cases/micro-op/vec-scalar/vsubcs-borrow-boundary/golden.py deleted file mode 100644 index d20ebafc3..000000000 --- 
a/test/vpto/cases/micro-op/vec-scalar/vsubcs-borrow-boundary/golden.py +++ /dev/null @@ -1,73 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) 2026 Huawei Technologies Co., Ltd. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. - -# case: micro-op/vec-scalar/vsubcs-borrow-boundary -# family: vec-scalar -# target_ops: pto.vsubcs -# scenarios: core-u32-unsigned, full-mask, carry-chain, integer-overflow - -import argparse -from pathlib import Path - -import numpy as np - - -LANES = 64 -LHS_PATTERN = np.array( - [0x00000000, 0x00000001, 0x00000000, 0xFFFFFFFF, 0x80000000, 0x7FFFFFFF, 0xAAAAAAAA, 0x55555555], - dtype=np.uint32, -) -RHS_PATTERN = np.array( - [0x00000000, 0x00000000, 0x00000001, 0xFFFFFFFF, 0x7FFFFFFF, 0x80000000, 0x55555555, 0xAAAAAAAA], - dtype=np.uint32, -) - - -def pack_mask_nibbles(bits): - out = np.zeros(256, dtype=np.uint8) - for idx, bit in enumerate(bits): - if not bit: - continue - byte = idx // 2 - if idx % 2 == 0: - out[byte] |= np.uint8(0x1) - else: - out[byte] |= np.uint8(0x10) - return out - - -def generate(output_dir: Path, seed: int) -> None: - del seed - repeats = LANES // LHS_PATTERN.size - lhs = np.tile(LHS_PATTERN, repeats) - rhs = np.tile(RHS_PATTERN, repeats) - lhs64 = lhs.astype(np.uint64) - rhs64 = rhs.astype(np.uint64) - no_borrow = lhs64 >= rhs64 - result = ((lhs64 - rhs64) & np.uint64(0xFFFFFFFF)).astype(np.uint32) - - output_dir.mkdir(parents=True, exist_ok=True) - lhs.tofile(output_dir / "v1.bin") 
- rhs.tofile(output_dir / "v2.bin") - np.zeros(LANES, dtype=np.uint32).tofile(output_dir / "v3.bin") - np.zeros(256, dtype=np.uint8).tofile(output_dir / "v4.bin") - result.tofile(output_dir / "golden_v3.bin") - pack_mask_nibbles(no_borrow).tofile(output_dir / "golden_v4.bin") - - -def main() -> None: - parser = argparse.ArgumentParser() - parser.add_argument("--output-dir", type=Path, default=Path(".")) - parser.add_argument("--seed", type=int, default=19) - args = parser.parse_args() - generate(args.output_dir, args.seed) - - -if __name__ == "__main__": - main() diff --git a/test/vpto/cases/micro-op/vec-scalar/vsubcs-borrow-boundary/kernel.pto b/test/vpto/cases/micro-op/vec-scalar/vsubcs-borrow-boundary/kernel.pto deleted file mode 100644 index 5ef0438fe..000000000 --- a/test/vpto/cases/micro-op/vec-scalar/vsubcs-borrow-boundary/kernel.pto +++ /dev/null @@ -1,57 +0,0 @@ -// ----------------------------------------------------------------------------- -// case: micro-op/vec-scalar/vsubcs-borrow-boundary -// family: vec-scalar -// target_ops: pto.vsubcs -// scenarios: core-u32-unsigned, full-mask, carry-chain, integer-overflow -// ----------------------------------------------------------------------------- -module attributes {pto.target_arch = "a5", pto.kernel_kind = #pto.kernel_kind} { - func.func @vsubcs_borrow_boundary_kernel(%arg0: !pto.ptr, - %arg1: !pto.ptr, - %arg2: !pto.ptr, - %arg3: !pto.ptr) attributes {pto.kernel} { - %c0 = arith.constant 0 : index - %c0_i64 = arith.constant 0 : i64 - %c1_i64 = arith.constant 1 : i64 - %c128_i64 = arith.constant 128 : i64 - %c256_i64 = arith.constant 256 : i64 - %c4096_i64 = arith.constant 4096 : i64 - %c8192_i64 = arith.constant 8192 : i64 - %c12288_i64 = arith.constant 12288 : i64 - %false = arith.constant false - - %ub_lhs = pto.castptr %c0_i64 : i64 -> !pto.ptr - %ub_rhs = pto.castptr %c4096_i64 : i64 -> !pto.ptr - %ub_out = pto.castptr %c8192_i64 : i64 -> !pto.ptr - %ub_borrow = pto.castptr %c12288_i64 : i64 -> 
!pto.ptr - pto.mte_gm_ub %arg0, %ub_lhs, %c0_i64, %c256_i64 - nburst(%c1_i64, %c256_i64, %c256_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 - pto.mte_gm_ub %arg1, %ub_rhs, %c0_i64, %c256_i64 - nburst(%c1_i64, %c256_i64, %c256_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 - - pto.set_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] - pto.wait_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] - - pto.vecscope { - %mask = pto.pset_b32 "PAT_ALL" : !pto.mask - %borrow_in = pto.pset_b32 "PAT_ALL" : !pto.mask - %lhs = pto.vlds %ub_lhs[%c0] : !pto.ptr -> !pto.vreg<64xui32> - %rhs = pto.vlds %ub_rhs[%c0] : !pto.ptr -> !pto.vreg<64xui32> - %diff, %borrow = pto.vsubcs %lhs, %rhs, %borrow_in, %mask : !pto.vreg<64xui32>, !pto.vreg<64xui32>, !pto.mask, !pto.mask -> !pto.vreg<64xui32>, !pto.mask - pto.vsts %diff, %ub_out[%c0], %mask : !pto.vreg<64xui32>, !pto.ptr, !pto.mask - pto.psti %borrow, %ub_borrow[%c0], "NORM" : !pto.mask, !pto.ptr, index - } - - pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.mte_ub_gm %ub_out, %arg2, %c256_i64 - nburst(%c1_i64, %c256_i64, %c256_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64 - pto.mte_ub_gm %ub_borrow, %arg3, %c128_i64 - nburst(%c1_i64, %c256_i64, %c256_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64 - pto.barrier #pto.pipe - return - } -} diff --git a/test/vpto/cases/micro-op/vec-scalar/vsubcs-borrow-boundary/launch.cpp b/test/vpto/cases/micro-op/vec-scalar/vsubcs-borrow-boundary/launch.cpp deleted file mode 100644 index a1cb56e2e..000000000 --- a/test/vpto/cases/micro-op/vec-scalar/vsubcs-borrow-boundary/launch.cpp +++ /dev/null @@ -1,53 +0,0 @@ -// Copyright (c) 2026 Huawei Technologies Co., Ltd. -// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). -// Please refer to the License for details. 
You may not use this file except in compliance with the License. -// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -// See LICENSE in the root of the software repository for the full text of the License. - -// ----------------------------------------------------------------------------- -// case: micro-op/vec-scalar/vsubcs-borrow-boundary -// family: vec-scalar -// target_ops: pto.vsubcs -// scenarios: core-u32-unsigned, full-mask, carry-chain, integer-overflow -// ----------------------------------------------------------------------------- -#ifndef __VEC_SCOPE__ -#define __VEC_SCOPE__ -#endif - -#if defined(__CCE_AICORE__) && defined(__NPU_ARCH__) && (__NPU_ARCH__ == 2201) -typedef struct { unsigned char v; } hifloat8_t; -typedef struct { unsigned char v; } float8_e4m3_t; -typedef struct { unsigned char v; } float8_e5m2_t; -typedef struct { unsigned char v; } float8_e8m0_t; -typedef struct { unsigned char v; } float4_e1m2x2_t; -typedef struct { unsigned char v; } float4_e2m1x2_t; -#endif -#include -#if defined(__CCE_AICORE__) && defined(PTOAS_ENABLE_CCE_PRINT) -#include -#endif - -#if !defined(__CCE_AICORE__) && !defined(TMRGSORT_HPP) -struct MrgSortExecutedNumList { - uint16_t mrgSortList0; - uint16_t mrgSortList1; - uint16_t mrgSortList2; - uint16_t mrgSortList3; -}; -#endif -#ifndef __CPU_SIM -#include "acl/acl.h" -#endif - -extern "C" __global__ [aicore] void vsubcs_borrow_boundary_kernel( - __gm__ uint32_t *v1, __gm__ uint32_t *v2, __gm__ uint32_t *v3, - __gm__ uint8_t *v4); - -void LaunchVsubcsBorrowBoundaryKernel(uint32_t *v1, uint32_t *v2, uint32_t *v3, - uint8_t *v4, void *stream) { - vsubcs_borrow_boundary_kernel<<<1, nullptr, stream>>>( - (__gm__ uint32_t *)v1, (__gm__ uint32_t *)v2, (__gm__ uint32_t *)v3, - (__gm__ uint8_t *)v4); -} diff --git 
a/test/vpto/cases/micro-op/vec-scalar/vsubcs-borrow-boundary/main.cpp b/test/vpto/cases/micro-op/vec-scalar/vsubcs-borrow-boundary/main.cpp deleted file mode 100644 index 169bc4512..000000000 --- a/test/vpto/cases/micro-op/vec-scalar/vsubcs-borrow-boundary/main.cpp +++ /dev/null @@ -1,116 +0,0 @@ -// Copyright (c) 2026 Huawei Technologies Co., Ltd. -// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). -// Please refer to the License for details. You may not use this file except in compliance with the License. -// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -// See LICENSE in the root of the software repository for the full text of the License. - -// ----------------------------------------------------------------------------- -// case: micro-op/vec-scalar/vsubcs-borrow-boundary -// family: vec-scalar -// target_ops: pto.vsubcs -// scenarios: core-u32-unsigned, full-mask, carry-chain, integer-overflow -// ----------------------------------------------------------------------------- -/** -Copyright (c) 2025 Huawei Technologies Co., Ltd. 
-*/ - -#include "test_common.h" -#include "acl/acl.h" -#include -#include - -using namespace PtoTestCommon; - -#define ACL_CHECK(expr) \ - do { \ - const aclError _ret = (expr); \ - if (_ret != ACL_SUCCESS) { \ - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", #expr, \ - (int)_ret, __FILE__, __LINE__); \ - rc = 1; \ - goto cleanup; \ - } \ - } while (0) - -void LaunchVsubcsBorrowBoundaryKernel(uint32_t *v1, uint32_t *v2, uint32_t *v3, - uint8_t *v4, void *stream); - -int main() { - size_t elemCount_v1 = 64; - size_t fileSize_v1 = elemCount_v1 * sizeof(uint32_t); - size_t elemCount_v2 = 64; - size_t fileSize_v2 = elemCount_v2 * sizeof(uint32_t); - size_t elemCount_v3 = 64; - size_t fileSize_v3 = elemCount_v3 * sizeof(uint32_t); - size_t elemCount_v4 = 256; - size_t fileSize_v4 = elemCount_v4 * sizeof(uint8_t); - uint32_t *v1Host = nullptr; - uint32_t *v1Device = nullptr; - uint32_t *v2Host = nullptr; - uint32_t *v2Device = nullptr; - uint32_t *v3Host = nullptr; - uint32_t *v3Device = nullptr; - uint8_t *v4Host = nullptr; - uint8_t *v4Device = nullptr; - int rc = 0; - bool aclInited = false; - bool deviceSet = false; - int deviceId = 0; - aclrtStream stream = nullptr; - - ACL_CHECK(aclInit(nullptr)); - aclInited = true; - if (const char *envDevice = std::getenv("ACL_DEVICE_ID")) - deviceId = std::atoi(envDevice); - ACL_CHECK(aclrtSetDevice(deviceId)); - deviceSet = true; - ACL_CHECK(aclrtCreateStream(&stream)); - ACL_CHECK(aclrtMallocHost((void **)(&v1Host), fileSize_v1)); - ACL_CHECK(aclrtMallocHost((void **)(&v2Host), fileSize_v2)); - ACL_CHECK(aclrtMallocHost((void **)(&v3Host), fileSize_v3)); - ACL_CHECK(aclrtMallocHost((void **)(&v4Host), fileSize_v4)); - ACL_CHECK(aclrtMalloc((void **)&v1Device, fileSize_v1, ACL_MEM_MALLOC_HUGE_FIRST)); - ACL_CHECK(aclrtMalloc((void **)&v2Device, fileSize_v2, ACL_MEM_MALLOC_HUGE_FIRST)); - ACL_CHECK(aclrtMalloc((void **)&v3Device, fileSize_v3, ACL_MEM_MALLOC_HUGE_FIRST)); - ACL_CHECK(aclrtMalloc((void **)&v4Device, 
fileSize_v4, ACL_MEM_MALLOC_HUGE_FIRST)); - ReadFile("./v1.bin", fileSize_v1, v1Host, fileSize_v1); - ReadFile("./v2.bin", fileSize_v2, v2Host, fileSize_v2); - ReadFile("./v3.bin", fileSize_v3, v3Host, fileSize_v3); - ReadFile("./v4.bin", fileSize_v4, v4Host, fileSize_v4); - ACL_CHECK(aclrtMemcpy(v1Device, fileSize_v1, v1Host, fileSize_v1, - ACL_MEMCPY_HOST_TO_DEVICE)); - ACL_CHECK(aclrtMemcpy(v2Device, fileSize_v2, v2Host, fileSize_v2, - ACL_MEMCPY_HOST_TO_DEVICE)); - ACL_CHECK(aclrtMemcpy(v3Device, fileSize_v3, v3Host, fileSize_v3, - ACL_MEMCPY_HOST_TO_DEVICE)); - ACL_CHECK(aclrtMemcpy(v4Device, fileSize_v4, v4Host, fileSize_v4, - ACL_MEMCPY_HOST_TO_DEVICE)); - LaunchVsubcsBorrowBoundaryKernel(v1Device, v2Device, v3Device, v4Device, - stream); - ACL_CHECK(aclrtSynchronizeStream(stream)); - ACL_CHECK(aclrtMemcpy(v3Host, fileSize_v3, v3Device, fileSize_v3, - ACL_MEMCPY_DEVICE_TO_HOST)); - ACL_CHECK(aclrtMemcpy(v4Host, fileSize_v4, v4Device, fileSize_v4, - ACL_MEMCPY_DEVICE_TO_HOST)); - WriteFile("./v3.bin", v3Host, fileSize_v3); - WriteFile("./v4.bin", v4Host, fileSize_v4); - -cleanup: - aclrtFree(v1Device); - aclrtFree(v2Device); - aclrtFree(v3Device); - aclrtFree(v4Device); - aclrtFreeHost(v1Host); - aclrtFreeHost(v2Host); - aclrtFreeHost(v3Host); - aclrtFreeHost(v4Host); - if (stream != nullptr) - aclrtDestroyStream(stream); - if (deviceSet) - aclrtResetDevice(deviceId); - if (aclInited) - aclFinalize(); - return rc; -} diff --git a/test/vpto/cases/micro-op/vec-scalar/vsubcs/compare.py b/test/vpto/cases/micro-op/vec-scalar/vsubcs/compare.py deleted file mode 100644 index 047f6c245..000000000 --- a/test/vpto/cases/micro-op/vec-scalar/vsubcs/compare.py +++ /dev/null @@ -1,64 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) 2026 Huawei Technologies Co., Ltd. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). 
-# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. - -# case: micro-op/vec-scalar/vsubcs -# family: vec-scalar -# target_ops: pto.vsubcs -# scenarios: core-u32-unsigned, full-mask, carry-chain - -import os -import sys -import numpy as np - - -REPEAT_BYTES = 256 -LOGICAL_ELEMS = 64 -SRC_ELEM_BYTES = 4 - - -def _ceil_div(x, y): - return (x + y - 1) // y - - -def _packed_pred_storage_bytes(logical_elems, src_elem_bytes): - repeat_elems = REPEAT_BYTES // src_elem_bytes - repeat_times = _ceil_div(logical_elems, repeat_elems) + 1 - loop_count = repeat_times // 2 - return loop_count * 16 - - -def compare_result(): - golden = np.fromfile("golden_v3.bin", dtype=np.uint32, count=64) - output = np.fromfile("v3.bin", dtype=np.uint32, count=64) - return golden.shape == output.shape and np.array_equal(golden, output) - - -def compare_borrow(): - prefix_bytes = _packed_pred_storage_bytes(LOGICAL_ELEMS, SRC_ELEM_BYTES) - golden = np.fromfile("golden_v4.bin", dtype=np.uint8) - output = np.fromfile("v4.bin", dtype=np.uint8) - if golden.size < prefix_bytes or output.size < prefix_bytes: - return False - return np.array_equal(golden[:prefix_bytes], output[:prefix_bytes]) - - -def main(): - strict = os.getenv("COMPARE_STRICT", "1") != "0" - ok = compare_result() and compare_borrow() - if not ok: - if strict: - print("[ERROR] compare failed") - sys.exit(2) - print("[WARN] compare failed (non-gating)") - return - print("[INFO] compare passed") - - -if __name__ == "__main__": - main() diff --git a/test/vpto/cases/micro-op/vec-scalar/vsubcs/golden.py b/test/vpto/cases/micro-op/vec-scalar/vsubcs/golden.py deleted file mode 100644 index 
d9c1f2e8b..000000000 --- a/test/vpto/cases/micro-op/vec-scalar/vsubcs/golden.py +++ /dev/null @@ -1,65 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) 2026 Huawei Technologies Co., Ltd. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. - -# case: micro-op/vec-scalar/vsubcs -# family: vec-scalar -# target_ops: pto.vsubcs -# scenarios: core-u32-unsigned, full-mask, carry-chain - -import argparse -from pathlib import Path - -import numpy as np - - -LANES = 64 -SEED = 19 - - -def pack_mask_nibbles(bits): - out = np.zeros(256, dtype=np.uint8) - for idx, bit in enumerate(bits): - if not bit: - continue - byte = idx // 2 - if idx % 2 == 0: - out[byte] |= np.uint8(0x1) - else: - out[byte] |= np.uint8(0x10) - return out - - -def generate(output_dir: Path, seed: int) -> None: - rng = np.random.default_rng(seed) - lhs = rng.integers(0, 0xFFFFFFFF, size=LANES, dtype=np.uint32) - rhs = rng.integers(0, 0xFFFFFFFF, size=LANES, dtype=np.uint32) - lhs64 = lhs.astype(np.uint64) - rhs64 = rhs.astype(np.uint64) - no_borrow = lhs64 >= rhs64 - result = ((lhs64 - rhs64) & np.uint64(0xFFFFFFFF)).astype(np.uint32) - - output_dir.mkdir(parents=True, exist_ok=True) - lhs.tofile(output_dir / "v1.bin") - rhs.tofile(output_dir / "v2.bin") - np.zeros(LANES, dtype=np.uint32).tofile(output_dir / "v3.bin") - np.zeros(256, dtype=np.uint8).tofile(output_dir / "v4.bin") - result.tofile(output_dir / "golden_v3.bin") - pack_mask_nibbles(no_borrow).tofile(output_dir / 
"golden_v4.bin") - - -def main() -> None: - parser = argparse.ArgumentParser() - parser.add_argument("--output-dir", type=Path, default=Path(".")) - parser.add_argument("--seed", type=int, default=SEED) - args = parser.parse_args() - generate(args.output_dir, args.seed) - - -if __name__ == "__main__": - main() diff --git a/test/vpto/cases/micro-op/vec-scalar/vsubcs/kernel.pto b/test/vpto/cases/micro-op/vec-scalar/vsubcs/kernel.pto deleted file mode 100644 index a7d9e2962..000000000 --- a/test/vpto/cases/micro-op/vec-scalar/vsubcs/kernel.pto +++ /dev/null @@ -1,55 +0,0 @@ -// ----------------------------------------------------------------------------- -// case: micro-op/vec-scalar/vsubcs -// family: vec-scalar -// target_ops: pto.vsubcs -// scenarios: core-u32-unsigned, full-mask, carry-chain -// ----------------------------------------------------------------------------- -module attributes {pto.target_arch = "a5", pto.kernel_kind = #pto.kernel_kind} { - func.func @vsubcs_kernel(%arg0: !pto.ptr, %arg1: !pto.ptr, - %arg2: !pto.ptr, %arg3: !pto.ptr) attributes {pto.kernel} { - %c0 = arith.constant 0 : index - %c0_i64 = arith.constant 0 : i64 - %c1_i64 = arith.constant 1 : i64 - %c128_i64 = arith.constant 128 : i64 - %c256_i64 = arith.constant 256 : i64 - %c4096_i64 = arith.constant 4096 : i64 - %c8192_i64 = arith.constant 8192 : i64 - %c12288_i64 = arith.constant 12288 : i64 - %false = arith.constant false - - %ub_lhs = pto.castptr %c0_i64 : i64 -> !pto.ptr - %ub_rhs = pto.castptr %c4096_i64 : i64 -> !pto.ptr - %ub_out = pto.castptr %c8192_i64 : i64 -> !pto.ptr - %ub_borrow = pto.castptr %c12288_i64 : i64 -> !pto.ptr - pto.mte_gm_ub %arg0, %ub_lhs, %c0_i64, %c256_i64 - nburst(%c1_i64, %c256_i64, %c256_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 - pto.mte_gm_ub %arg1, %ub_rhs, %c0_i64, %c256_i64 - nburst(%c1_i64, %c256_i64, %c256_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 - - pto.set_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] - 
pto.wait_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] - - pto.vecscope { - %mask = pto.pset_b32 "PAT_ALL" : !pto.mask - %borrow_in = pto.pset_b32 "PAT_ALL" : !pto.mask - %lhs = pto.vlds %ub_lhs[%c0] : !pto.ptr -> !pto.vreg<64xui32> - %rhs = pto.vlds %ub_rhs[%c0] : !pto.ptr -> !pto.vreg<64xui32> - %diff, %borrow = pto.vsubcs %lhs, %rhs, %borrow_in, %mask : !pto.vreg<64xui32>, !pto.vreg<64xui32>, !pto.mask, !pto.mask -> !pto.vreg<64xui32>, !pto.mask - pto.vsts %diff, %ub_out[%c0], %mask : !pto.vreg<64xui32>, !pto.ptr, !pto.mask - pto.psti %borrow, %ub_borrow[%c0], "NORM" : !pto.mask, !pto.ptr, index - } - - pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.mte_ub_gm %ub_out, %arg2, %c256_i64 - nburst(%c1_i64, %c256_i64, %c256_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64 - pto.mte_ub_gm %ub_borrow, %arg3, %c128_i64 - nburst(%c1_i64, %c256_i64, %c256_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64 - pto.barrier #pto.pipe - return - } -} diff --git a/test/vpto/cases/micro-op/vec-scalar/vsubcs/launch.cpp b/test/vpto/cases/micro-op/vec-scalar/vsubcs/launch.cpp deleted file mode 100644 index 534b84ab1..000000000 --- a/test/vpto/cases/micro-op/vec-scalar/vsubcs/launch.cpp +++ /dev/null @@ -1,54 +0,0 @@ -// Copyright (c) 2026 Huawei Technologies Co., Ltd. -// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). -// Please refer to the License for details. You may not use this file except in compliance with the License. -// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -// See LICENSE in the root of the software repository for the full text of the License. 
- -// ----------------------------------------------------------------------------- -// case: micro-op/vec-scalar/vsubcs -// family: vec-scalar -// target_ops: pto.vsubcs -// scenarios: core-u32-unsigned, full-mask, carry-chain -// ----------------------------------------------------------------------------- -#ifndef __VEC_SCOPE__ -#define __VEC_SCOPE__ -#endif - -#if defined(__CCE_AICORE__) && defined(__NPU_ARCH__) && (__NPU_ARCH__ == 2201) -typedef struct { unsigned char v; } hifloat8_t; -typedef struct { unsigned char v; } float8_e4m3_t; -typedef struct { unsigned char v; } float8_e5m2_t; -typedef struct { unsigned char v; } float8_e8m0_t; -typedef struct { unsigned char v; } float4_e1m2x2_t; -typedef struct { unsigned char v; } float4_e2m1x2_t; -#endif -#include -#if defined(__CCE_AICORE__) && defined(PTOAS_ENABLE_CCE_PRINT) -#include -#endif - -#if !defined(__CCE_AICORE__) && !defined(TMRGSORT_HPP) -struct MrgSortExecutedNumList { - uint16_t mrgSortList0; - uint16_t mrgSortList1; - uint16_t mrgSortList2; - uint16_t mrgSortList3; -}; -#endif -#ifndef __CPU_SIM -#include "acl/acl.h" -#endif - -extern "C" __global__ [aicore] void -vsubcs_kernel(__gm__ uint32_t *v1, __gm__ uint32_t *v2, __gm__ uint32_t *v3, - __gm__ uint8_t *v4); - -void LaunchVsubcs_kernel(uint32_t *v1, uint32_t *v2, uint32_t *v3, uint8_t *v4, - void *stream) { - vsubcs_kernel<<<1, nullptr, stream>>>((__gm__ uint32_t *)v1, - (__gm__ uint32_t *)v2, - (__gm__ uint32_t *)v3, - (__gm__ uint8_t *)v4); -} diff --git a/test/vpto/cases/micro-op/vec-scalar/vsubcs/main.cpp b/test/vpto/cases/micro-op/vec-scalar/vsubcs/main.cpp deleted file mode 100644 index 5bcad0fcc..000000000 --- a/test/vpto/cases/micro-op/vec-scalar/vsubcs/main.cpp +++ /dev/null @@ -1,115 +0,0 @@ -// Copyright (c) 2026 Huawei Technologies Co., Ltd. -// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). 
-// Please refer to the License for details. You may not use this file except in compliance with the License. -// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -// See LICENSE in the root of the software repository for the full text of the License. - -// ----------------------------------------------------------------------------- -// case: micro-op/vec-scalar/vsubcs -// family: vec-scalar -// target_ops: pto.vsubcs -// scenarios: core-u32-unsigned, full-mask, carry-chain -// ----------------------------------------------------------------------------- -/** -Copyright (c) 2025 Huawei Technologies Co., Ltd. -*/ - -#include "test_common.h" -#include "acl/acl.h" -#include -#include - -using namespace PtoTestCommon; - -#define ACL_CHECK(expr) \ - do { \ - const aclError _ret = (expr); \ - if (_ret != ACL_SUCCESS) { \ - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", #expr, \ - (int)_ret, __FILE__, __LINE__); \ - rc = 1; \ - goto cleanup; \ - } \ - } while (0) - -void LaunchVsubcs_kernel(uint32_t *v1, uint32_t *v2, uint32_t *v3, uint8_t *v4, - void *stream); - -int main() { - size_t elemCount_v1 = 64; - size_t fileSize_v1 = elemCount_v1 * sizeof(uint32_t); - size_t elemCount_v2 = 64; - size_t fileSize_v2 = elemCount_v2 * sizeof(uint32_t); - size_t elemCount_v3 = 64; - size_t fileSize_v3 = elemCount_v3 * sizeof(uint32_t); - size_t elemCount_v4 = 256; - size_t fileSize_v4 = elemCount_v4 * sizeof(uint8_t); - uint32_t *v1Host = nullptr; - uint32_t *v1Device = nullptr; - uint32_t *v2Host = nullptr; - uint32_t *v2Device = nullptr; - uint32_t *v3Host = nullptr; - uint32_t *v3Device = nullptr; - uint8_t *v4Host = nullptr; - uint8_t *v4Device = nullptr; - int rc = 0; - bool aclInited = false; - bool deviceSet = false; - int deviceId = 0; - aclrtStream stream = nullptr; - - ACL_CHECK(aclInit(nullptr)); - 
aclInited = true; - if (const char *envDevice = std::getenv("ACL_DEVICE_ID")) - deviceId = std::atoi(envDevice); - ACL_CHECK(aclrtSetDevice(deviceId)); - deviceSet = true; - ACL_CHECK(aclrtCreateStream(&stream)); - ACL_CHECK(aclrtMallocHost((void **)(&v1Host), fileSize_v1)); - ACL_CHECK(aclrtMallocHost((void **)(&v2Host), fileSize_v2)); - ACL_CHECK(aclrtMallocHost((void **)(&v3Host), fileSize_v3)); - ACL_CHECK(aclrtMallocHost((void **)(&v4Host), fileSize_v4)); - ACL_CHECK(aclrtMalloc((void **)&v1Device, fileSize_v1, ACL_MEM_MALLOC_HUGE_FIRST)); - ACL_CHECK(aclrtMalloc((void **)&v2Device, fileSize_v2, ACL_MEM_MALLOC_HUGE_FIRST)); - ACL_CHECK(aclrtMalloc((void **)&v3Device, fileSize_v3, ACL_MEM_MALLOC_HUGE_FIRST)); - ACL_CHECK(aclrtMalloc((void **)&v4Device, fileSize_v4, ACL_MEM_MALLOC_HUGE_FIRST)); - ReadFile("./v1.bin", fileSize_v1, v1Host, fileSize_v1); - ReadFile("./v2.bin", fileSize_v2, v2Host, fileSize_v2); - ReadFile("./v3.bin", fileSize_v3, v3Host, fileSize_v3); - ReadFile("./v4.bin", fileSize_v4, v4Host, fileSize_v4); - ACL_CHECK(aclrtMemcpy(v1Device, fileSize_v1, v1Host, fileSize_v1, - ACL_MEMCPY_HOST_TO_DEVICE)); - ACL_CHECK(aclrtMemcpy(v2Device, fileSize_v2, v2Host, fileSize_v2, - ACL_MEMCPY_HOST_TO_DEVICE)); - ACL_CHECK(aclrtMemcpy(v3Device, fileSize_v3, v3Host, fileSize_v3, - ACL_MEMCPY_HOST_TO_DEVICE)); - ACL_CHECK(aclrtMemcpy(v4Device, fileSize_v4, v4Host, fileSize_v4, - ACL_MEMCPY_HOST_TO_DEVICE)); - LaunchVsubcs_kernel(v1Device, v2Device, v3Device, v4Device, stream); - ACL_CHECK(aclrtSynchronizeStream(stream)); - ACL_CHECK(aclrtMemcpy(v3Host, fileSize_v3, v3Device, fileSize_v3, - ACL_MEMCPY_DEVICE_TO_HOST)); - ACL_CHECK(aclrtMemcpy(v4Host, fileSize_v4, v4Device, fileSize_v4, - ACL_MEMCPY_DEVICE_TO_HOST)); - WriteFile("./v3.bin", v3Host, fileSize_v3); - WriteFile("./v4.bin", v4Host, fileSize_v4); - -cleanup: - aclrtFree(v1Device); - aclrtFree(v2Device); - aclrtFree(v3Device); - aclrtFree(v4Device); - aclrtFreeHost(v1Host); - aclrtFreeHost(v2Host); - 
aclrtFreeHost(v3Host); - aclrtFreeHost(v4Host); - if (stream != nullptr) - aclrtDestroyStream(stream); - if (deviceSet) - aclrtResetDevice(deviceId); - if (aclInited) - aclFinalize(); - return rc; -} diff --git a/test/vpto/cases/micro-op/vector-load-store/dma-copy-rearrange/kernel.pto b/test/vpto/cases/micro-op/vector-load-store/dma-copy-rearrange/kernel.pto index ae7db0d7a..92e8a8087 100644 --- a/test/vpto/cases/micro-op/vector-load-store/dma-copy-rearrange/kernel.pto +++ b/test/vpto/cases/micro-op/vector-load-store/dma-copy-rearrange/kernel.pto @@ -62,6 +62,175 @@ module attributes {pto.target_arch = "a5", pto.kernel_kind = #pto.kernel_kind, !pto.ptr, i64, i64, i64, i64, i64, i64 pto.barrier #pto.pipe + %__case_merge_guard = arith.constant false + + // inactive cross-case merge from micro-op/vector-load-store/vlds-ds-b16 + scf.if %__case_merge_guard { + + %c0_cmg21_1 = arith.constant 0 : index + %c1_cmg21_1 = arith.constant 1 : index + %c128_cmg21_1 = arith.constant 128 : index + %c1024_cmg21_1 = arith.constant 1024 : index + %c0_i64_cmg21_1 = arith.constant 0 : i64 + %c1_i64_cmg21_1 = arith.constant 1 : i64 + %c32_i64_cmg21_1 = arith.constant 32 : i64 + %c128_i64_cmg21_1 = arith.constant 128 : i64 + %c4096_i64_cmg21_1 = arith.constant 4096 : i64 + %c1024_i32_cmg21_1 = arith.constant 1024 : i32 + + %ub_in_cmg21_1 = pto.castptr %c0_i64_cmg21_1 : i64 -> !pto.ptr + %ub_out_cmg21_1 = pto.castptr %c4096_i64_cmg21_1 : i64 -> !pto.ptr + + %false_cmg21_1 = arith.constant false + pto.mte_gm_ub %arg0, %ub_in_cmg21_1, %c0_i64_cmg21_1, %c128_i64_cmg21_1 + nburst(%c32_i64_cmg21_1, %c128_i64_cmg21_1, %c128_i64_cmg21_1) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 + + pto.set_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] + pto.wait_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] + + pto.vecscope { + %__cmg21_1:1 = scf.for %offset_cmg21_1 = %c0_cmg21_1 to %c1024_cmg21_1 step %c128_cmg21_1 iter_args(%remaining_cmg21_1 = %c1024_i32_cmg21_1) -> (i32) { + %mask_cmg21_1, 
%next_remaining_cmg21_1 = pto.plt_b16 %remaining_cmg21_1 : i32 -> !pto.mask, i32 + %out_cmg21_1 = pto.vlds %ub_in_cmg21_1[%offset_cmg21_1] {dist = "DS_B16"} : !pto.ptr -> !pto.vreg<128xi16> + pto.vsts %out_cmg21_1, %ub_out_cmg21_1[%offset_cmg21_1], %mask_cmg21_1 : !pto.vreg<128xi16>, !pto.ptr, !pto.mask + scf.yield %next_remaining_cmg21_1 : i32 + } + } + + pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] + pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] + pto.mte_ub_gm %ub_out_cmg21_1, %arg1, %c128_i64_cmg21_1 + nburst(%c32_i64_cmg21_1, %c128_i64_cmg21_1, %c128_i64_cmg21_1) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64 + pto.barrier #pto.pipe + } + + // inactive cross-case merge from micro-op/vector-load-store/vlds-us-b16 + scf.if %__case_merge_guard { + + %c0_cmg21_2 = arith.constant 0 : index + %c1_cmg21_2 = arith.constant 1 : index + %c128_cmg21_2 = arith.constant 128 : index + %c1024_cmg21_2 = arith.constant 1024 : index + %c0_i64_cmg21_2 = arith.constant 0 : i64 + %c1_i64_cmg21_2 = arith.constant 1 : i64 + %c32_i64_cmg21_2 = arith.constant 32 : i64 + %c128_i64_cmg21_2 = arith.constant 128 : i64 + %c4096_i64_cmg21_2 = arith.constant 4096 : i64 + %c1024_i32_cmg21_2 = arith.constant 1024 : i32 + + %ub_in_cmg21_2 = pto.castptr %c0_i64_cmg21_2 : i64 -> !pto.ptr + %ub_out_cmg21_2 = pto.castptr %c4096_i64_cmg21_2 : i64 -> !pto.ptr + + %false_cmg21_2 = arith.constant false + pto.mte_gm_ub %arg0, %ub_in_cmg21_2, %c0_i64_cmg21_2, %c128_i64_cmg21_2 + nburst(%c32_i64_cmg21_2, %c128_i64_cmg21_2, %c128_i64_cmg21_2) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 + + pto.set_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] + pto.wait_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] + + pto.vecscope { + %__cmg21_2:1 = scf.for %offset_cmg21_2 = %c0_cmg21_2 to %c1024_cmg21_2 step %c128_cmg21_2 iter_args(%remaining_cmg21_2 = %c1024_i32_cmg21_2) -> (i32) { + %mask_cmg21_2, %next_remaining_cmg21_2 = pto.plt_b16 %remaining_cmg21_2 : i32 -> !pto.mask, i32 + %out_cmg21_2 = pto.vlds 
%ub_in_cmg21_2[%offset_cmg21_2] {dist = "US_B16"} : !pto.ptr -> !pto.vreg<128xi16> + pto.vsts %out_cmg21_2, %ub_out_cmg21_2[%offset_cmg21_2], %mask_cmg21_2 : !pto.vreg<128xi16>, !pto.ptr, !pto.mask + scf.yield %next_remaining_cmg21_2 : i32 + } + } + + pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] + pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] + pto.mte_ub_gm %ub_out_cmg21_2, %arg1, %c128_i64_cmg21_2 + nburst(%c32_i64_cmg21_2, %c128_i64_cmg21_2, %c128_i64_cmg21_2) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64 + pto.barrier #pto.pipe + } + + // inactive cross-case merge from micro-op/vector-load-store/vsts-1pt-b16 + scf.if %__case_merge_guard { + + %c0_cmg21_3 = arith.constant 0 : index + %c1_cmg21_3 = arith.constant 1 : index + %c128_cmg21_3 = arith.constant 128 : index + %c1024_cmg21_3 = arith.constant 1024 : index + %c0_i64_cmg21_3 = arith.constant 0 : i64 + %c1_i64_cmg21_3 = arith.constant 1 : i64 + %c32_i64_cmg21_3 = arith.constant 32 : i64 + %c128_i64_cmg21_3 = arith.constant 128 : i64 + %c4096_i64_cmg21_3 = arith.constant 4096 : i64 + %c1024_i32_cmg21_3 = arith.constant 1024 : i32 + + %ub_in_cmg21_3 = pto.castptr %c0_i64_cmg21_3 : i64 -> !pto.ptr + %ub_out_cmg21_3 = pto.castptr %c4096_i64_cmg21_3 : i64 -> !pto.ptr + + %false_cmg21_3 = arith.constant false + pto.mte_gm_ub %arg0, %ub_in_cmg21_3, %c0_i64_cmg21_3, %c128_i64_cmg21_3 + nburst(%c32_i64_cmg21_3, %c128_i64_cmg21_3, %c128_i64_cmg21_3) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 + + pto.set_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] + pto.wait_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] + + pto.vecscope { + %__cmg21_3:1 = scf.for %offset_cmg21_3 = %c0_cmg21_3 to %c1024_cmg21_3 step %c128_cmg21_3 iter_args(%remaining_cmg21_3 = %c1024_i32_cmg21_3) -> (i32) { + %mask_cmg21_3, %next_remaining_cmg21_3 = pto.plt_b16 %remaining_cmg21_3 : i32 -> !pto.mask, i32 + %vec_cmg21_3 = pto.vlds %ub_in_cmg21_3[%offset_cmg21_3] : !pto.ptr -> !pto.vreg<128xi16> + pto.vsts %vec_cmg21_3, %ub_out_cmg21_3[%offset_cmg21_3], 
%mask_cmg21_3 {dist = "1PT_B16"} : !pto.vreg<128xi16>, !pto.ptr, !pto.mask + scf.yield %next_remaining_cmg21_3 : i32 + } + } + + pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] + pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] + pto.mte_ub_gm %ub_out_cmg21_3, %arg1, %c128_i64_cmg21_3 + nburst(%c32_i64_cmg21_3, %c128_i64_cmg21_3, %c128_i64_cmg21_3) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64 + pto.barrier #pto.pipe + } + + // inactive cross-case merge from micro-op/vector-load-store/vsts-pk-b16 + scf.if %__case_merge_guard { + + %c0_cmg21_4 = arith.constant 0 : index + %c1_cmg21_4 = arith.constant 1 : index + %c128_cmg21_4 = arith.constant 128 : index + %c1024_cmg21_4 = arith.constant 1024 : index + %c0_i64_cmg21_4 = arith.constant 0 : i64 + %c1_i64_cmg21_4 = arith.constant 1 : i64 + %c32_i64_cmg21_4 = arith.constant 32 : i64 + %c128_i64_cmg21_4 = arith.constant 128 : i64 + %c4096_i64_cmg21_4 = arith.constant 4096 : i64 + %c1024_i32_cmg21_4 = arith.constant 1024 : i32 + + %ub_in_cmg21_4 = pto.castptr %c0_i64_cmg21_4 : i64 -> !pto.ptr + %ub_out_cmg21_4 = pto.castptr %c4096_i64_cmg21_4 : i64 -> !pto.ptr + + %false_cmg21_4 = arith.constant false + pto.mte_gm_ub %arg0, %ub_in_cmg21_4, %c0_i64_cmg21_4, %c128_i64_cmg21_4 + nburst(%c32_i64_cmg21_4, %c128_i64_cmg21_4, %c128_i64_cmg21_4) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 + + pto.set_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] + pto.wait_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] + + pto.vecscope { + %__cmg21_4:1 = scf.for %offset_cmg21_4 = %c0_cmg21_4 to %c1024_cmg21_4 step %c128_cmg21_4 iter_args(%remaining_cmg21_4 = %c1024_i32_cmg21_4) -> (i32) { + %mask_cmg21_4, %next_remaining_cmg21_4 = pto.plt_b16 %remaining_cmg21_4 : i32 -> !pto.mask, i32 + %vec_cmg21_4 = pto.vlds %ub_in_cmg21_4[%offset_cmg21_4] : !pto.ptr -> !pto.vreg<128xi16> + pto.vsts %vec_cmg21_4, %ub_out_cmg21_4[%offset_cmg21_4], %mask_cmg21_4 {dist = "PK_B16"} : !pto.vreg<128xi16>, !pto.ptr, !pto.mask + scf.yield %next_remaining_cmg21_4 : i32 + } + } 
+ + pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] + pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] + pto.mte_ub_gm %ub_out_cmg21_4, %arg1, %c128_i64_cmg21_4 + nburst(%c32_i64_cmg21_4, %c128_i64_cmg21_4, %c128_i64_cmg21_4) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64 + pto.barrier #pto.pipe + } return } } diff --git a/test/vpto/cases/micro-op/vector-load-store/issue-173-vsts-signed-signless/kernel.pto b/test/vpto/cases/micro-op/vector-load-store/issue-173-vsts-signed-signless/kernel.pto index ae350dcc5..be1027706 100644 --- a/test/vpto/cases/micro-op/vector-load-store/issue-173-vsts-signed-signless/kernel.pto +++ b/test/vpto/cases/micro-op/vector-load-store/issue-173-vsts-signed-signless/kernel.pto @@ -5,77 +5,77 @@ // scenarios: signed-i16, signless-i16, same-module, issue-173-regression // ----------------------------------------------------------------------------- module attributes {pto.target_arch = "a5", pto.kernel_kind = #pto.kernel_kind} { - func.func @copy_signed_i16_kernel(%arg0: !pto.ptr, - %arg1: !pto.ptr) attributes {pto.kernel} { - %c0 = arith.constant 0 : index - %c128 = arith.constant 128 : index - %c1024 = arith.constant 1024 : index - %c0_i64 = arith.constant 0 : i64 - %c1_i64 = arith.constant 1 : i64 - %c32_i64 = arith.constant 32 : i64 - %c64_i64 = arith.constant 64 : i64 - %c2048_i64 = arith.constant 2048 : i64 + func.func @issue_173_vsts_signed_signless_deep_merged_kernel(%arg0: !pto.ptr, %arg1: !pto.ptr, %arg2: !pto.ptr, %arg3: !pto.ptr) attributes {pto.kernel} { + // active merged from copy_signed_i16_kernel - %ub_in = pto.castptr %c0_i64 : i64 -> !pto.ptr - %ub_out = pto.castptr %c2048_i64 : i64 -> !pto.ptr + %c0_m0 = arith.constant 0 : index + %c128_m0 = arith.constant 128 : index + %c1024_m0 = arith.constant 1024 : index + %c0_i64_m0 = arith.constant 0 : i64 + %c1_i64_m0 = arith.constant 1 : i64 + %c32_i64_m0 = arith.constant 32 : i64 + %c64_i64_m0 = arith.constant 64 : i64 + %c2048_i64_m0 = arith.constant 2048 : i64 - %false = 
arith.constant false - pto.set_loop_size_outtoub %c1_i64, %c1_i64 : i64, i64 - pto.copy_gm_to_ubuf %arg0, %ub_in, %c0_i64, %c32_i64, %c64_i64, %c0_i64, %c0_i64, %false, %c0_i64, %c64_i64, %c64_i64 : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64, i1, i64, i64, i64 + %ub_in_m0 = pto.castptr %c0_i64_m0 : i64 -> !pto.ptr + %ub_out_m0 = pto.castptr %c2048_i64_m0 : i64 -> !pto.ptr + + %false_m0 = arith.constant false + pto.set_loop_size_outtoub %c1_i64_m0, %c1_i64_m0 : i64, i64 + pto.copy_gm_to_ubuf %arg0, %ub_in_m0, %c0_i64_m0, %c32_i64_m0, %c64_i64_m0, %c0_i64_m0, %c0_i64_m0, %false_m0, %c0_i64_m0, %c64_i64_m0, %c64_i64_m0 : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64, i1, i64, i64, i64 pto.set_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] pto.wait_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] pto.vecscope { - %mask = pto.pset_b16 "PAT_ALL" : !pto.mask - scf.for %offset = %c0 to %c1024 step %c128 { - %vec = pto.vlds %ub_in[%offset] : !pto.ptr -> !pto.vreg<128xsi16> - pto.vsts %vec, %ub_out[%offset], %mask : !pto.vreg<128xsi16>, !pto.ptr, !pto.mask + %mask_m0 = pto.pset_b16 "PAT_ALL" : !pto.mask + scf.for %offset_m0 = %c0_m0 to %c1024_m0 step %c128_m0 { + %vec_m0 = pto.vlds %ub_in_m0[%offset_m0] : !pto.ptr -> !pto.vreg<128xsi16> + pto.vsts %vec_m0, %ub_out_m0[%offset_m0], %mask_m0 : !pto.vreg<128xsi16>, !pto.ptr, !pto.mask } } pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.set_loop_size_ubtoout %c1_i64, %c1_i64 : i64, i64 - pto.copy_ubuf_to_gm %ub_out, %arg1, %c0_i64, %c32_i64, %c64_i64, %c0_i64, %c64_i64, %c64_i64 : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64, i64 + pto.set_loop_size_ubtoout %c1_i64_m0, %c1_i64_m0 : i64, i64 + pto.copy_ubuf_to_gm %ub_out_m0, %arg1, %c0_i64_m0, %c32_i64_m0, %c64_i64_m0, %c0_i64_m0, %c64_i64_m0, %c64_i64_m0 : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64, i64 pto.barrier #pto.pipe - return - } - func.func @copy_signless_i16_kernel(%arg0: !pto.ptr, - %arg1: !pto.ptr) attributes {pto.kernel} { - 
%c0 = arith.constant 0 : index - %c128 = arith.constant 128 : index - %c1024 = arith.constant 1024 : index - %c0_i64 = arith.constant 0 : i64 - %c1_i64 = arith.constant 1 : i64 - %c32_i64 = arith.constant 32 : i64 - %c64_i64 = arith.constant 64 : i64 - %c2048_i64 = arith.constant 2048 : i64 + // active merged from copy_signless_i16_kernel - %ub_in = pto.castptr %c0_i64 : i64 -> !pto.ptr - %ub_out = pto.castptr %c2048_i64 : i64 -> !pto.ptr + %c0_m1 = arith.constant 0 : index + %c128_m1 = arith.constant 128 : index + %c1024_m1 = arith.constant 1024 : index + %c0_i64_m1 = arith.constant 0 : i64 + %c1_i64_m1 = arith.constant 1 : i64 + %c32_i64_m1 = arith.constant 32 : i64 + %c64_i64_m1 = arith.constant 64 : i64 + %c2048_i64_m1 = arith.constant 2048 : i64 - %false = arith.constant false - pto.set_loop_size_outtoub %c1_i64, %c1_i64 : i64, i64 - pto.copy_gm_to_ubuf %arg0, %ub_in, %c0_i64, %c32_i64, %c64_i64, %c0_i64, %c0_i64, %false, %c0_i64, %c64_i64, %c64_i64 : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64, i1, i64, i64, i64 + %ub_in_m1 = pto.castptr %c0_i64_m1 : i64 -> !pto.ptr + %ub_out_m1 = pto.castptr %c2048_i64_m1 : i64 -> !pto.ptr + + %false_m1 = arith.constant false + pto.set_loop_size_outtoub %c1_i64_m1, %c1_i64_m1 : i64, i64 + pto.copy_gm_to_ubuf %arg2, %ub_in_m1, %c0_i64_m1, %c32_i64_m1, %c64_i64_m1, %c0_i64_m1, %c0_i64_m1, %false_m1, %c0_i64_m1, %c64_i64_m1, %c64_i64_m1 : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64, i1, i64, i64, i64 pto.set_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] pto.wait_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] pto.vecscope { - %mask = pto.pset_b16 "PAT_ALL" : !pto.mask - scf.for %offset = %c0 to %c1024 step %c128 { - %vec = pto.vlds %ub_in[%offset] : !pto.ptr -> !pto.vreg<128xi16> - pto.vsts %vec, %ub_out[%offset], %mask : !pto.vreg<128xi16>, !pto.ptr, !pto.mask + %mask_m1 = pto.pset_b16 "PAT_ALL" : !pto.mask + scf.for %offset_m1 = %c0_m1 to %c1024_m1 step %c128_m1 { + %vec_m1 = pto.vlds %ub_in_m1[%offset_m1] : !pto.ptr -> 
!pto.vreg<128xi16> + pto.vsts %vec_m1, %ub_out_m1[%offset_m1], %mask_m1 : !pto.vreg<128xi16>, !pto.ptr, !pto.mask } } pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.set_loop_size_ubtoout %c1_i64, %c1_i64 : i64, i64 - pto.copy_ubuf_to_gm %ub_out, %arg1, %c0_i64, %c32_i64, %c64_i64, %c0_i64, %c64_i64, %c64_i64 : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64, i64 + pto.set_loop_size_ubtoout %c1_i64_m1, %c1_i64_m1 : i64, i64 + pto.copy_ubuf_to_gm %ub_out_m1, %arg3, %c0_i64_m1, %c32_i64_m1, %c64_i64_m1, %c0_i64_m1, %c64_i64_m1, %c64_i64_m1 : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64, i64 pto.barrier #pto.pipe + return } } diff --git a/test/vpto/cases/micro-op/vector-load-store/issue-173-vsts-signed-signless/launch.cpp b/test/vpto/cases/micro-op/vector-load-store/issue-173-vsts-signed-signless/launch.cpp index 3153aaff6..671d7c2d5 100644 --- a/test/vpto/cases/micro-op/vector-load-store/issue-173-vsts-signed-signless/launch.cpp +++ b/test/vpto/cases/micro-op/vector-load-store/issue-173-vsts-signed-signless/launch.cpp @@ -39,17 +39,16 @@ struct MrgSortExecutedNumList { #include "acl/acl.h" #endif -extern "C" __global__ [aicore] void copy_signed_i16_kernel(__gm__ int16_t *v1, - __gm__ int16_t *v2); -extern "C" __global__ [aicore] void copy_signless_i16_kernel( - __gm__ int16_t *v3, __gm__ int16_t *v4); +extern "C" __global__ [aicore] void issue_173_vsts_signed_signless_deep_merged_kernel( + __gm__ int16_t * arg0, + __gm__ int16_t * arg1, + __gm__ int16_t * arg2, + __gm__ int16_t * arg3); -void LaunchCopySignedI16Kernel(int16_t *v1, int16_t *v2, void *stream) { - copy_signed_i16_kernel<<<1, nullptr, stream>>>((__gm__ int16_t *)v1, - (__gm__ int16_t *)v2); -} - -void LaunchCopySignlessI16Kernel(int16_t *v3, int16_t *v4, void *stream) { - copy_signless_i16_kernel<<<1, nullptr, stream>>>((__gm__ int16_t *)v3, - (__gm__ int16_t *)v4); +void LaunchIssue173VstsSignedSignlessDeepMerged(int16_t * p0, int16_t * p1, int16_t * 
p2, int16_t * p3, void *stream) { + issue_173_vsts_signed_signless_deep_merged_kernel<<<1, nullptr, stream>>>( + (__gm__ int16_t *)p0, + (__gm__ int16_t *)p1, + (__gm__ int16_t *)p2, + (__gm__ int16_t *)p3); } diff --git a/test/vpto/cases/micro-op/vector-load-store/issue-173-vsts-signed-signless/main.cpp b/test/vpto/cases/micro-op/vector-load-store/issue-173-vsts-signed-signless/main.cpp index 4500f5748..40869f975 100644 --- a/test/vpto/cases/micro-op/vector-load-store/issue-173-vsts-signed-signless/main.cpp +++ b/test/vpto/cases/micro-op/vector-load-store/issue-173-vsts-signed-signless/main.cpp @@ -54,9 +54,8 @@ struct MrgSortExecutedNumList { } \ } while (0) -void LaunchCopySignedI16Kernel(int16_t *v1, int16_t *v2, void *stream); -void LaunchCopySignlessI16Kernel(int16_t *v3, int16_t *v4, void *stream); +void LaunchIssue173VstsSignedSignlessDeepMerged(int16_t * p0, int16_t * p1, int16_t * p2, int16_t * p3, void *stream); int main() { constexpr size_t elemCount = 1024; constexpr size_t fileSize = elemCount * sizeof(int16_t); @@ -119,8 +118,13 @@ int main() { ACL_CHECK(aclrtMemcpy(v4Device, fileSize, v4Host, fileSize, ACL_MEMCPY_HOST_TO_DEVICE)); - LaunchCopySignedI16Kernel(v1Device, v2Device, stream); - LaunchCopySignlessI16Kernel(v3Device, v4Device, stream); + LaunchIssue173VstsSignedSignlessDeepMerged( + v1Device, + v2Device, + v3Device, + v4Device, + stream + ); ACL_CHECK(aclrtSynchronizeStream(stream)); ACL_CHECK(aclrtMemcpy(v2Host, fileSize, v2Device, fileSize, diff --git a/test/vpto/cases/micro-op/vector-load-store/vldas-vldus-state-chain/compare.py b/test/vpto/cases/micro-op/vector-load-store/vldas-vldus-state-chain/compare.py deleted file mode 100755 index bc0d8fc41..000000000 --- a/test/vpto/cases/micro-op/vector-load-store/vldas-vldus-state-chain/compare.py +++ /dev/null @@ -1,210 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) 2026 Huawei Technologies Co., Ltd. 
-# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. - -# case: micro-op/vector-load-store/vldas-vldus-state-chain -# family: vector-load-store -# target_ops: pto.vldas, pto.vldus -# scenarios: core-f32, full-mask, unaligned, stream-state, state-update -# NOTE: bulk-generated coverage skeleton. -# coding=utf-8 - -import os -import sys -import numpy as np - - -REPEAT_BYTES = 256 -PREFIX_ELEMS = 128 - - -def _ceil_div(x, y): - return (x + y - 1) // y - - -def _packed_pred_storage_bytes(logical_elems, src_elem_bytes): - logical_elems = int(logical_elems) - src_elem_bytes = int(src_elem_bytes) - if logical_elems <= 0: - raise ValueError(f"logical_elems must be > 0, got {logical_elems}") - if src_elem_bytes not in (1, 2, 4): - raise ValueError(f"unsupported packed predicate source size: {src_elem_bytes}") - - repeat_elems = REPEAT_BYTES // src_elem_bytes - if src_elem_bytes == 4: - repeat_times = _ceil_div(logical_elems, repeat_elems) + 1 - loop_count = repeat_times // 2 - return loop_count * 16 - - repeat_times = _ceil_div(logical_elems, repeat_elems) - return repeat_times * (repeat_elems // 8) - - -def compare_bin(golden_path, output_path, dtype, eps): - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - dtype_np = np.dtype(dtype) - golden = np.fromfile(golden_path, dtype=dtype_np) - output 
= np.fromfile(output_path, dtype=dtype_np) - if golden.shape != output.shape: - print(f"[ERROR] Shape mismatch: {golden_path} {golden.shape} vs {output_path} {output.shape}") - return False - if not np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True): - if golden.size: - if np.issubdtype(dtype_np, np.floating): - g = golden.astype(np.float64, copy=False) - o = output.astype(np.float64, copy=False) - elif np.issubdtype(dtype_np, np.integer) or np.issubdtype(dtype_np, np.unsignedinteger): - g = golden.astype(np.int64, copy=False) - o = output.astype(np.int64, copy=False) - else: - g = golden.astype(np.float64, copy=False) - o = output.astype(np.float64, copy=False) - abs_diff = np.abs(g - o) - idx = int(np.argmax(abs_diff)) - diff = float(abs_diff[idx]) - print( - f"[ERROR] Mismatch: {golden_path} vs {output_path}, max diff={diff} at idx={idx} " - f"(golden={g[idx]}, out={o[idx]}, dtype={dtype_np})" - ) - else: - print(f"[ERROR] Mismatch: {golden_path} vs {output_path}, empty buffers, dtype={dtype_np}") - return False - return True - - -def compare_bin_prefix(golden_path, output_path, dtype, eps, count): - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - try: - count = int(count) - except Exception: - print(f"[ERROR] Invalid prefix count: {count}") - return False - if count <= 0: - print(f"[ERROR] Invalid prefix count: {count}") - return False - - dtype_np = np.dtype(dtype) - golden = np.fromfile(golden_path, dtype=dtype_np, count=count) - output = np.fromfile(output_path, dtype=dtype_np, count=count) - - if golden.size != count or output.size != count: - print( - f"[ERROR] Prefix read too small: need={count} elems, " - f"golden={golden.size}, out={output.size}" - ) - return False - - if not np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True): - if golden.size: - if np.issubdtype(dtype_np, 
np.floating): - g = golden.astype(np.float64, copy=False) - o = output.astype(np.float64, copy=False) - elif np.issubdtype(dtype_np, np.integer) or np.issubdtype(dtype_np, np.unsignedinteger): - g = golden.astype(np.int64, copy=False) - o = output.astype(np.int64, copy=False) - else: - g = golden.astype(np.float64, copy=False) - o = output.astype(np.float64, copy=False) - abs_diff = np.abs(g - o) - idx = int(np.argmax(abs_diff)) - diff = float(abs_diff[idx]) - print( - f"[ERROR] Mismatch (prefix): {golden_path} vs {output_path}, max diff={diff} at idx={idx} " - f"(golden={g[idx]}, out={o[idx]}, dtype={dtype_np}, count={count})" - ) - else: - print(f"[ERROR] Mismatch (prefix): {golden_path} vs {output_path}, empty buffers, dtype={dtype_np}") - return False - return True - - -def compare_packed_pred_mask(golden_path, output_path, logical_elems, src_elem_bytes): - """ - Compare outputs of pto.tcmp / pto.tcmps. - - PTO-ISA stores packed predicate results as a linear PK byte stream via - `psts`, with the exact written prefix length determined by the typed - TCMP/TCMPS repeat schedule. Compare only that semantic prefix. 
- """ - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - try: - logical_elems = int(logical_elems) - src_elem_bytes = int(src_elem_bytes) - except Exception: - print( - "[ERROR] Invalid packed mask compare arguments: " - f"logical_elems={logical_elems} src_elem_bytes={src_elem_bytes}" - ) - return False - if logical_elems <= 0 or src_elem_bytes <= 0: - print( - "[ERROR] Invalid packed mask compare arguments: " - f"logical_elems={logical_elems} src_elem_bytes={src_elem_bytes}" - ) - return False - - golden = np.fromfile(golden_path, dtype=np.uint8) - output = np.fromfile(output_path, dtype=np.uint8) - try: - prefix_bytes = _packed_pred_storage_bytes(logical_elems, src_elem_bytes) - except ValueError as exc: - print(f"[ERROR] {exc}") - return False - - if golden.size < prefix_bytes or output.size < prefix_bytes: - print( - f"[ERROR] Packed mask buffer too small: need={prefix_bytes} bytes, " - f"golden={golden.size}, out={output.size}" - ) - return False - - golden_sel = golden[:prefix_bytes] - output_sel = output[:prefix_bytes] - - if not np.array_equal(golden_sel, output_sel): - diff = np.nonzero(golden_sel != output_sel)[0] - idx = int(diff[0]) if diff.size else 0 - print( - f"[ERROR] Mismatch (packed mask): {golden_path} vs {output_path}, first diff at idx={idx} " - f"(golden={int(golden_sel[idx])}, out={int(output_sel[idx])}, " - f"logical_elems={logical_elems}, src_elem_bytes={src_elem_bytes}, prefix_bytes={prefix_bytes})" - ) - return False - return True - - -def main(): - strict = os.getenv("COMPARE_STRICT", "1") != "0" - ok = True - ok = compare_bin_prefix("golden_v2.bin", "v2.bin", np.float32, 0.0001, PREFIX_ELEMS) and ok - if not ok: - if strict: - print("[ERROR] compare failed") - sys.exit(2) - print("[WARN] compare failed (non-gating)") - return - print("[INFO] compare passed") - - -if __name__ == 
"__main__": - main() diff --git a/test/vpto/cases/micro-op/vector-load-store/vldas-vldus-state-chain/golden.py b/test/vpto/cases/micro-op/vector-load-store/vldas-vldus-state-chain/golden.py deleted file mode 100755 index 926db9342..000000000 --- a/test/vpto/cases/micro-op/vector-load-store/vldas-vldus-state-chain/golden.py +++ /dev/null @@ -1,56 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) 2026 Huawei Technologies Co., Ltd. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. - -# case: micro-op/vector-load-store/vldas-vldus-state-chain -# family: vector-load-store -# target_ops: pto.vldas, pto.vldus -# scenarios: core-f32, full-mask, unaligned, repeated-no-post -# NOTE: bulk-generated coverage skeleton. 
-# coding=utf-8 - -import argparse -from pathlib import Path - -import numpy as np - - -ROWS = 32 -COLS = 32 -LANES = 64 -SEED = 19 - - -def generate(output_dir: Path, seed: int) -> None: - rng = np.random.default_rng(seed) - v1 = rng.uniform(-8.0, 8.0, size=(ROWS, COLS)).astype(np.float32) - v2 = np.zeros((ROWS, COLS), dtype=np.float32) - golden_v2 = np.zeros((ROWS, COLS), dtype=np.float32) - flat_in = v1.reshape(-1) - flat_out = golden_v2.reshape(-1) - flat_out[:LANES] = flat_in[1 : 1 + LANES] - flat_out[LANES : 2 * LANES] = flat_in[65 : 65 + LANES] - - output_dir.mkdir(parents=True, exist_ok=True) - v1.reshape(-1).tofile(output_dir / "v1.bin") - v2.reshape(-1).tofile(output_dir / "v2.bin") - golden_v2.reshape(-1).tofile(output_dir / "golden_v2.bin") - - -def main() -> None: - parser = argparse.ArgumentParser( - description="Generate numpy-based inputs/golden for VPTO micro-op vldas-vldus state-chain validation." - ) - parser.add_argument("--output-dir", type=Path, default=Path(".")) - parser.add_argument("--seed", type=int, default=SEED) - args = parser.parse_args() - generate(args.output_dir, args.seed) - - -if __name__ == "__main__": - main() diff --git a/test/vpto/cases/micro-op/vector-load-store/vldas-vldus-state-chain/kernel.pto b/test/vpto/cases/micro-op/vector-load-store/vldas-vldus-state-chain/kernel.pto deleted file mode 100644 index e460e1308..000000000 --- a/test/vpto/cases/micro-op/vector-load-store/vldas-vldus-state-chain/kernel.pto +++ /dev/null @@ -1,61 +0,0 @@ -// ----------------------------------------------------------------------------- -// case: micro-op/vector-load-store/vldas-vldus-state-chain -// family: vector-load-store -// target_ops: pto.vldas, pto.vldus -// scenarios: core-f32, full-mask, unaligned, repeated-no-post -// NOTE: bulk-generated coverage skeleton. Parser/verifier/lowering failure is -// still a valid test conclusion in the current coverage-first phase. 
-// ----------------------------------------------------------------------------- -// Validate repeated no-post unaligned loads. Each `pto.vldus` is paired with -// its own `pto.vldas` and uses an explicit unaligned source pointer; the second -// load does not depend on state returned from the first one. - -module attributes {pto.target_arch = "a5", pto.kernel_kind = #pto.kernel_kind} { - func.func @vldas_vldus_state_chain_kernel_2d(%arg0: !pto.ptr, %arg1: !pto.ptr) attributes {pto.kernel} { - %c0 = arith.constant 0 : index - %c1 = arith.constant 1 : index - %c64 = arith.constant 64 : index - %c1024 = arith.constant 1024 : index - %c0_i64 = arith.constant 0 : i64 - %c1_i64 = arith.constant 1 : i64 - %c32_i64 = arith.constant 32 : i64 - %c128_i64 = arith.constant 128 : i64 - %c4096_i64 = arith.constant 4096 : i64 - %c1024_i32 = arith.constant 1024 : i32 - - %ub_in = pto.castptr %c0_i64 : i64 -> !pto.ptr - %ub_out = pto.castptr %c4096_i64 : i64 -> !pto.ptr - - %false = arith.constant false - pto.mte_gm_ub %arg0, %ub_in, %c0_i64, %c128_i64 - nburst(%c32_i64, %c128_i64, %c128_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 - - pto.set_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] - pto.wait_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] - - pto.vecscope { - scf.for %offset = %c0 to %c64 step %c64 { - %src0 = pto.addptr %ub_in, %c1 : !pto.ptr -> !pto.ptr - %src1 = pto.addptr %src0, %c64 : !pto.ptr -> !pto.ptr - %align0 = pto.vldas %src0 : !pto.ptr -> !pto.align - %mask, %next_remaining = pto.plt_b32 %c1024_i32 : i32 -> !pto.mask, i32 - %out0, %align1 = pto.vldus %src0, %align0 - : !pto.ptr, !pto.align -> !pto.vreg<64xf32>, !pto.align - %align2 = pto.vldas %src1 : !pto.ptr -> !pto.align - %out1, %align3 = pto.vldus %src1, %align2 - : !pto.ptr, !pto.align -> !pto.vreg<64xf32>, !pto.align - pto.vsts %out0, %ub_out[%offset], %mask : !pto.vreg<64xf32>, !pto.ptr, !pto.mask - pto.vsts %out1, %ub_out[%c64], %mask : !pto.vreg<64xf32>, !pto.ptr, !pto.mask - } - } - - 
pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.mte_ub_gm %ub_out, %arg1, %c128_i64 - nburst(%c32_i64, %c128_i64, %c128_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64 - pto.barrier #pto.pipe - return - } -} diff --git a/test/vpto/cases/micro-op/vector-load-store/vldas-vldus-state-chain/launch.cpp b/test/vpto/cases/micro-op/vector-load-store/vldas-vldus-state-chain/launch.cpp deleted file mode 100644 index 6a077aa65..000000000 --- a/test/vpto/cases/micro-op/vector-load-store/vldas-vldus-state-chain/launch.cpp +++ /dev/null @@ -1,70 +0,0 @@ -// Copyright (c) 2026 Huawei Technologies Co., Ltd. -// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). -// Please refer to the License for details. You may not use this file except in compliance with the License. -// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -// See LICENSE in the root of the software repository for the full text of the License. - -// ----------------------------------------------------------------------------- -// case: micro-op/vector-load-store/vldas-vldus-state-chain -// family: vector-load-store -// target_ops: pto.vldas, pto.vldus -// scenarios: core-f32, full-mask, unaligned, stream-state, state-update -// NOTE: bulk-generated coverage skeleton. Parser/verifier/lowering failure is -// still a valid test conclusion in the current coverage-first phase. 
-// ----------------------------------------------------------------------------- -// --------------------------------------------------------------------------- -// PTOAS compatibility layer -// -// The upstream pto-isa headers reference some FP8/FP4 types and the -// __VEC_SCOPE__ marker that are not available on every AICore arch/toolchain -// combination (e.g. __NPU_ARCH__==2201). -// -// For our PTOAS-generated kernels we don't rely on these types today, but the -// headers still mention them in templates/static_asserts. Provide minimal -// fallbacks to keep compilation working on dav-c220. -// --------------------------------------------------------------------------- -#ifndef __VEC_SCOPE__ -#define __VEC_SCOPE__ -#endif - -#if defined(__CCE_AICORE__) && defined(__NPU_ARCH__) && (__NPU_ARCH__ == 2201) -typedef struct { unsigned char v; } hifloat8_t; -typedef struct { unsigned char v; } float8_e4m3_t; -typedef struct { unsigned char v; } float8_e5m2_t; -typedef struct { unsigned char v; } float8_e8m0_t; -typedef struct { unsigned char v; } float4_e1m2x2_t; -typedef struct { unsigned char v; } float4_e2m1x2_t; -#endif -#include - -// AICore printf support is gated behind `--cce-enable-print` on some -// toolchains. When enabled, include the CCE print header so `cce::printf` -// resolves in device compilation. -#if defined(__CCE_AICORE__) && defined(PTOAS_ENABLE_CCE_PRINT) -#include -#endif - -// Some PTO-ISA types are only available in the __CCE_AICORE__ compilation -// path, but `bisheng -xcce` still performs a host-side parse pass. -// Provide minimal fallbacks only when the corresponding header wasn't -// pulled in by the selected arch implementation. 
-#if !defined(__CCE_AICORE__) && !defined(TMRGSORT_HPP) -struct MrgSortExecutedNumList { - uint16_t mrgSortList0; - uint16_t mrgSortList1; - uint16_t mrgSortList2; - uint16_t mrgSortList3; -}; -#endif -#ifndef __CPU_SIM -#include "acl/acl.h" -#endif - -extern "C" __global__ [aicore] void vldas_vldus_state_chain_kernel_2d(__gm__ float *v1, - __gm__ float *v2); - -void LaunchVldasVldusStateChain_kernel_2d(float *v1, float *v2, void *stream) { - vldas_vldus_state_chain_kernel_2d<<<1, nullptr, stream>>>((__gm__ float *)v1, (__gm__ float *)v2); -} diff --git a/test/vpto/cases/micro-op/vector-load-store/vldas-vldus-state-chain/main.cpp b/test/vpto/cases/micro-op/vector-load-store/vldas-vldus-state-chain/main.cpp deleted file mode 100644 index 95044a646..000000000 --- a/test/vpto/cases/micro-op/vector-load-store/vldas-vldus-state-chain/main.cpp +++ /dev/null @@ -1,130 +0,0 @@ -// Copyright (c) 2026 Huawei Technologies Co., Ltd. -// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). -// Please refer to the License for details. You may not use this file except in compliance with the License. -// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -// See LICENSE in the root of the software repository for the full text of the License. - -// ----------------------------------------------------------------------------- -// case: micro-op/vector-load-store/vldas-vldus-state-chain -// family: vector-load-store -// target_ops: pto.vldas, pto.vldus -// scenarios: core-f32, full-mask, unaligned, stream-state, state-update -// NOTE: bulk-generated coverage skeleton. Parser/verifier/lowering failure is -// still a valid test conclusion in the current coverage-first phase. 
-// ----------------------------------------------------------------------------- -/** -Copyright (c) 2025 Huawei Technologies Co., Ltd. -This program is free software, you can redistribute it and/or modify it under the terms and conditions of -CANN Open Software License Agreement Version 2.0 (the "License"). -Please refer to the License for details. You may not use this file except in compliance with the License. -THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -See LICENSE in the root of the software repository for the full text of the License. -*/ - -#include "test_common.h" -#include "acl/acl.h" -#include -#include -#include - -using namespace PtoTestCommon; - -#ifndef TMRGSORT_HPP -struct MrgSortExecutedNumList { - uint16_t mrgSortList0; - uint16_t mrgSortList1; - uint16_t mrgSortList2; - uint16_t mrgSortList3; -}; -#endif - -#define ACL_CHECK(expr) \ - do { \ - const aclError _ret = (expr); \ - if (_ret != ACL_SUCCESS) { \ - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", #expr, (int)_ret, __FILE__, __LINE__); \ - const char *_recent = aclGetRecentErrMsg(); \ - if (_recent != nullptr && _recent[0] != '\0') { \ - std::fprintf(stderr, "[ERROR] RecentErrMsg: %s\n", _recent); \ - } \ - rc = 1; \ - goto cleanup; \ - } \ - } while (0) - -void LaunchVldasVldusStateChain_kernel_2d(float *v1, float *v2, void *stream); - -int main() { - size_t elemCount_v1 = 1024; - size_t fileSize_v1 = elemCount_v1 * sizeof(float); - size_t elemCount_v2 = 1024; - size_t fileSize_v2 = elemCount_v2 * sizeof(float); - float *v1Host = nullptr; - float *v1Device = nullptr; - float *v2Host = nullptr; - float *v2Device = nullptr; - - int rc = 0; - bool aclInited = false; - bool deviceSet = false; - int deviceId = 0; - aclrtStream stream = nullptr; - - ACL_CHECK(aclInit(nullptr)); - aclInited = true; - if (const char *envDevice 
= std::getenv("ACL_DEVICE_ID")) { - deviceId = std::atoi(envDevice); - } - ACL_CHECK(aclrtSetDevice(deviceId)); - deviceSet = true; - ACL_CHECK(aclrtCreateStream(&stream)); - - ACL_CHECK(aclrtMallocHost((void **)(&v1Host), fileSize_v1)); - ACL_CHECK(aclrtMallocHost((void **)(&v2Host), fileSize_v2)); - ACL_CHECK(aclrtMalloc((void **)&v1Device, fileSize_v1, ACL_MEM_MALLOC_HUGE_FIRST)); - ACL_CHECK(aclrtMalloc((void **)&v2Device, fileSize_v2, ACL_MEM_MALLOC_HUGE_FIRST)); - - ReadFile("./v1.bin", fileSize_v1, v1Host, fileSize_v1); - ReadFile("./v2.bin", fileSize_v2, v2Host, fileSize_v2); - ACL_CHECK(aclrtMemcpy(v1Device, fileSize_v1, v1Host, fileSize_v1, ACL_MEMCPY_HOST_TO_DEVICE)); - ACL_CHECK(aclrtMemcpy(v2Device, fileSize_v2, v2Host, fileSize_v2, ACL_MEMCPY_HOST_TO_DEVICE)); - LaunchVldasVldusStateChain_kernel_2d(v1Device, v2Device, stream); - - ACL_CHECK(aclrtSynchronizeStream(stream)); - ACL_CHECK(aclrtMemcpy(v2Host, fileSize_v2, v2Device, fileSize_v2, ACL_MEMCPY_DEVICE_TO_HOST)); - - WriteFile("./v2.bin", v2Host, fileSize_v2); - -cleanup: - aclrtFree(v1Device); - aclrtFree(v2Device); - aclrtFreeHost(v1Host); - aclrtFreeHost(v2Host); - if (stream != nullptr) { - const aclError _ret = aclrtDestroyStream(stream); - if (_ret != ACL_SUCCESS) { - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", - "aclrtDestroyStream(stream)", (int)_ret, __FILE__, __LINE__); - } - stream = nullptr; - } - if (deviceSet) { - const aclError _ret = aclrtResetDevice(deviceId); - if (_ret != ACL_SUCCESS) { - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", - "aclrtResetDevice(deviceId)", (int)_ret, __FILE__, __LINE__); - } - } - if (aclInited) { - const aclError _ret = aclFinalize(); - if (_ret != ACL_SUCCESS) { - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", - "aclFinalize()", (int)_ret, __FILE__, __LINE__); - } - } - - return rc; -} diff --git a/test/vpto/cases/micro-op/vector-load-store/vldas-vldus/kernel.pto 
b/test/vpto/cases/micro-op/vector-load-store/vldas-vldus/kernel.pto index c5e6da2b5..ef3fbaded 100644 --- a/test/vpto/cases/micro-op/vector-load-store/vldas-vldus/kernel.pto +++ b/test/vpto/cases/micro-op/vector-load-store/vldas-vldus/kernel.pto @@ -1,71 +1,462 @@ -// ----------------------------------------------------------------------------- -// case: micro-op/vector-load-store/vldas-vldus -// family: vector-load-store -// target_ops: pto.vldas, pto.vldus -// scenarios: core-f32, full-mask, unaligned, stream-state -// NOTE: bulk-generated coverage skeleton. Parser/verifier/lowering failure is -// still a valid test conclusion in the current coverage-first phase. -// ----------------------------------------------------------------------------- -// ============================================================================= -// abs_kernel_2d: Element-wise absolute value on a 32x32 f32 tile -// ============================================================================= -// This kernel computes abs(input) for a 32x32 float32 matrix (1024 elements). -// -// Memory Layout: -// - Input: arg0 -> GM (Global Memory) -// - Output: arg1 -> GM (Global Memory) -// - UB (Unified Buffer) at offset 0: input tile (4096 bytes = 32*32*4) -// - UB at offset 4096: output tile (4096 bytes = 32*32*4) -// -// Pipeline: -// 1. DMA: GM -> UB (MTE2 pipe) - copy input tile to UB -// 2. Sync: wait for MTE2 -> V pipe handoff -// 3. Compute: vabs on 64-element vectors (V pipe) - 16 iterations for 1024 elements -// 4. Sync: wait for V -> MTE3 pipe handoff -// 5. 
DMA: UB -> GM (MTE3 pipe) - copy result tile back to GM -// ============================================================================= - module attributes {pto.target_arch = "a5", pto.kernel_kind = #pto.kernel_kind} { - func.func @vabs_kernel_2d(%arg0: !pto.ptr, %arg1: !pto.ptr) attributes {pto.kernel} { - %c0 = arith.constant 0 : index - %c1 = arith.constant 1 : index - %c64 = arith.constant 64 : index - %c1024 = arith.constant 1024 : index - %c0_i64 = arith.constant 0 : i64 - %c1_i64 = arith.constant 1 : i64 - %c32_i64 = arith.constant 32 : i64 - %c128_i64 = arith.constant 128 : i64 - %c4096_i64 = arith.constant 4096 : i64 - %c1024_i32 = arith.constant 1024 : i32 - - %ub_in = pto.castptr %c0_i64 : i64 -> !pto.ptr - %ub_out = pto.castptr %c4096_i64 : i64 -> !pto.ptr - - %false = arith.constant false - pto.mte_gm_ub %arg0, %ub_in, %c0_i64, %c128_i64 - nburst(%c32_i64, %c128_i64, %c128_i64) + func.func @vldas_vldus_deep_merged_kernel(%arg0: !pto.ptr, %arg1: !pto.ptr, %arg2: !pto.ptr, %arg3: !pto.ptr) attributes {pto.kernel} { + %__deep_merge_guard = arith.constant false + // inactive merged from vldas_vldus_state_chain_kernel_2d + scf.if %__deep_merge_guard { + + %c0_m0 = arith.constant 0 : index + %c1_m0 = arith.constant 1 : index + %c64_m0 = arith.constant 64 : index + %c1024_m0 = arith.constant 1024 : index + %c0_i64_m0 = arith.constant 0 : i64 + %c1_i64_m0 = arith.constant 1 : i64 + %c32_i64_m0 = arith.constant 32 : i64 + %c128_i64_m0 = arith.constant 128 : i64 + %c4096_i64_m0 = arith.constant 4096 : i64 + %c1024_i32_m0 = arith.constant 1024 : i32 + + %ub_in_m0 = pto.castptr %c0_i64_m0 : i64 -> !pto.ptr + %ub_out_m0 = pto.castptr %c4096_i64_m0 : i64 -> !pto.ptr + + %false_m0 = arith.constant false + pto.mte_gm_ub %arg0, %ub_in_m0, %c0_i64_m0, %c128_i64_m0 + nburst(%c32_i64_m0, %c128_i64_m0, %c128_i64_m0) : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 pto.set_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] pto.wait_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] pto.vecscope 
{ - scf.for %offset = %c0 to %c64 step %c64 { - %src0 = pto.addptr %ub_in, %c1 : !pto.ptr -> !pto.ptr - %align0 = pto.vldas %src0 : !pto.ptr -> !pto.align - %mask, %next_remaining = pto.plt_b32 %c1024_i32 : i32 -> !pto.mask, i32 - %out, %next_align = pto.vldus %src0, %align0 + scf.for %offset_m0 = %c0_m0 to %c64_m0 step %c64_m0 { + %src0_m0 = pto.addptr %ub_in_m0, %c1_m0 : !pto.ptr -> !pto.ptr + %src1_m0 = pto.addptr %src0_m0, %c64_m0 : !pto.ptr -> !pto.ptr + %align0_m0 = pto.vldas %src0_m0 : !pto.ptr -> !pto.align + %mask_m0, %next_remaining_m0 = pto.plt_b32 %c1024_i32_m0 : i32 -> !pto.mask, i32 + %out0_m0, %align1_m0 = pto.vldus %src0_m0, %align0_m0 + : !pto.ptr, !pto.align -> !pto.vreg<64xf32>, !pto.align + %align2_m0 = pto.vldas %src1_m0 : !pto.ptr -> !pto.align + %out1_m0, %align3_m0 = pto.vldus %src1_m0, %align2_m0 : !pto.ptr, !pto.align -> !pto.vreg<64xf32>, !pto.align - pto.vsts %out, %ub_out[%offset], %mask : !pto.vreg<64xf32>, !pto.ptr, !pto.mask + pto.vsts %out0_m0, %ub_out_m0[%offset_m0], %mask_m0 : !pto.vreg<64xf32>, !pto.ptr, !pto.mask + pto.vsts %out1_m0, %ub_out_m0[%c64_m0], %mask_m0 : !pto.vreg<64xf32>, !pto.ptr, !pto.mask } } pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.mte_ub_gm %ub_out, %arg1, %c128_i64 - nburst(%c32_i64, %c128_i64, %c128_i64) + pto.mte_ub_gm %ub_out_m0, %arg1, %c128_i64_m0 + nburst(%c32_i64_m0, %c128_i64_m0, %c128_i64_m0) : !pto.ptr, !pto.ptr, i64, i64, i64, i64 pto.barrier #pto.pipe + + } + // active merged from vabs_kernel_2d + + %c0_m1 = arith.constant 0 : index + %c1_m1 = arith.constant 1 : index + %c64_m1 = arith.constant 64 : index + %c1024_m1 = arith.constant 1024 : index + %c0_i64_m1 = arith.constant 0 : i64 + %c1_i64_m1 = arith.constant 1 : i64 + %c32_i64_m1 = arith.constant 32 : i64 + %c128_i64_m1 = arith.constant 128 : i64 + %c4096_i64_m1 = arith.constant 4096 : i64 + %c1024_i32_m1 = arith.constant 1024 : i32 + + %ub_in_m1 = pto.castptr %c0_i64_m1 : i64 -> 
!pto.ptr + %ub_out_m1 = pto.castptr %c4096_i64_m1 : i64 -> !pto.ptr + + %false_m1 = arith.constant false + pto.mte_gm_ub %arg2, %ub_in_m1, %c0_i64_m1, %c128_i64_m1 + nburst(%c32_i64_m1, %c128_i64_m1, %c128_i64_m1) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 + + pto.set_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] + pto.wait_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] + + pto.vecscope { + scf.for %offset_m1 = %c0_m1 to %c64_m1 step %c64_m1 { + %src0_m1 = pto.addptr %ub_in_m1, %c1_m1 : !pto.ptr -> !pto.ptr + %align0_m1 = pto.vldas %src0_m1 : !pto.ptr -> !pto.align + %mask_m1, %next_remaining_m1 = pto.plt_b32 %c1024_i32_m1 : i32 -> !pto.mask, i32 + %out_m1, %next_align_m1 = pto.vldus %src0_m1, %align0_m1 + : !pto.ptr, !pto.align -> !pto.vreg<64xf32>, !pto.align + pto.vsts %out_m1, %ub_out_m1[%offset_m1], %mask_m1 : !pto.vreg<64xf32>, !pto.ptr, !pto.mask + } + } + + pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] + pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] + pto.mte_ub_gm %ub_out_m1, %arg3, %c128_i64_m1 + nburst(%c32_i64_m1, %c128_i64_m1, %c128_i64_m1) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64 + pto.barrier #pto.pipe + %__case_merge_guard = arith.constant false + + // inactive cross-case merge from micro-op/vector-load-store/vlds + scf.if %__case_merge_guard { + + %__deep_merge_guard_cmg22_1 = arith.constant false + // inactive merged from vabs_kernel_2d_vlds_tail + scf.if %__deep_merge_guard_cmg22_1 { + + %c0_m0_cmg22_1 = arith.constant 0 : index + %c1_m0_cmg22_1 = arith.constant 1 : index + %c64_m0_cmg22_1 = arith.constant 64 : index + %c1024_m0_cmg22_1 = arith.constant 1024 : index + %c0_i64_m0_cmg22_1 = arith.constant 0 : i64 + %c1_i64_m0_cmg22_1 = arith.constant 1 : i64 + %c32_i64_m0_cmg22_1 = arith.constant 32 : i64 + %c128_i64_m0_cmg22_1 = arith.constant 128 : i64 + %c4096_i64_m0_cmg22_1 = arith.constant 4096 : i64 + %c1000_i32_m0_cmg22_1 = arith.constant 1000 : i32 + %c1024_i32_m0_cmg22_1 = arith.constant 1024 : i32 + + %ub_in_m0_cmg22_1 = pto.castptr 
%c0_i64_m0_cmg22_1 : i64 -> !pto.ptr + %ub_out_m0_cmg22_1 = pto.castptr %c4096_i64_m0_cmg22_1 : i64 -> !pto.ptr + + %false_m0_cmg22_1 = arith.constant false + pto.mte_gm_ub %arg0, %ub_in_m0_cmg22_1, %c0_i64_m0_cmg22_1, %c128_i64_m0_cmg22_1 + nburst(%c32_i64_m0_cmg22_1, %c128_i64_m0_cmg22_1, %c128_i64_m0_cmg22_1) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 + + pto.set_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] + pto.wait_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] + + pto.vecscope { + %__m0_cmg22_1:1 = scf.for %offset_m0_cmg22_1 = %c0_m0_cmg22_1 to %c1024_m0_cmg22_1 step %c64_m0_cmg22_1 iter_args(%remaining_m0_cmg22_1 = %c1000_i32_m0_cmg22_1) -> (i32) { + %mask_m0_cmg22_1, %next_remaining_m0_cmg22_1 = pto.plt_b32 %remaining_m0_cmg22_1 : i32 -> !pto.mask, i32 + %out_m0_cmg22_1 = pto.vlds %ub_in_m0_cmg22_1[%offset_m0_cmg22_1] {dist = "NORM"} : !pto.ptr -> !pto.vreg<64xf32> + pto.vsts %out_m0_cmg22_1, %ub_out_m0_cmg22_1[%offset_m0_cmg22_1], %mask_m0_cmg22_1 : !pto.vreg<64xf32>, !pto.ptr, !pto.mask + scf.yield %next_remaining_m0_cmg22_1 : i32 + } + } + + pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] + pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] + pto.mte_ub_gm %ub_out_m0_cmg22_1, %arg1, %c128_i64_m0_cmg22_1 + nburst(%c32_i64_m0_cmg22_1, %c128_i64_m0_cmg22_1, %c128_i64_m0_cmg22_1) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64 + pto.barrier #pto.pipe + + } + // active merged from vabs_kernel_2d + + %c0_m1_cmg22_1 = arith.constant 0 : index + %c1_m1_cmg22_1 = arith.constant 1 : index + %c64_m1_cmg22_1 = arith.constant 64 : index + %c1024_m1_cmg22_1 = arith.constant 1024 : index + %c0_i64_m1_cmg22_1 = arith.constant 0 : i64 + %c1_i64_m1_cmg22_1 = arith.constant 1 : i64 + %c32_i64_m1_cmg22_1 = arith.constant 32 : i64 + %c128_i64_m1_cmg22_1 = arith.constant 128 : i64 + %c4096_i64_m1_cmg22_1 = arith.constant 4096 : i64 + %c1024_i32_m1_cmg22_1 = arith.constant 1024 : i32 + + %ub_in_m1_cmg22_1 = pto.castptr %c0_i64_m1_cmg22_1 : i64 -> !pto.ptr + %ub_out_m1_cmg22_1 = 
pto.castptr %c4096_i64_m1_cmg22_1 : i64 -> !pto.ptr + + %false_m1_cmg22_1 = arith.constant false + pto.mte_gm_ub %arg2, %ub_in_m1_cmg22_1, %c0_i64_m1_cmg22_1, %c128_i64_m1_cmg22_1 + nburst(%c32_i64_m1_cmg22_1, %c128_i64_m1_cmg22_1, %c128_i64_m1_cmg22_1) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 + + pto.set_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] + pto.wait_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] + + pto.vecscope { + %__m1_cmg22_1:1 = scf.for %offset_m1_cmg22_1 = %c0_m1_cmg22_1 to %c1024_m1_cmg22_1 step %c64_m1_cmg22_1 iter_args(%remaining_m1_cmg22_1 = %c1024_i32_m1_cmg22_1) -> (i32) { + %mask_m1_cmg22_1, %next_remaining_m1_cmg22_1 = pto.plt_b32 %remaining_m1_cmg22_1 : i32 -> !pto.mask, i32 + %out_m1_cmg22_1 = pto.vlds %ub_in_m1_cmg22_1[%offset_m1_cmg22_1] {dist = "NORM"} : !pto.ptr -> !pto.vreg<64xf32> + pto.vsts %out_m1_cmg22_1, %ub_out_m1_cmg22_1[%offset_m1_cmg22_1], %mask_m1_cmg22_1 : !pto.vreg<64xf32>, !pto.ptr, !pto.mask + scf.yield %next_remaining_m1_cmg22_1 : i32 + } + } + + pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] + pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] + pto.mte_ub_gm %ub_out_m1_cmg22_1, %arg3, %c128_i64_m1_cmg22_1 + nburst(%c32_i64_m1_cmg22_1, %c128_i64_m1_cmg22_1, %c128_i64_m1_cmg22_1) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64 + pto.barrier #pto.pipe + } + + // inactive cross-case merge from micro-op/vector-load-store/vldsx2-vstsx2 + scf.if %__case_merge_guard { + + %__deep_merge_guard_cmg22_2 = arith.constant false + // inactive merged from vldx2_vstsx2_b8_f32_kernel + scf.if %__deep_merge_guard_cmg22_2 { + + %c0_m0_cmg22_2 = arith.constant 0 : index + %c1_m0_cmg22_2 = arith.constant 1 : index + %c8_m0_cmg22_2 = arith.constant 8 : index + %c128_m0_cmg22_2 = arith.constant 128 : index + %c1_i64_m0_cmg22_2 = arith.constant 1 : i64 + %c0_i64_m0_cmg22_2 = arith.constant 0 : i64 + %c32_i64_m0_cmg22_2 = arith.constant 32 : i64 + %c128_i64_m0_cmg22_2 = arith.constant 128 : i64 + %c4096_i64_m0_cmg22_2 = arith.constant 4096 : i64 + 
%c64_i32_m0_cmg22_2 = arith.constant 64 : i32 + %false_m0_cmg22_2 = arith.constant false + + %ub_in_m0_cmg22_2 = pto.castptr %c0_i64_m0_cmg22_2 : i64 -> !pto.ptr + %ub_out_m0_cmg22_2 = pto.castptr %c4096_i64_m0_cmg22_2 : i64 -> !pto.ptr + pto.mte_gm_ub %arg0, %ub_in_m0_cmg22_2, %c0_i64_m0_cmg22_2, %c128_i64_m0_cmg22_2 + nburst(%c32_i64_m0_cmg22_2, %c128_i64_m0_cmg22_2, %c128_i64_m0_cmg22_2) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 + + pto.set_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] + pto.wait_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] + + pto.vecscope { + scf.for %group_m0_cmg22_2 = %c0_m0_cmg22_2 to %c8_m0_cmg22_2 step %c1_m0_cmg22_2 { + %group_base_m0_cmg22_2 = arith.muli %group_m0_cmg22_2, %c128_m0_cmg22_2 : index + scf.for %chunk_m0_cmg22_2 = %c0_m0_cmg22_2 to %c128_m0_cmg22_2 step %c128_m0_cmg22_2 { + %offset_m0_cmg22_2 = arith.addi %group_base_m0_cmg22_2, %chunk_m0_cmg22_2 : index + %mask_m0_cmg22_2, %remaining_m0_cmg22_2 = pto.plt_b32 %c64_i32_m0_cmg22_2 : i32 -> !pto.mask, i32 + %low_m0_cmg22_2, %high_m0_cmg22_2 = pto.vldsx2 %ub_in_m0_cmg22_2[%offset_m0_cmg22_2], "DINTLV_B8" + : !pto.ptr, index -> !pto.vreg<64xf32>, !pto.vreg<64xf32> + pto.vstsx2 %low_m0_cmg22_2, %high_m0_cmg22_2, %ub_out_m0_cmg22_2[%offset_m0_cmg22_2], "INTLV_B8", %mask_m0_cmg22_2 + : !pto.vreg<64xf32>, !pto.vreg<64xf32>, !pto.ptr, index, + !pto.mask + } + } + } + + pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] + pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] + pto.mte_ub_gm %ub_out_m0_cmg22_2, %arg1, %c128_i64_m0_cmg22_2 + nburst(%c32_i64_m0_cmg22_2, %c128_i64_m0_cmg22_2, %c128_i64_m0_cmg22_2) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64 + pto.barrier #pto.pipe + + } + // active merged from vldx2_vstsx2_kernel + + %c0_m1_cmg22_2 = arith.constant 0 : index + %c1_m1_cmg22_2 = arith.constant 1 : index + %c8_m1_cmg22_2 = arith.constant 8 : index + %c128_m1_cmg22_2 = arith.constant 128 : index + %c1_i64_m1_cmg22_2 = arith.constant 1 : i64 + %c0_i64_m1_cmg22_2 = arith.constant 0 : i64 
+ %c32_i64_m1_cmg22_2 = arith.constant 32 : i64 + %c128_i64_m1_cmg22_2 = arith.constant 128 : i64 + %c4096_i64_m1_cmg22_2 = arith.constant 4096 : i64 + %c64_i32_m1_cmg22_2 = arith.constant 64 : i32 + %false_m1_cmg22_2 = arith.constant false + + %ub_in_m1_cmg22_2 = pto.castptr %c0_i64_m1_cmg22_2 : i64 -> !pto.ptr + %ub_out_m1_cmg22_2 = pto.castptr %c4096_i64_m1_cmg22_2 : i64 -> !pto.ptr + pto.mte_gm_ub %arg2, %ub_in_m1_cmg22_2, %c0_i64_m1_cmg22_2, %c128_i64_m1_cmg22_2 + nburst(%c32_i64_m1_cmg22_2, %c128_i64_m1_cmg22_2, %c128_i64_m1_cmg22_2) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 + + pto.set_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] + pto.wait_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] + + pto.vecscope { + scf.for %group_m1_cmg22_2 = %c0_m1_cmg22_2 to %c8_m1_cmg22_2 step %c1_m1_cmg22_2 { + %group_base_m1_cmg22_2 = arith.muli %group_m1_cmg22_2, %c128_m1_cmg22_2 : index + scf.for %chunk_m1_cmg22_2 = %c0_m1_cmg22_2 to %c128_m1_cmg22_2 step %c128_m1_cmg22_2 { + %offset_m1_cmg22_2 = arith.addi %group_base_m1_cmg22_2, %chunk_m1_cmg22_2 : index + %mask_m1_cmg22_2, %remaining_m1_cmg22_2 = pto.plt_b32 %c64_i32_m1_cmg22_2 : i32 -> !pto.mask, i32 + %low_m1_cmg22_2, %high_m1_cmg22_2 = pto.vldsx2 %ub_in_m1_cmg22_2[%offset_m1_cmg22_2], "DINTLV_B32" + : !pto.ptr, index -> !pto.vreg<64xf32>, !pto.vreg<64xf32> + pto.vstsx2 %low_m1_cmg22_2, %high_m1_cmg22_2, %ub_out_m1_cmg22_2[%offset_m1_cmg22_2], "INTLV_B32", %mask_m1_cmg22_2 + : !pto.vreg<64xf32>, !pto.vreg<64xf32>, !pto.ptr, index, + !pto.mask + } + } + } + + pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] + pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] + pto.mte_ub_gm %ub_out_m1_cmg22_2, %arg3, %c128_i64_m1_cmg22_2 + nburst(%c32_i64_m1_cmg22_2, %c128_i64_m1_cmg22_2, %c128_i64_m1_cmg22_2) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64 + pto.barrier #pto.pipe + } + + // inactive cross-case merge from micro-op/vector-load-store/vsts + scf.if %__case_merge_guard { + + %__deep_merge_guard_cmg22_3 = arith.constant false + // 
inactive merged from vsts_tail_kernel + scf.if %__deep_merge_guard_cmg22_3 { + + %c0_m0_cmg22_3 = arith.constant 0 : index + %c1_i64_m0_cmg22_3 = arith.constant 1 : i64 + %c0_i64_m0_cmg22_3 = arith.constant 0 : i64 + %c32_i64_m0_cmg22_3 = arith.constant 32 : i64 + %c128_i64_m0_cmg22_3 = arith.constant 128 : i64 + %c4096_i64_m0_cmg22_3 = arith.constant 4096 : i64 + %c13_i32_m0_cmg22_3 = arith.constant 13 : i32 + %false_m0_cmg22_3 = arith.constant false + + %ub_in_m0_cmg22_3 = pto.castptr %c0_i64_m0_cmg22_3 : i64 -> !pto.ptr + %ub_out_m0_cmg22_3 = pto.castptr %c4096_i64_m0_cmg22_3 : i64 -> !pto.ptr + pto.mte_gm_ub %arg0, %ub_in_m0_cmg22_3, %c0_i64_m0_cmg22_3, %c128_i64_m0_cmg22_3 + nburst(%c32_i64_m0_cmg22_3, %c128_i64_m0_cmg22_3, %c128_i64_m0_cmg22_3) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 + + pto.set_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] + pto.wait_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] + + %c1_m0_cmg22_3 = arith.constant 1 : index + pto.vecscope { + scf.for %iv_m0_cmg22_3 = %c0_m0_cmg22_3 to %c1_m0_cmg22_3 step %c1_m0_cmg22_3 { + %mask_m0_cmg22_3, %remaining_m0_cmg22_3 = pto.plt_b32 %c13_i32_m0_cmg22_3 : i32 -> !pto.mask, i32 + %vec_m0_cmg22_3 = pto.vlds %ub_in_m0_cmg22_3[%c0_m0_cmg22_3] : !pto.ptr -> !pto.vreg<64xf32> + pto.vsts %vec_m0_cmg22_3, %ub_out_m0_cmg22_3[%c0_m0_cmg22_3], %mask_m0_cmg22_3 + : !pto.vreg<64xf32>, !pto.ptr, !pto.mask + } + } + + pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] + pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] + pto.mte_ub_gm %ub_out_m0_cmg22_3, %arg1, %c128_i64_m0_cmg22_3 + nburst(%c32_i64_m0_cmg22_3, %c128_i64_m0_cmg22_3, %c128_i64_m0_cmg22_3) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64 + pto.barrier #pto.pipe + + } + // active merged from vsts_kernel_2d + + %c0_m1_cmg22_3 = arith.constant 0 : index + %c1_m1_cmg22_3 = arith.constant 1 : index + %c64_m1_cmg22_3 = arith.constant 64 : index + %c1024_m1_cmg22_3 = arith.constant 1024 : index + %c0_i64_m1_cmg22_3 = arith.constant 0 : i64 + %c1_i64_m1_cmg22_3 = 
arith.constant 1 : i64 + %c32_i64_m1_cmg22_3 = arith.constant 32 : i64 + %c128_i64_m1_cmg22_3 = arith.constant 128 : i64 + %c4096_i64_m1_cmg22_3 = arith.constant 4096 : i64 + %c1024_i32_m1_cmg22_3 = arith.constant 1024 : i32 + + %ub_in_m1_cmg22_3 = pto.castptr %c0_i64_m1_cmg22_3 : i64 -> !pto.ptr + %ub_out_m1_cmg22_3 = pto.castptr %c4096_i64_m1_cmg22_3 : i64 -> !pto.ptr + + %false_m1_cmg22_3 = arith.constant false + pto.mte_gm_ub %arg2, %ub_in_m1_cmg22_3, %c0_i64_m1_cmg22_3, %c128_i64_m1_cmg22_3 + nburst(%c32_i64_m1_cmg22_3, %c128_i64_m1_cmg22_3, %c128_i64_m1_cmg22_3) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 + + pto.set_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] + pto.wait_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] + + pto.vecscope { + %__m1_cmg22_3:1 = scf.for %offset_m1_cmg22_3 = %c0_m1_cmg22_3 to %c1024_m1_cmg22_3 step %c64_m1_cmg22_3 iter_args(%remaining_m1_cmg22_3 = %c1024_i32_m1_cmg22_3) -> (i32) { + %mask_m1_cmg22_3, %next_remaining_m1_cmg22_3 = pto.plt_b32 %remaining_m1_cmg22_3 : i32 -> !pto.mask, i32 + %vec_m1_cmg22_3 = pto.vlds %ub_in_m1_cmg22_3[%offset_m1_cmg22_3] : !pto.ptr -> !pto.vreg<64xf32> + pto.vsts %vec_m1_cmg22_3, %ub_out_m1_cmg22_3[%offset_m1_cmg22_3], %mask_m1_cmg22_3 {dist = "NORM_B32"} : !pto.vreg<64xf32>, !pto.ptr, !pto.mask + scf.yield %next_remaining_m1_cmg22_3 : i32 + } + } + + pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] + pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] + pto.mte_ub_gm %ub_out_m1_cmg22_3, %arg3, %c128_i64_m1_cmg22_3 + nburst(%c32_i64_m1_cmg22_3, %c128_i64_m1_cmg22_3, %c128_i64_m1_cmg22_3) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64 + pto.barrier #pto.pipe + } + + // inactive cross-case merge from micro-op/vector-load-store/vstur + scf.if %__case_merge_guard { + + %__deep_merge_guard_cmg22_4 = arith.constant false + // inactive merged from vstur_init_align_outside_loop_kernel_2d + scf.if %__deep_merge_guard_cmg22_4 { + + %c1_m0_cmg22_4 = arith.constant 1 : index + %c0_i64_m0_cmg22_4 = arith.constant 0 : i64 + 
%c1_i64_m0_cmg22_4 = arith.constant 1 : i64 + %c8_i32_m0_cmg22_4 = arith.constant 8 : i32 + %c32_i64_m0_cmg22_4 = arith.constant 32 : i64 + %c128_i64_m0_cmg22_4 = arith.constant 128 : i64 + %c4096_i64_m0_cmg22_4 = arith.constant 4096 : i64 + %c0_m0_cmg22_4 = arith.constant 0 : index + + %ub_in_m0_cmg22_4 = pto.castptr %c0_i64_m0_cmg22_4 : i64 -> !pto.ptr + %ub_out_m0_cmg22_4 = pto.castptr %c4096_i64_m0_cmg22_4 : i64 -> !pto.ptr + %ub_out1_m0_cmg22_4 = pto.addptr %ub_out_m0_cmg22_4, %c1_m0_cmg22_4 : !pto.ptr -> !pto.ptr + + %false_m0_cmg22_4 = arith.constant false + pto.mte_gm_ub %arg0, %ub_in_m0_cmg22_4, %c0_i64_m0_cmg22_4, %c128_i64_m0_cmg22_4 + nburst(%c32_i64_m0_cmg22_4, %c128_i64_m0_cmg22_4, %c128_i64_m0_cmg22_4) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 + + pto.set_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] + pto.wait_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] + + pto.vecscope { + pto.sprclr "AR" + %align0_m0_cmg22_4 = pto.init_align : !pto.align + %align_final_m0_cmg22_4 = scf.for %offset_m0_cmg22_4 = %c0_m0_cmg22_4 to %c1_m0_cmg22_4 step %c1_m0_cmg22_4 + iter_args(%align_iter_m0_cmg22_4 = %align0_m0_cmg22_4) -> (!pto.align) { + %vec_m0_cmg22_4 = pto.vlds %ub_in_m0_cmg22_4[%c0_m0_cmg22_4] : !pto.ptr -> !pto.vreg<64xf32> + %mask_m0_cmg22_4, %unused_m0_cmg22_4 = pto.plt_b32 %c8_i32_m0_cmg22_4 : i32 -> !pto.mask, i32 + %sqz_m0_cmg22_4 = pto.vsqz %vec_m0_cmg22_4, %mask_m0_cmg22_4 : !pto.vreg<64xf32>, !pto.mask -> !pto.vreg<64xf32> + %align1_m0_cmg22_4 = pto.vstur %align_iter_m0_cmg22_4, %sqz_m0_cmg22_4, %ub_out1_m0_cmg22_4, "POST_UPDATE" + : !pto.align, !pto.vreg<64xf32>, !pto.ptr -> !pto.align + scf.yield %align1_m0_cmg22_4 : !pto.align + } + pto.vstar %align_final_m0_cmg22_4, %ub_out1_m0_cmg22_4 : !pto.align, !pto.ptr + } + + pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] + pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] + pto.mte_ub_gm %ub_out_m0_cmg22_4, %arg1, %c128_i64_m0_cmg22_4 + nburst(%c32_i64_m0_cmg22_4, %c128_i64_m0_cmg22_4, 
%c128_i64_m0_cmg22_4) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64 + pto.barrier #pto.pipe + + } + // active merged from vstur_kernel_2d + + %c1_m1_cmg22_4 = arith.constant 1 : index + %c0_i64_m1_cmg22_4 = arith.constant 0 : i64 + %c1_i64_m1_cmg22_4 = arith.constant 1 : i64 + %c8_i32_m1_cmg22_4 = arith.constant 8 : i32 + %c32_i64_m1_cmg22_4 = arith.constant 32 : i64 + %c128_i64_m1_cmg22_4 = arith.constant 128 : i64 + %c4096_i64_m1_cmg22_4 = arith.constant 4096 : i64 + %c0_m1_cmg22_4 = arith.constant 0 : index + + %ub_in_m1_cmg22_4 = pto.castptr %c0_i64_m1_cmg22_4 : i64 -> !pto.ptr + %ub_out_m1_cmg22_4 = pto.castptr %c4096_i64_m1_cmg22_4 : i64 -> !pto.ptr + %ub_out1_m1_cmg22_4 = pto.addptr %ub_out_m1_cmg22_4, %c1_m1_cmg22_4 : !pto.ptr -> !pto.ptr + + %false_m1_cmg22_4 = arith.constant false + pto.mte_gm_ub %arg2, %ub_in_m1_cmg22_4, %c0_i64_m1_cmg22_4, %c128_i64_m1_cmg22_4 + nburst(%c32_i64_m1_cmg22_4, %c128_i64_m1_cmg22_4, %c128_i64_m1_cmg22_4) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 + + pto.set_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] + pto.wait_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] + + pto.vecscope { + pto.sprclr "AR" + scf.for %offset_m1_cmg22_4 = %c0_m1_cmg22_4 to %c1_m1_cmg22_4 step %c1_m1_cmg22_4 { + %vec_m1_cmg22_4 = pto.vlds %ub_in_m1_cmg22_4[%c0_m1_cmg22_4] : !pto.ptr -> !pto.vreg<64xf32> + %mask_m1_cmg22_4, %unused_m1_cmg22_4 = pto.plt_b32 %c8_i32_m1_cmg22_4 : i32 -> !pto.mask, i32 + %sqz_m1_cmg22_4 = pto.vsqz %vec_m1_cmg22_4, %mask_m1_cmg22_4 : !pto.vreg<64xf32>, !pto.mask -> !pto.vreg<64xf32> + %align0_m1_cmg22_4 = pto.init_align : !pto.align + %align1_m1_cmg22_4 = pto.vstur %align0_m1_cmg22_4, %sqz_m1_cmg22_4, %ub_out1_m1_cmg22_4, "POST_UPDATE" + : !pto.align, !pto.vreg<64xf32>, !pto.ptr -> !pto.align + pto.vstar %align1_m1_cmg22_4, %ub_out1_m1_cmg22_4 : !pto.align, !pto.ptr + } + } + + pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] + pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] + pto.mte_ub_gm %ub_out_m1_cmg22_4, %arg3, 
%c128_i64_m1_cmg22_4 + nburst(%c32_i64_m1_cmg22_4, %c128_i64_m1_cmg22_4, %c128_i64_m1_cmg22_4) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64 + pto.barrier #pto.pipe + } + return } } diff --git a/test/vpto/cases/micro-op/vector-load-store/vldas-vldus/launch.cpp b/test/vpto/cases/micro-op/vector-load-store/vldas-vldus/launch.cpp index b25715db0..b45c17f91 100644 --- a/test/vpto/cases/micro-op/vector-load-store/vldas-vldus/launch.cpp +++ b/test/vpto/cases/micro-op/vector-load-store/vldas-vldus/launch.cpp @@ -5,30 +5,9 @@ // THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, // INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. // See LICENSE in the root of the software repository for the full text of the License. - -// ----------------------------------------------------------------------------- -// case: micro-op/vector-load-store/vldas-vldus -// family: vector-load-store -// target_ops: pto.vldas, pto.vldus -// scenarios: core-f32, full-mask, unaligned, stream-state -// NOTE: bulk-generated coverage skeleton. Parser/verifier/lowering failure is -// still a valid test conclusion in the current coverage-first phase. -// ----------------------------------------------------------------------------- -// --------------------------------------------------------------------------- -// PTOAS compatibility layer -// -// The upstream pto-isa headers reference some FP8/FP4 types and the -// __VEC_SCOPE__ marker that are not available on every AICore arch/toolchain -// combination (e.g. __NPU_ARCH__==2201). -// -// For our PTOAS-generated kernels we don't rely on these types today, but the -// headers still mention them in templates/static_asserts. Provide minimal -// fallbacks to keep compilation working on dav-c220. 
-// --------------------------------------------------------------------------- #ifndef __VEC_SCOPE__ #define __VEC_SCOPE__ #endif - #if defined(__CCE_AICORE__) && defined(__NPU_ARCH__) && (__NPU_ARCH__ == 2201) typedef struct { unsigned char v; } hifloat8_t; typedef struct { unsigned char v; } float8_e4m3_t; @@ -38,33 +17,26 @@ typedef struct { unsigned char v; } float4_e1m2x2_t; typedef struct { unsigned char v; } float4_e2m1x2_t; #endif #include - -// AICore printf support is gated behind `--cce-enable-print` on some -// toolchains. When enabled, include the CCE print header so `cce::printf` -// resolves in device compilation. #if defined(__CCE_AICORE__) && defined(PTOAS_ENABLE_CCE_PRINT) #include #endif - -// Some PTO-ISA types are only available in the __CCE_AICORE__ compilation -// path, but `bisheng -xcce` still performs a host-side parse pass. -// Provide minimal fallbacks only when the corresponding header wasn't -// pulled in by the selected arch implementation. #if !defined(__CCE_AICORE__) && !defined(TMRGSORT_HPP) -struct MrgSortExecutedNumList { - uint16_t mrgSortList0; - uint16_t mrgSortList1; - uint16_t mrgSortList2; - uint16_t mrgSortList3; -}; +struct MrgSortExecutedNumList { uint16_t mrgSortList0,mrgSortList1,mrgSortList2,mrgSortList3; }; #endif #ifndef __CPU_SIM #include "acl/acl.h" #endif -extern "C" __global__ [aicore] void vabs_kernel_2d(__gm__ float *v1, - __gm__ float *v2); +extern "C" __global__ [aicore] void vldas_vldus_deep_merged_kernel( + __gm__ float * arg0, + __gm__ float * arg1, + __gm__ float * arg2, + __gm__ float * arg3); -void LaunchVabs_kernel_2d(float *v1, float *v2, void *stream) { - vabs_kernel_2d<<<1, nullptr, stream>>>((__gm__ float *)v1, (__gm__ float *)v2); +void LaunchVldasVldusDeepMerged(float * p0, float * p1, void *stream) { + vldas_vldus_deep_merged_kernel<<<1, nullptr, stream>>>( + (__gm__ float *)p0, + (__gm__ float *)p0, + (__gm__ float *)p0, + (__gm__ float *)p1); } diff --git 
a/test/vpto/cases/micro-op/vector-load-store/vldas-vldus/main.cpp b/test/vpto/cases/micro-op/vector-load-store/vldas-vldus/main.cpp index 7bf75309b..25a8cc2da 100644 --- a/test/vpto/cases/micro-op/vector-load-store/vldas-vldus/main.cpp +++ b/test/vpto/cases/micro-op/vector-load-store/vldas-vldus/main.cpp @@ -55,8 +55,8 @@ struct MrgSortExecutedNumList { } \ } while (0) -void LaunchVabs_kernel_2d(float *v1, float *v2, void *stream); +void LaunchVldasVldusDeepMerged(float * p0, float * p1, void *stream); int main() { size_t elemCount_v1 = 1024; size_t fileSize_v1 = elemCount_v1 * sizeof(float); @@ -91,7 +91,7 @@ int main() { ReadFile("./v2.bin", fileSize_v2, v2Host, fileSize_v2); ACL_CHECK(aclrtMemcpy(v1Device, fileSize_v1, v1Host, fileSize_v1, ACL_MEMCPY_HOST_TO_DEVICE)); ACL_CHECK(aclrtMemcpy(v2Device, fileSize_v2, v2Host, fileSize_v2, ACL_MEMCPY_HOST_TO_DEVICE)); - LaunchVabs_kernel_2d(v1Device, v2Device, stream); + LaunchVldasVldusDeepMerged(v1Device, v2Device, stream); ACL_CHECK(aclrtSynchronizeStream(stream)); ACL_CHECK(aclrtMemcpy(v2Host, fileSize_v2, v2Device, fileSize_v2, ACL_MEMCPY_DEVICE_TO_HOST)); diff --git a/test/vpto/cases/micro-op/vector-load-store/vlds-brc-b16-f32/compare.py b/test/vpto/cases/micro-op/vector-load-store/vlds-brc-b16-f32/compare.py deleted file mode 100644 index 4c19eb038..000000000 --- a/test/vpto/cases/micro-op/vector-load-store/vlds-brc-b16-f32/compare.py +++ /dev/null @@ -1,40 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) 2026 Huawei Technologies Co., Ltd. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. 
-# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. - - -import os -import sys - -import numpy as np - - -def compare_bin(golden_path, output_path, dtype, eps): - if not os.path.exists(golden_path) or not os.path.exists(output_path): - return False - golden = np.fromfile(golden_path, dtype=dtype) - output = np.fromfile(output_path, dtype=dtype) - return golden.shape == output.shape and np.allclose( - golden, output, atol=eps, rtol=eps, equal_nan=True - ) - - -def main(): - strict = os.getenv("COMPARE_STRICT", "1") != "0" - ok = compare_bin("golden_v2.bin", "v2.bin", np.float32, 0.0) - if not ok: - if strict: - print("[ERROR] compare failed") - sys.exit(2) - print("[WARN] compare failed (non-gating)") - return - print("[INFO] compare passed") - - -if __name__ == "__main__": - main() diff --git a/test/vpto/cases/micro-op/vector-load-store/vlds-brc-b16-f32/golden.py b/test/vpto/cases/micro-op/vector-load-store/vlds-brc-b16-f32/golden.py deleted file mode 100644 index 8cc8dbe42..000000000 --- a/test/vpto/cases/micro-op/vector-load-store/vlds-brc-b16-f32/golden.py +++ /dev/null @@ -1,57 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) 2026 Huawei Technologies Co., Ltd. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. 
-# See LICENSE in the root of the software repository for the full text of the License. - -# case: micro-op/vector-load-store/vlds-brc-b16-f32 -# family: vector-load-store -# target_ops: pto.vlds -# scenarios: core-f32, full-mask, aligned, dist-brc-b16, width-agnostic-dist - -import argparse -from pathlib import Path - -import numpy as np - - -ELEMENTS = 1024 -LANES = 64 -SEED = 19 - - -def generate(output_dir: Path, seed: int) -> None: - rng = np.random.default_rng(seed) - v1 = rng.uniform(-8.0, 8.0, size=(ELEMENTS,)).astype(np.float32) - v2 = np.zeros((ELEMENTS,), dtype=np.float32) - - src_bytes = v1.view(np.uint8) - golden_bytes = np.zeros_like(src_bytes) - chunk_bytes = LANES * 4 - for offset in range(0, src_bytes.size, chunk_bytes): - pattern = src_bytes[offset : offset + 2] - tiled = np.tile(pattern, chunk_bytes // 2) - golden_bytes[offset : offset + chunk_bytes] = tiled - golden_v2 = golden_bytes.view(np.float32) - - output_dir.mkdir(parents=True, exist_ok=True) - v1.tofile(output_dir / "v1.bin") - v2.tofile(output_dir / "v2.bin") - golden_v2.tofile(output_dir / "golden_v2.bin") - - -def main() -> None: - parser = argparse.ArgumentParser( - description="Generate numpy-based inputs/golden for VPTO micro-op vlds BRC_B16 on f32 validation." 
- ) - parser.add_argument("--output-dir", type=Path, default=Path(".")) - parser.add_argument("--seed", type=int, default=SEED) - args = parser.parse_args() - generate(args.output_dir, args.seed) - - -if __name__ == "__main__": - main() diff --git a/test/vpto/cases/micro-op/vector-load-store/vlds-brc-b16-f32/kernel.pto b/test/vpto/cases/micro-op/vector-load-store/vlds-brc-b16-f32/kernel.pto deleted file mode 100644 index e94fe5b01..000000000 --- a/test/vpto/cases/micro-op/vector-load-store/vlds-brc-b16-f32/kernel.pto +++ /dev/null @@ -1,47 +0,0 @@ -// ----------------------------------------------------------------------------- -// case: micro-op/vector-load-store/vlds-brc-b16-f32 -// family: vector-load-store -// target_ops: pto.vlds -// scenarios: core-f32, full-mask, aligned, dist-brc-b16, width-agnostic-dist -// ----------------------------------------------------------------------------- -module attributes {pto.target_arch = "a5", pto.kernel_kind = #pto.kernel_kind} { - func.func @vlds_brc_b16_f32_kernel(%arg0: !pto.ptr, %arg1: !pto.ptr) attributes {pto.kernel} { - %c0 = arith.constant 0 : index - %c64 = arith.constant 64 : index - %c1024 = arith.constant 1024 : index - %c0_i64 = arith.constant 0 : i64 - %c1_i64 = arith.constant 1 : i64 - %c32_i64 = arith.constant 32 : i64 - %c128_i64 = arith.constant 128 : i64 - %c4096_i64 = arith.constant 4096 : i64 - %c1024_i32 = arith.constant 1024 : i32 - - %ub_in = pto.castptr %c0_i64 : i64 -> !pto.ptr - %ub_out = pto.castptr %c4096_i64 : i64 -> !pto.ptr - - %false = arith.constant false - pto.mte_gm_ub %arg0, %ub_in, %c0_i64, %c128_i64 - nburst(%c32_i64, %c128_i64, %c128_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 - - pto.set_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] - pto.wait_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] - - pto.vecscope { - %_:1 = scf.for %offset = %c0 to %c1024 step %c64 iter_args(%remaining = %c1024_i32) -> (i32) { - %mask, %next_remaining = pto.plt_b32 %remaining : i32 -> !pto.mask, i32 - %out 
= pto.vlds %ub_in[%offset] {dist = "BRC_B16"} : !pto.ptr -> !pto.vreg<64xf32> - pto.vsts %out, %ub_out[%offset], %mask : !pto.vreg<64xf32>, !pto.ptr, !pto.mask - scf.yield %next_remaining : i32 - } - } - - pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.mte_ub_gm %ub_out, %arg1, %c128_i64 - nburst(%c32_i64, %c128_i64, %c128_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64 - pto.barrier #pto.pipe - return - } -} diff --git a/test/vpto/cases/micro-op/vector-load-store/vlds-brc-b16-f32/launch.cpp b/test/vpto/cases/micro-op/vector-load-store/vlds-brc-b16-f32/launch.cpp deleted file mode 100644 index 530496dba..000000000 --- a/test/vpto/cases/micro-op/vector-load-store/vlds-brc-b16-f32/launch.cpp +++ /dev/null @@ -1,45 +0,0 @@ -// Copyright (c) 2026 Huawei Technologies Co., Ltd. -// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). -// Please refer to the License for details. You may not use this file except in compliance with the License. -// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -// See LICENSE in the root of the software repository for the full text of the License. 
- -#ifndef __VEC_SCOPE__ -#define __VEC_SCOPE__ -#endif - -#if defined(__CCE_AICORE__) && defined(__NPU_ARCH__) && (__NPU_ARCH__ == 2201) -typedef struct { unsigned char v; } hifloat8_t; -typedef struct { unsigned char v; } float8_e4m3_t; -typedef struct { unsigned char v; } float8_e5m2_t; -typedef struct { unsigned char v; } float8_e8m0_t; -typedef struct { unsigned char v; } float4_e1m2x2_t; -typedef struct { unsigned char v; } float4_e2m1x2_t; -#endif -#include - -#if defined(__CCE_AICORE__) && defined(PTOAS_ENABLE_CCE_PRINT) -#include -#endif - -#if !defined(__CCE_AICORE__) && !defined(TMRGSORT_HPP) -struct MrgSortExecutedNumList { - uint16_t mrgSortList0; - uint16_t mrgSortList1; - uint16_t mrgSortList2; - uint16_t mrgSortList3; -}; -#endif -#ifndef __CPU_SIM -#include "acl/acl.h" -#endif - -extern "C" __global__ [aicore] void vlds_brc_b16_f32_kernel(__gm__ float *v1, - __gm__ float *v2); - -void LaunchVlds_brc_b16_f32_kernel(float *v1, float *v2, void *stream) { - vlds_brc_b16_f32_kernel<<<1, nullptr, stream>>>((__gm__ float *)v1, (__gm__ float *)v2); -} - diff --git a/test/vpto/cases/micro-op/vector-load-store/vlds-brc-b16-f32/main.cpp b/test/vpto/cases/micro-op/vector-load-store/vlds-brc-b16-f32/main.cpp deleted file mode 100644 index 661e47152..000000000 --- a/test/vpto/cases/micro-op/vector-load-store/vlds-brc-b16-f32/main.cpp +++ /dev/null @@ -1,122 +0,0 @@ -// Copyright (c) 2026 Huawei Technologies Co., Ltd. -// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). -// Please refer to the License for details. You may not use this file except in compliance with the License. -// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. 
-// See LICENSE in the root of the software repository for the full text of the License. - -/** -Copyright (c) 2025 Huawei Technologies Co., Ltd. -This program is free software, you can redistribute it and/or modify it under the terms and conditions of -CANN Open Software License Agreement Version 2.0 (the "License"). -Please refer to the License for details. You may not use this file except in compliance with the License. -THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -See LICENSE in the root of the software repository for the full text of the License. -*/ - -#include "test_common.h" -#include "acl/acl.h" -#include -#include -#include - -using namespace PtoTestCommon; - -#ifndef TMRGSORT_HPP -struct MrgSortExecutedNumList { - uint16_t mrgSortList0; - uint16_t mrgSortList1; - uint16_t mrgSortList2; - uint16_t mrgSortList3; -}; -#endif - -#define ACL_CHECK(expr) \ - do { \ - const aclError _ret = (expr); \ - if (_ret != ACL_SUCCESS) { \ - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", #expr, (int)_ret, __FILE__, __LINE__); \ - const char *_recent = aclGetRecentErrMsg(); \ - if (_recent != nullptr && _recent[0] != '\0') { \ - std::fprintf(stderr, "[ERROR] RecentErrMsg: %s\n", _recent); \ - } \ - rc = 1; \ - goto cleanup; \ - } \ - } while (0) - -void LaunchVlds_brc_b16_f32_kernel(float *v1, float *v2, void *stream); - -int main() { - size_t elemCount_v1 = 1024; - size_t fileSize_v1 = elemCount_v1 * sizeof(float); - size_t elemCount_v2 = 1024; - size_t fileSize_v2 = elemCount_v2 * sizeof(float); - float *v1Host = nullptr; - float *v1Device = nullptr; - float *v2Host = nullptr; - float *v2Device = nullptr; - - int rc = 0; - bool aclInited = false; - bool deviceSet = false; - int deviceId = 0; - aclrtStream stream = nullptr; - - ACL_CHECK(aclInit(nullptr)); - aclInited = true; - if (const char 
*envDevice = std::getenv("ACL_DEVICE_ID")) { - deviceId = std::atoi(envDevice); - } - ACL_CHECK(aclrtSetDevice(deviceId)); - deviceSet = true; - ACL_CHECK(aclrtCreateStream(&stream)); - - ACL_CHECK(aclrtMallocHost((void **)(&v1Host), fileSize_v1)); - ACL_CHECK(aclrtMallocHost((void **)(&v2Host), fileSize_v2)); - ACL_CHECK(aclrtMalloc((void **)&v1Device, fileSize_v1, ACL_MEM_MALLOC_HUGE_FIRST)); - ACL_CHECK(aclrtMalloc((void **)&v2Device, fileSize_v2, ACL_MEM_MALLOC_HUGE_FIRST)); - - ReadFile("./v1.bin", fileSize_v1, v1Host, fileSize_v1); - ReadFile("./v2.bin", fileSize_v2, v2Host, fileSize_v2); - ACL_CHECK(aclrtMemcpy(v1Device, fileSize_v1, v1Host, fileSize_v1, ACL_MEMCPY_HOST_TO_DEVICE)); - ACL_CHECK(aclrtMemcpy(v2Device, fileSize_v2, v2Host, fileSize_v2, ACL_MEMCPY_HOST_TO_DEVICE)); - LaunchVlds_brc_b16_f32_kernel(v1Device, v2Device, stream); - - ACL_CHECK(aclrtSynchronizeStream(stream)); - ACL_CHECK(aclrtMemcpy(v2Host, fileSize_v2, v2Device, fileSize_v2, ACL_MEMCPY_DEVICE_TO_HOST)); - - WriteFile("./v2.bin", v2Host, fileSize_v2); - -cleanup: - aclrtFree(v1Device); - aclrtFree(v2Device); - aclrtFreeHost(v1Host); - aclrtFreeHost(v2Host); - if (stream != nullptr) { - const aclError _ret = aclrtDestroyStream(stream); - if (_ret != ACL_SUCCESS) { - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", - "aclrtDestroyStream(stream)", (int)_ret, __FILE__, __LINE__); - } - stream = nullptr; - } - if (deviceSet) { - const aclError _ret = aclrtResetDevice(deviceId); - if (_ret != ACL_SUCCESS) { - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", - "aclrtResetDevice(deviceId)", (int)_ret, __FILE__, __LINE__); - } - } - if (aclInited) { - const aclError _ret = aclFinalize(); - if (_ret != ACL_SUCCESS) { - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", - "aclFinalize()", (int)_ret, __FILE__, __LINE__); - } - } - - return rc; -} diff --git a/test/vpto/cases/micro-op/vector-load-store/vlds-brc-b16/kernel.pto 
b/test/vpto/cases/micro-op/vector-load-store/vlds-brc-b16/kernel.pto index 95fe2fb62..85ff3e238 100644 --- a/test/vpto/cases/micro-op/vector-load-store/vlds-brc-b16/kernel.pto +++ b/test/vpto/cases/micro-op/vector-load-store/vlds-brc-b16/kernel.pto @@ -1,54 +1,87 @@ -// ----------------------------------------------------------------------------- -// case: micro-op/vector-load-store/vlds-brc-b16 -// family: vector-load-store -// target_ops: pto.vlds -// scenarios: core-f16, full-mask, aligned, dist-brc-b16 -// ----------------------------------------------------------------------------- -// Validate one representative `BRC_B16` load on `b16`. -// The case keeps the structure minimal: -// 1. DMA one input tile into UB -// 2. issue `pto.vlds` with `dist = "BRC_B16"` inside `pto.vecscope` -// 3. store the resulting vector back through `pto.vsts` - module attributes {pto.target_arch = "a5", pto.kernel_kind = #pto.kernel_kind} { - func.func @vabs_kernel_2d(%arg0: !pto.ptr, %arg1: !pto.ptr) attributes {pto.kernel} { - %c0 = arith.constant 0 : index - %c1 = arith.constant 1 : index - %c128 = arith.constant 128 : index - %c1024 = arith.constant 1024 : index - %c0_i64 = arith.constant 0 : i64 - %c1_i64 = arith.constant 1 : i64 - %c32_i64 = arith.constant 32 : i64 - %c128_i64 = arith.constant 128 : i64 - %c4096_i64 = arith.constant 4096 : i64 - %c1024_i32 = arith.constant 1024 : i32 - - %ub_in = pto.castptr %c0_i64 : i64 -> !pto.ptr - %ub_out = pto.castptr %c4096_i64 : i64 -> !pto.ptr - - %false = arith.constant false - pto.mte_gm_ub %arg0, %ub_in, %c0_i64, %c128_i64 - nburst(%c32_i64, %c128_i64, %c128_i64) + func.func @vlds_brc_b16_deep_merged_kernel(%arg0: !pto.ptr, %arg1: !pto.ptr, %arg2: !pto.ptr, %arg3: !pto.ptr) attributes {pto.kernel} { + %__deep_merge_guard = arith.constant false + // inactive merged from vlds_brc_b16_f32_kernel + scf.if %__deep_merge_guard { + + %c0_m0 = arith.constant 0 : index + %c64_m0 = arith.constant 64 : index + %c1024_m0 = arith.constant 1024 
: index + %c0_i64_m0 = arith.constant 0 : i64 + %c1_i64_m0 = arith.constant 1 : i64 + %c32_i64_m0 = arith.constant 32 : i64 + %c128_i64_m0 = arith.constant 128 : i64 + %c4096_i64_m0 = arith.constant 4096 : i64 + %c1024_i32_m0 = arith.constant 1024 : i32 + + %ub_in_m0 = pto.castptr %c0_i64_m0 : i64 -> !pto.ptr + %ub_out_m0 = pto.castptr %c4096_i64_m0 : i64 -> !pto.ptr + + %false_m0 = arith.constant false + pto.mte_gm_ub %arg0, %ub_in_m0, %c0_i64_m0, %c128_i64_m0 + nburst(%c32_i64_m0, %c128_i64_m0, %c128_i64_m0) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 + + pto.set_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] + pto.wait_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] + + pto.vecscope { + %__m0:1 = scf.for %offset_m0 = %c0_m0 to %c1024_m0 step %c64_m0 iter_args(%remaining_m0 = %c1024_i32_m0) -> (i32) { + %mask_m0, %next_remaining_m0 = pto.plt_b32 %remaining_m0 : i32 -> !pto.mask, i32 + %out_m0 = pto.vlds %ub_in_m0[%offset_m0] {dist = "BRC_B16"} : !pto.ptr -> !pto.vreg<64xf32> + pto.vsts %out_m0, %ub_out_m0[%offset_m0], %mask_m0 : !pto.vreg<64xf32>, !pto.ptr, !pto.mask + scf.yield %next_remaining_m0 : i32 + } + } + + pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] + pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] + pto.mte_ub_gm %ub_out_m0, %arg1, %c128_i64_m0 + nburst(%c32_i64_m0, %c128_i64_m0, %c128_i64_m0) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64 + pto.barrier #pto.pipe + + } + // active merged from vabs_kernel_2d + + %c0_m1 = arith.constant 0 : index + %c1_m1 = arith.constant 1 : index + %c128_m1 = arith.constant 128 : index + %c1024_m1 = arith.constant 1024 : index + %c0_i64_m1 = arith.constant 0 : i64 + %c1_i64_m1 = arith.constant 1 : i64 + %c32_i64_m1 = arith.constant 32 : i64 + %c128_i64_m1 = arith.constant 128 : i64 + %c4096_i64_m1 = arith.constant 4096 : i64 + %c1024_i32_m1 = arith.constant 1024 : i32 + + %ub_in_m1 = pto.castptr %c0_i64_m1 : i64 -> !pto.ptr + %ub_out_m1 = pto.castptr %c4096_i64_m1 : i64 -> !pto.ptr + + %false_m1 = arith.constant false + 
pto.mte_gm_ub %arg2, %ub_in_m1, %c0_i64_m1, %c128_i64_m1 + nburst(%c32_i64_m1, %c128_i64_m1, %c128_i64_m1) : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 pto.set_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] pto.wait_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] pto.vecscope { - %_:1 = scf.for %offset = %c0 to %c1024 step %c128 iter_args(%remaining = %c1024_i32) -> (i32) { - %mask, %next_remaining = pto.plt_b16 %remaining : i32 -> !pto.mask, i32 - %out = pto.vlds %ub_in[%offset] {dist = "BRC_B16"} : !pto.ptr -> !pto.vreg<128xf16> - pto.vsts %out, %ub_out[%offset], %mask : !pto.vreg<128xf16>, !pto.ptr, !pto.mask - scf.yield %next_remaining : i32 + %__m1:1 = scf.for %offset_m1 = %c0_m1 to %c1024_m1 step %c128_m1 iter_args(%remaining_m1 = %c1024_i32_m1) -> (i32) { + %mask_m1, %next_remaining_m1 = pto.plt_b16 %remaining_m1 : i32 -> !pto.mask, i32 + %out_m1 = pto.vlds %ub_in_m1[%offset_m1] {dist = "BRC_B16"} : !pto.ptr -> !pto.vreg<128xf16> + pto.vsts %out_m1, %ub_out_m1[%offset_m1], %mask_m1 : !pto.vreg<128xf16>, !pto.ptr, !pto.mask + scf.yield %next_remaining_m1 : i32 } } pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.mte_ub_gm %ub_out, %arg1, %c128_i64 - nburst(%c32_i64, %c128_i64, %c128_i64) + pto.mte_ub_gm %ub_out_m1, %arg3, %c128_i64_m1 + nburst(%c32_i64_m1, %c128_i64_m1, %c128_i64_m1) : !pto.ptr, !pto.ptr, i64, i64, i64, i64 pto.barrier #pto.pipe + return } } diff --git a/test/vpto/cases/micro-op/vector-load-store/vlds-brc-b16/launch.cpp b/test/vpto/cases/micro-op/vector-load-store/vlds-brc-b16/launch.cpp index 1d8ac9f5d..9f0329269 100644 --- a/test/vpto/cases/micro-op/vector-load-store/vlds-brc-b16/launch.cpp +++ b/test/vpto/cases/micro-op/vector-load-store/vlds-brc-b16/launch.cpp @@ -5,30 +5,9 @@ // THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, // INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. 
// See LICENSE in the root of the software repository for the full text of the License. - -// ----------------------------------------------------------------------------- -// case: micro-op/vector-load-store/vlds-brc-b16 -// family: vector-load-store -// target_ops: pto.vlds -// scenarios: core-f16, full-mask, aligned, dist-brc-b16 -// NOTE: bulk-generated coverage skeleton. Parser/verifier/lowering failure is -// still a valid test conclusion in the current coverage-first phase. -// ----------------------------------------------------------------------------- -// --------------------------------------------------------------------------- -// PTOAS compatibility layer -// -// The upstream pto-isa headers reference some FP8/FP4 types and the -// __VEC_SCOPE__ marker that are not available on every AICore arch/toolchain -// combination (e.g. __NPU_ARCH__==2201). -// -// For our PTOAS-generated kernels we don't rely on these types today, but the -// headers still mention them in templates/static_asserts. Provide minimal -// fallbacks to keep compilation working on dav-c220. -// --------------------------------------------------------------------------- #ifndef __VEC_SCOPE__ #define __VEC_SCOPE__ #endif - #if defined(__CCE_AICORE__) && defined(__NPU_ARCH__) && (__NPU_ARCH__ == 2201) typedef struct { unsigned char v; } hifloat8_t; typedef struct { unsigned char v; } float8_e4m3_t; @@ -38,33 +17,26 @@ typedef struct { unsigned char v; } float4_e1m2x2_t; typedef struct { unsigned char v; } float4_e2m1x2_t; #endif #include - -// AICore printf support is gated behind `--cce-enable-print` on some -// toolchains. When enabled, include the CCE print header so `cce::printf` -// resolves in device compilation. #if defined(__CCE_AICORE__) && defined(PTOAS_ENABLE_CCE_PRINT) #include #endif - -// Some PTO-ISA types are only available in the __CCE_AICORE__ compilation -// path, but `bisheng -xcce` still performs a host-side parse pass. 
-// Provide minimal fallbacks only when the corresponding header wasn't -// pulled in by the selected arch implementation. #if !defined(__CCE_AICORE__) && !defined(TMRGSORT_HPP) -struct MrgSortExecutedNumList { - uint16_t mrgSortList0; - uint16_t mrgSortList1; - uint16_t mrgSortList2; - uint16_t mrgSortList3; -}; +struct MrgSortExecutedNumList { uint16_t mrgSortList0,mrgSortList1,mrgSortList2,mrgSortList3; }; #endif #ifndef __CPU_SIM #include "acl/acl.h" #endif -extern "C" __global__ [aicore] void vabs_kernel_2d(__gm__ float *v1, - __gm__ float *v2); +extern "C" __global__ [aicore] void vlds_brc_b16_deep_merged_kernel( + __gm__ float * arg0, + __gm__ float * arg1, + __gm__ half * arg2, + __gm__ half * arg3); -void LaunchVabs_kernel_2d(float *v1, float *v2, void *stream) { - vabs_kernel_2d<<<1, nullptr, stream>>>((__gm__ float *)v1, (__gm__ float *)v2); +void LaunchVldsBrcB16DeepMerged(float * p0, float * p1, void *stream) { + vlds_brc_b16_deep_merged_kernel<<<1, nullptr, stream>>>( + (__gm__ float *)p0, + (__gm__ float *)p0, + (__gm__ half *)p0, + (__gm__ half *)p1); } diff --git a/test/vpto/cases/micro-op/vector-load-store/vlds-brc-b16/main.cpp b/test/vpto/cases/micro-op/vector-load-store/vlds-brc-b16/main.cpp index cbec16893..f840bf677 100644 --- a/test/vpto/cases/micro-op/vector-load-store/vlds-brc-b16/main.cpp +++ b/test/vpto/cases/micro-op/vector-load-store/vlds-brc-b16/main.cpp @@ -55,8 +55,8 @@ struct MrgSortExecutedNumList { } \ } while (0) -void LaunchVabs_kernel_2d(float *v1, float *v2, void *stream); +void LaunchVldsBrcB16DeepMerged(float * p0, float * p1, void *stream); int main() { size_t elemCount_v1 = 1024; size_t fileSize_v1 = elemCount_v1 * sizeof(float); @@ -91,7 +91,7 @@ int main() { ReadFile("./v2.bin", fileSize_v2, v2Host, fileSize_v2); ACL_CHECK(aclrtMemcpy(v1Device, fileSize_v1, v1Host, fileSize_v1, ACL_MEMCPY_HOST_TO_DEVICE)); ACL_CHECK(aclrtMemcpy(v2Device, fileSize_v2, v2Host, fileSize_v2, ACL_MEMCPY_HOST_TO_DEVICE)); - 
LaunchVabs_kernel_2d(v1Device, v2Device, stream); + LaunchVldsBrcB16DeepMerged(v1Device, v2Device, stream); ACL_CHECK(aclrtSynchronizeStream(stream)); ACL_CHECK(aclrtMemcpy(v2Host, fileSize_v2, v2Device, fileSize_v2, ACL_MEMCPY_DEVICE_TO_HOST)); diff --git a/test/vpto/cases/micro-op/vector-load-store/vlds-brc-b32/kernel.pto b/test/vpto/cases/micro-op/vector-load-store/vlds-brc-b32/kernel.pto index fac81676f..c9f869761 100644 --- a/test/vpto/cases/micro-op/vector-load-store/vlds-brc-b32/kernel.pto +++ b/test/vpto/cases/micro-op/vector-load-store/vlds-brc-b32/kernel.pto @@ -64,6 +64,420 @@ module attributes {pto.target_arch = "a5", pto.kernel_kind = #pto.kernel_kind, !pto.ptr, i64, i64, i64, i64 pto.barrier #pto.pipe + %__case_merge_guard = arith.constant false + + // inactive cross-case merge from micro-op/vector-load-store/vlds-brc-b8-f32 + scf.if %__case_merge_guard { + + %c0_cmg23_1 = arith.constant 0 : index + %c64_cmg23_1 = arith.constant 64 : index + %c1024_cmg23_1 = arith.constant 1024 : index + %c0_i64_cmg23_1 = arith.constant 0 : i64 + %c1_i64_cmg23_1 = arith.constant 1 : i64 + %c32_i64_cmg23_1 = arith.constant 32 : i64 + %c128_i64_cmg23_1 = arith.constant 128 : i64 + %c4096_i64_cmg23_1 = arith.constant 4096 : i64 + %c1024_i32_cmg23_1 = arith.constant 1024 : i32 + + %ub_in_cmg23_1 = pto.castptr %c0_i64_cmg23_1 : i64 -> !pto.ptr + %ub_out_cmg23_1 = pto.castptr %c4096_i64_cmg23_1 : i64 -> !pto.ptr + + %false_cmg23_1 = arith.constant false + pto.mte_gm_ub %arg0, %ub_in_cmg23_1, %c0_i64_cmg23_1, %c128_i64_cmg23_1 + nburst(%c32_i64_cmg23_1, %c128_i64_cmg23_1, %c128_i64_cmg23_1) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 + + pto.set_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] + pto.wait_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] + + pto.vecscope { + %__cmg23_1:1 = scf.for %offset_cmg23_1 = %c0_cmg23_1 to %c1024_cmg23_1 step %c64_cmg23_1 iter_args(%remaining_cmg23_1 = %c1024_i32_cmg23_1) -> (i32) { + %mask_cmg23_1, %next_remaining_cmg23_1 = pto.plt_b32 
%remaining_cmg23_1 : i32 -> !pto.mask, i32 + %out_cmg23_1 = pto.vlds %ub_in_cmg23_1[%offset_cmg23_1] {dist = "BRC_B8"} : !pto.ptr -> !pto.vreg<64xf32> + pto.vsts %out_cmg23_1, %ub_out_cmg23_1[%offset_cmg23_1], %mask_cmg23_1 : !pto.vreg<64xf32>, !pto.ptr, !pto.mask + scf.yield %next_remaining_cmg23_1 : i32 + } + } + + pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] + pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] + pto.mte_ub_gm %ub_out_cmg23_1, %arg1, %c128_i64_cmg23_1 + nburst(%c32_i64_cmg23_1, %c128_i64_cmg23_1, %c128_i64_cmg23_1) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64 + pto.barrier #pto.pipe + } + + // inactive cross-case merge from micro-op/vector-load-store/vlds-dma-loop + scf.if %__case_merge_guard { + + %c0_cmg23_2 = arith.constant 0 : index + %c1_cmg23_2 = arith.constant 1 : index + %c64_cmg23_2 = arith.constant 64 : index + %c512_cmg23_2 = arith.constant 512 : index + %c0_i64_cmg23_2 = arith.constant 0 : i64 + %c1_i64_cmg23_2 = arith.constant 1 : i64 + %c2_i64_cmg23_2 = arith.constant 2 : i64 + %c8_i64_cmg23_2 = arith.constant 8 : i64 + %c224_i64_cmg23_2 = arith.constant 224 : i64 + %c256_i64_cmg23_2 = arith.constant 256 : i64 + %c448_i64_cmg23_2 = arith.constant 448 : i64 + %c512_i64_cmg23_2 = arith.constant 512 : i64 + %c896_i64_cmg23_2 = arith.constant 896 : i64 + %c1024_i64_cmg23_2 = arith.constant 1024 : i64 + %c2048_i64_cmg23_2 = arith.constant 2048 : i64 + %c512_i32_cmg23_2 = arith.constant 512 : i32 + %pad_cmg23_2 = arith.constant 1.000000e+00 : f32 + + %ub_in_cmg23_2 = pto.castptr %c0_i64_cmg23_2 : i64 -> !pto.ptr + %ub_out_cmg23_2 = pto.castptr %c2048_i64_cmg23_2 : i64 -> !pto.ptr + + pto.mte_gm_ub %arg0, %ub_in_cmg23_2, %c0_i64_cmg23_2, %c224_i64_cmg23_2 + nburst(%c1_i64_cmg23_2, %c224_i64_cmg23_2, %c256_i64_cmg23_2) + loop(%c2_i64_cmg23_2, %c224_i64_cmg23_2, %c256_i64_cmg23_2) + loop(%c2_i64_cmg23_2, %c448_i64_cmg23_2, %c512_i64_cmg23_2) + loop(%c2_i64_cmg23_2, %c896_i64_cmg23_2, %c1024_i64_cmg23_2) + pad(%pad_cmg23_2, %c0_i64_cmg23_2, 
%c8_i64_cmg23_2) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64, + loop i64, i64, i64, loop i64, i64, i64, loop i64, i64, i64, pad f32, i64, i64 + + pto.set_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] + pto.wait_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] + + pto.vecscope { + %__cmg23_2:1 = scf.for %offset_cmg23_2 = %c0_cmg23_2 to %c512_cmg23_2 step %c64_cmg23_2 iter_args(%remaining_cmg23_2 = %c512_i32_cmg23_2) -> (i32) { + %mask_cmg23_2, %next_remaining_cmg23_2 = pto.plt_b32 %remaining_cmg23_2 : i32 -> !pto.mask, i32 + %value_cmg23_2 = pto.vlds %ub_in_cmg23_2[%offset_cmg23_2] {dist = "NORM"} : !pto.ptr -> !pto.vreg<64xf32> + pto.vsts %value_cmg23_2, %ub_out_cmg23_2[%offset_cmg23_2], %mask_cmg23_2 : !pto.vreg<64xf32>, !pto.ptr, !pto.mask + scf.yield %next_remaining_cmg23_2 : i32 + } + } + + pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] + pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] + + pto.mte_ub_gm %ub_out_cmg23_2, %arg1, %c256_i64_cmg23_2 + nburst(%c1_i64_cmg23_2, %c256_i64_cmg23_2, %c256_i64_cmg23_2) + loop(%c2_i64_cmg23_2, %c256_i64_cmg23_2, %c256_i64_cmg23_2) + loop(%c2_i64_cmg23_2, %c512_i64_cmg23_2, %c512_i64_cmg23_2) + loop(%c2_i64_cmg23_2, %c1024_i64_cmg23_2, %c1024_i64_cmg23_2) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64, + loop i64, i64, i64, loop i64, i64, i64, loop i64, i64, i64 + + pto.barrier #pto.pipe + } + + // inactive cross-case merge from micro-op/vector-load-store/vldsx2-layout-check + scf.if %__case_merge_guard { + + %c0_cmg23_3 = arith.constant 0 : index + %c1_cmg23_3 = arith.constant 1 : index + %c64_cmg23_3 = arith.constant 64 : index + %c8_cmg23_3 = arith.constant 8 : index + %c128_cmg23_3 = arith.constant 128 : index + %c1_i64_cmg23_3 = arith.constant 1 : i64 + %c0_i64_cmg23_3 = arith.constant 0 : i64 + %c32_i64_cmg23_3 = arith.constant 32 : i64 + %c128_i64_cmg23_3 = arith.constant 128 : i64 + %c4096_i64_cmg23_3 = arith.constant 4096 : i64 + %c64_i32_cmg23_3 = arith.constant 64 : i32 + %false_cmg23_3 = arith.constant false + + 
%ub_in_cmg23_3 = pto.castptr %c0_i64_cmg23_3 : i64 -> !pto.ptr + %ub_out_cmg23_3 = pto.castptr %c4096_i64_cmg23_3 : i64 -> !pto.ptr + pto.mte_gm_ub %arg0, %ub_in_cmg23_3, %c0_i64_cmg23_3, %c128_i64_cmg23_3 + nburst(%c32_i64_cmg23_3, %c128_i64_cmg23_3, %c128_i64_cmg23_3) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 + + pto.set_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] + pto.wait_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] + + pto.vecscope { + scf.for %group_cmg23_3 = %c0_cmg23_3 to %c8_cmg23_3 step %c1_cmg23_3 { + %group_base_cmg23_3 = arith.muli %group_cmg23_3, %c128_cmg23_3 : index + scf.for %chunk_cmg23_3 = %c0_cmg23_3 to %c128_cmg23_3 step %c128_cmg23_3 { + %offset_cmg23_3 = arith.addi %group_base_cmg23_3, %chunk_cmg23_3 : index + %high_offset_cmg23_3 = arith.addi %offset_cmg23_3, %c64_cmg23_3 : index + %mask_cmg23_3, %remaining_cmg23_3 = pto.plt_b32 %c64_i32_cmg23_3 : i32 -> !pto.mask, i32 + %x_cmg23_3, %y_cmg23_3 = pto.vldsx2 %ub_in_cmg23_3[%offset_cmg23_3], "DINTLV_B32" + : !pto.ptr, index -> !pto.vreg<64xf32>, !pto.vreg<64xf32> + pto.vsts %x_cmg23_3, %ub_out_cmg23_3[%offset_cmg23_3], %mask_cmg23_3 {dist = "NORM_B32"} + : !pto.vreg<64xf32>, !pto.ptr, !pto.mask + pto.vsts %y_cmg23_3, %ub_out_cmg23_3[%high_offset_cmg23_3], %mask_cmg23_3 {dist = "NORM_B32"} + : !pto.vreg<64xf32>, !pto.ptr, !pto.mask + } + } + } + + pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] + pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] + pto.mte_ub_gm %ub_out_cmg23_3, %arg1, %c128_i64_cmg23_3 + nburst(%c32_i64_cmg23_3, %c128_i64_cmg23_3, %c128_i64_cmg23_3) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64 + pto.barrier #pto.pipe + } + + // inactive cross-case merge from micro-op/vector-load-store/vsldb + scf.if %__case_merge_guard { + + %c0_cmg23_4 = arith.constant 0 : index + %c1_i64_cmg23_4 = arith.constant 1 : i64 + %c0_i64_cmg23_4 = arith.constant 0 : i64 + %c2_i16_cmg23_4 = arith.constant 2 : i16 + %c4_i16_cmg23_4 = arith.constant 4 : i16 + %c32_i64_cmg23_4 = arith.constant 32 : i64 + 
%c64_i32_cmg23_4 = arith.constant 64 : i32 + %c128_i64_cmg23_4 = arith.constant 128 : i64 + %c4096_i64_cmg23_4 = arith.constant 4096 : i64 + + %ub_in_cmg23_4 = pto.castptr %c0_i64_cmg23_4 : i64 -> !pto.ptr + %ub_out_cmg23_4 = pto.castptr %c4096_i64_cmg23_4 : i64 -> !pto.ptr + + %false_cmg23_4 = arith.constant false + pto.mte_gm_ub %arg0, %ub_in_cmg23_4, %c0_i64_cmg23_4, %c128_i64_cmg23_4 + nburst(%c32_i64_cmg23_4, %c128_i64_cmg23_4, %c128_i64_cmg23_4) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 + + pto.set_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] + pto.wait_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] + + %c1_cmg23_4 = arith.constant 1 : index + pto.vecscope { + scf.for %iv_cmg23_4 = %c0_cmg23_4 to %c1_cmg23_4 step %c1_cmg23_4 { + %mask_cmg23_4, %next_remaining_cmg23_4 = pto.plt_b32 %c64_i32_cmg23_4 : i32 -> !pto.mask, i32 + %loaded_cmg23_4 = pto.vsldb %ub_in_cmg23_4, %c2_i16_cmg23_4, %c4_i16_cmg23_4, %mask_cmg23_4 : !pto.ptr, i16, i16, !pto.mask -> !pto.vreg<64xf32> + pto.vsts %loaded_cmg23_4, %ub_out_cmg23_4[%c0_cmg23_4], %mask_cmg23_4 : !pto.vreg<64xf32>, !pto.ptr, !pto.mask + } + } + + pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] + pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] + pto.mte_ub_gm %ub_out_cmg23_4, %arg1, %c128_i64_cmg23_4 + nburst(%c32_i64_cmg23_4, %c128_i64_cmg23_4, %c128_i64_cmg23_4) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64 + pto.barrier #pto.pipe + } + + // inactive cross-case merge from micro-op/vector-load-store/vsstb + scf.if %__case_merge_guard { + + %c0_cmg23_5 = arith.constant 0 : index + %c1_i64_cmg23_5 = arith.constant 1 : i64 + %c0_i64_cmg23_5 = arith.constant 0 : i64 + %c2_i16_cmg23_5 = arith.constant 2 : i16 + %c4_i16_cmg23_5 = arith.constant 4 : i16 + %c32_i64_cmg23_5 = arith.constant 32 : i64 + %c64_i32_cmg23_5 = arith.constant 64 : i32 + %c128_i64_cmg23_5 = arith.constant 128 : i64 + %c4096_i64_cmg23_5 = arith.constant 4096 : i64 + + %ub_in_cmg23_5 = pto.castptr %c0_i64_cmg23_5 : i64 -> !pto.ptr + %ub_out_cmg23_5 = 
pto.castptr %c4096_i64_cmg23_5 : i64 -> !pto.ptr + + %false_cmg23_5 = arith.constant false + pto.mte_gm_ub %arg0, %ub_in_cmg23_5, %c0_i64_cmg23_5, %c128_i64_cmg23_5 + nburst(%c32_i64_cmg23_5, %c128_i64_cmg23_5, %c128_i64_cmg23_5) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 + + pto.set_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] + pto.wait_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] + + %c1_cmg23_5 = arith.constant 1 : index + pto.vecscope { + scf.for %iv_cmg23_5 = %c0_cmg23_5 to %c1_cmg23_5 step %c1_cmg23_5 { + %mask_cmg23_5, %next_remaining_cmg23_5 = pto.plt_b32 %c64_i32_cmg23_5 : i32 -> !pto.mask, i32 + %value_cmg23_5 = pto.vlds %ub_in_cmg23_5[%c0_cmg23_5] : !pto.ptr -> !pto.vreg<64xf32> + pto.vsstb %value_cmg23_5, %ub_out_cmg23_5, %c2_i16_cmg23_5, %c4_i16_cmg23_5, %mask_cmg23_5 : !pto.vreg<64xf32>, !pto.ptr, i16, i16, !pto.mask + pto.mem_bar "VST_VLD" + %roundtrip_cmg23_5 = pto.vsldb %ub_out_cmg23_5, %c2_i16_cmg23_5, %c4_i16_cmg23_5, %mask_cmg23_5 : !pto.ptr, i16, i16, !pto.mask -> !pto.vreg<64xf32> + pto.vsts %roundtrip_cmg23_5, %ub_in_cmg23_5[%c0_cmg23_5], %mask_cmg23_5 : !pto.vreg<64xf32>, !pto.ptr, !pto.mask + } + } + + pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] + pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] + pto.mte_ub_gm %ub_in_cmg23_5, %arg1, %c128_i64_cmg23_5 + nburst(%c32_i64_cmg23_5, %c128_i64_cmg23_5, %c128_i64_cmg23_5) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64 + pto.barrier #pto.pipe + } + + // inactive cross-case merge from micro-op/vector-load-store/vstar + scf.if %__case_merge_guard { + + %c0_cmg23_6 = arith.constant 0 : index + %c1_cmg23_6 = arith.constant 1 : index + %c0_i64_cmg23_6 = arith.constant 0 : i64 + %c1_i64_cmg23_6 = arith.constant 1 : i64 + %c8_i32_cmg23_6 = arith.constant 8 : i32 + %c32_i64_cmg23_6 = arith.constant 32 : i64 + %c128_i64_cmg23_6 = arith.constant 128 : i64 + %c4096_i64_cmg23_6 = arith.constant 4096 : i64 + %c1_elem_cmg23_6 = arith.constant 1 : index + + %ub_in_cmg23_6 = pto.castptr %c0_i64_cmg23_6 : i64 -> 
!pto.ptr + %ub_out_cmg23_6 = pto.castptr %c4096_i64_cmg23_6 : i64 -> !pto.ptr + %ub_out1_cmg23_6 = pto.addptr %ub_out_cmg23_6, %c1_elem_cmg23_6 : !pto.ptr -> !pto.ptr + + %false_cmg23_6 = arith.constant false + pto.mte_gm_ub %arg0, %ub_in_cmg23_6, %c0_i64_cmg23_6, %c128_i64_cmg23_6 + nburst(%c32_i64_cmg23_6, %c128_i64_cmg23_6, %c128_i64_cmg23_6) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 + + pto.set_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] + pto.wait_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] + + pto.vecscope { + pto.sprclr "AR" + scf.for %iter_cmg23_6 = %c0_cmg23_6 to %c1_cmg23_6 step %c1_cmg23_6 { + %vec_cmg23_6 = pto.vlds %ub_in_cmg23_6[%c0_cmg23_6] : !pto.ptr -> !pto.vreg<64xf32> + %mask_cmg23_6, %unused_cmg23_6 = pto.plt_b32 %c8_i32_cmg23_6 : i32 -> !pto.mask, i32 + %sqz_cmg23_6 = pto.vsqz %vec_cmg23_6, %mask_cmg23_6 : !pto.vreg<64xf32>, !pto.mask -> !pto.vreg<64xf32> + %align0_cmg23_6 = pto.init_align : !pto.align + %align1_cmg23_6 = pto.vstur %align0_cmg23_6, %sqz_cmg23_6, %ub_out1_cmg23_6, "POST_UPDATE" + : !pto.align, !pto.vreg<64xf32>, !pto.ptr -> !pto.align + pto.vstar %align1_cmg23_6, %ub_out1_cmg23_6 : !pto.align, !pto.ptr + } + } + + pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] + pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] + pto.mte_ub_gm %ub_out_cmg23_6, %arg1, %c128_i64_cmg23_6 + nburst(%c32_i64_cmg23_6, %c128_i64_cmg23_6, %c128_i64_cmg23_6) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64 + pto.barrier #pto.pipe + } + + // inactive cross-case merge from micro-op/vector-load-store/vstas-vstus-offset-update + scf.if %__case_merge_guard { + + %c0_cmg23_7 = arith.constant 0 : index + %c1_cmg23_7 = arith.constant 1 : index + %c64_cmg23_7 = arith.constant 64 : index + %c0_i64_cmg23_7 = arith.constant 0 : i64 + %c1_i64_cmg23_7 = arith.constant 1 : i64 + %c32_i64_cmg23_7 = arith.constant 32 : i64 + %c128_i64_cmg23_7 = arith.constant 128 : i64 + %c4096_i64_cmg23_7 = arith.constant 4096 : i64 + %c3_i32_cmg23_7 = arith.constant 3 : i32 + 
%c0_i32_cmg23_7 = arith.constant 0 : i32 + + %ub_in_cmg23_7 = pto.castptr %c0_i64_cmg23_7 : i64 -> !pto.ptr + %ub_out_cmg23_7 = pto.castptr %c4096_i64_cmg23_7 : i64 -> !pto.ptr + + %false_cmg23_7 = arith.constant false + pto.mte_gm_ub %arg0, %ub_in_cmg23_7, %c0_i64_cmg23_7, %c128_i64_cmg23_7 + nburst(%c32_i64_cmg23_7, %c128_i64_cmg23_7, %c128_i64_cmg23_7) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 + pto.mte_gm_ub %arg1, %ub_out_cmg23_7, %c0_i64_cmg23_7, %c128_i64_cmg23_7 + nburst(%c32_i64_cmg23_7, %c128_i64_cmg23_7, %c128_i64_cmg23_7) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 + + pto.set_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] + pto.wait_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] + + pto.vecscope { + scf.for %offset_cmg23_7 = %c0_cmg23_7 to %c64_cmg23_7 step %c64_cmg23_7 { + %align0_cmg23_7 = pto.init_align : !pto.align + %vec_cmg23_7 = pto.vlds %ub_in_cmg23_7[%c0_cmg23_7] : !pto.ptr -> !pto.vreg<64xf32> + %align1_cmg23_7 = pto.vstus %align0_cmg23_7, %c3_i32_cmg23_7, %vec_cmg23_7, %ub_out_cmg23_7 + : !pto.align, i32, !pto.vreg<64xf32>, !pto.ptr -> !pto.align + pto.vstas %align1_cmg23_7, %ub_out_cmg23_7, %c3_i32_cmg23_7 : !pto.align, !pto.ptr, i32 + } + } + + pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] + pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] + pto.mte_ub_gm %ub_out_cmg23_7, %arg1, %c128_i64_cmg23_7 + nburst(%c32_i64_cmg23_7, %c128_i64_cmg23_7, %c128_i64_cmg23_7) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64 + pto.barrier #pto.pipe + } + + // inactive cross-case merge from micro-op/vector-load-store/vsts-pk-b64-f32 + scf.if %__case_merge_guard { + + %c0_cmg23_8 = arith.constant 0 : index + %c64_cmg23_8 = arith.constant 64 : index + %c1024_cmg23_8 = arith.constant 1024 : index + %c0_i64_cmg23_8 = arith.constant 0 : i64 + %c1_i64_cmg23_8 = arith.constant 1 : i64 + %c32_i64_cmg23_8 = arith.constant 32 : i64 + %c128_i64_cmg23_8 = arith.constant 128 : i64 + %c4096_i64_cmg23_8 = arith.constant 4096 : i64 + %c1024_i32_cmg23_8 = arith.constant 1024 : 
i32 + + %ub_in_cmg23_8 = pto.castptr %c0_i64_cmg23_8 : i64 -> !pto.ptr + %ub_out_cmg23_8 = pto.castptr %c4096_i64_cmg23_8 : i64 -> !pto.ptr + + %false_cmg23_8 = arith.constant false + pto.mte_gm_ub %arg0, %ub_in_cmg23_8, %c0_i64_cmg23_8, %c128_i64_cmg23_8 + nburst(%c32_i64_cmg23_8, %c128_i64_cmg23_8, %c128_i64_cmg23_8) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 + pto.mte_gm_ub %arg1, %ub_out_cmg23_8, %c0_i64_cmg23_8, %c128_i64_cmg23_8 + nburst(%c32_i64_cmg23_8, %c128_i64_cmg23_8, %c128_i64_cmg23_8) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 + + pto.set_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] + pto.wait_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] + + pto.vecscope { + %__cmg23_8:1 = scf.for %offset_cmg23_8 = %c0_cmg23_8 to %c1024_cmg23_8 step %c64_cmg23_8 iter_args(%remaining_cmg23_8 = %c1024_i32_cmg23_8) -> (i32) { + %mask_cmg23_8, %next_remaining_cmg23_8 = pto.plt_b32 %remaining_cmg23_8 : i32 -> !pto.mask, i32 + %vec_cmg23_8 = pto.vlds %ub_in_cmg23_8[%offset_cmg23_8] : !pto.ptr -> !pto.vreg<64xf32> + pto.vsts %vec_cmg23_8, %ub_out_cmg23_8[%offset_cmg23_8], %mask_cmg23_8 {dist = "PK_B64"} : !pto.vreg<64xf32>, !pto.ptr, !pto.mask + scf.yield %next_remaining_cmg23_8 : i32 + } + } + + pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] + pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] + pto.mte_ub_gm %ub_out_cmg23_8, %arg1, %c128_i64_cmg23_8 + nburst(%c32_i64_cmg23_8, %c128_i64_cmg23_8, %c128_i64_cmg23_8) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64 + pto.barrier #pto.pipe + } + + // inactive cross-case merge from micro-op/vector-load-store/vstsx2-layout-check + scf.if %__case_merge_guard { + + %c0_cmg23_9 = arith.constant 0 : index + %c64_cmg23_9 = arith.constant 64 : index + %c1_i64_cmg23_9 = arith.constant 1 : i64 + %c0_i64_cmg23_9 = arith.constant 0 : i64 + %c32_i64_cmg23_9 = arith.constant 32 : i64 + %c128_i64_cmg23_9 = arith.constant 128 : i64 + %c4096_i64_cmg23_9 = arith.constant 4096 : i64 + %c64_i32_cmg23_9 = arith.constant 64 : i32 + %false_cmg23_9 = 
arith.constant false + + %ub_in_cmg23_9 = pto.castptr %c0_i64_cmg23_9 : i64 -> !pto.ptr + %ub_out_cmg23_9 = pto.castptr %c4096_i64_cmg23_9 : i64 -> !pto.ptr + pto.mte_gm_ub %arg0, %ub_in_cmg23_9, %c0_i64_cmg23_9, %c128_i64_cmg23_9 + nburst(%c32_i64_cmg23_9, %c128_i64_cmg23_9, %c128_i64_cmg23_9) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 + + pto.set_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] + pto.wait_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] + + %c1_cmg23_9 = arith.constant 1 : index + pto.vecscope { + scf.for %iv_cmg23_9 = %c0_cmg23_9 to %c1_cmg23_9 step %c1_cmg23_9 { + %mask_cmg23_9, %remaining_cmg23_9 = pto.plt_b32 %c64_i32_cmg23_9 : i32 -> !pto.mask, i32 + %x_cmg23_9 = pto.vlds %ub_in_cmg23_9[%c0_cmg23_9] : !pto.ptr -> !pto.vreg<64xf32> + %y_cmg23_9 = pto.vlds %ub_in_cmg23_9[%c64_cmg23_9] : !pto.ptr -> !pto.vreg<64xf32> + pto.vstsx2 %x_cmg23_9, %y_cmg23_9, %ub_out_cmg23_9[%c0_cmg23_9], "INTLV_B32", %mask_cmg23_9 + : !pto.vreg<64xf32>, !pto.vreg<64xf32>, !pto.ptr, index, + !pto.mask + } + } + + pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] + pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] + pto.mte_ub_gm %ub_out_cmg23_9, %arg1, %c128_i64_cmg23_9 + nburst(%c32_i64_cmg23_9, %c128_i64_cmg23_9, %c128_i64_cmg23_9) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64 + pto.barrier #pto.pipe + } return } } diff --git a/test/vpto/cases/micro-op/vector-load-store/vlds-brc-b8-f32/compare.py b/test/vpto/cases/micro-op/vector-load-store/vlds-brc-b8-f32/compare.py deleted file mode 100644 index 4c19eb038..000000000 --- a/test/vpto/cases/micro-op/vector-load-store/vlds-brc-b8-f32/compare.py +++ /dev/null @@ -1,40 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) 2026 Huawei Technologies Co., Ltd. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. 
-# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. - - -import os -import sys - -import numpy as np - - -def compare_bin(golden_path, output_path, dtype, eps): - if not os.path.exists(golden_path) or not os.path.exists(output_path): - return False - golden = np.fromfile(golden_path, dtype=dtype) - output = np.fromfile(output_path, dtype=dtype) - return golden.shape == output.shape and np.allclose( - golden, output, atol=eps, rtol=eps, equal_nan=True - ) - - -def main(): - strict = os.getenv("COMPARE_STRICT", "1") != "0" - ok = compare_bin("golden_v2.bin", "v2.bin", np.float32, 0.0) - if not ok: - if strict: - print("[ERROR] compare failed") - sys.exit(2) - print("[WARN] compare failed (non-gating)") - return - print("[INFO] compare passed") - - -if __name__ == "__main__": - main() diff --git a/test/vpto/cases/micro-op/vector-load-store/vlds-brc-b8-f32/golden.py b/test/vpto/cases/micro-op/vector-load-store/vlds-brc-b8-f32/golden.py deleted file mode 100644 index be66bd820..000000000 --- a/test/vpto/cases/micro-op/vector-load-store/vlds-brc-b8-f32/golden.py +++ /dev/null @@ -1,56 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) 2026 Huawei Technologies Co., Ltd. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. 
-# See LICENSE in the root of the software repository for the full text of the License. - -# case: micro-op/vector-load-store/vlds-brc-b8-f32 -# family: vector-load-store -# target_ops: pto.vlds -# scenarios: core-f32, full-mask, aligned, dist-brc-b8, width-agnostic-dist - -import argparse -from pathlib import Path - -import numpy as np - - -ELEMENTS = 1024 -LANES = 64 -SEED = 19 - - -def generate(output_dir: Path, seed: int) -> None: - rng = np.random.default_rng(seed) - v1 = rng.uniform(-8.0, 8.0, size=(ELEMENTS,)).astype(np.float32) - v2 = np.zeros((ELEMENTS,), dtype=np.float32) - - src_bytes = v1.view(np.uint8) - golden_bytes = np.zeros_like(src_bytes) - chunk_bytes = LANES * 4 - for offset in range(0, src_bytes.size, chunk_bytes): - pattern = src_bytes[offset] - golden_bytes[offset : offset + chunk_bytes] = pattern - golden_v2 = golden_bytes.view(np.float32) - - output_dir.mkdir(parents=True, exist_ok=True) - v1.tofile(output_dir / "v1.bin") - v2.tofile(output_dir / "v2.bin") - golden_v2.tofile(output_dir / "golden_v2.bin") - - -def main() -> None: - parser = argparse.ArgumentParser( - description="Generate numpy-based inputs/golden for VPTO micro-op vlds BRC_B8 on f32 validation." 
- ) - parser.add_argument("--output-dir", type=Path, default=Path(".")) - parser.add_argument("--seed", type=int, default=SEED) - args = parser.parse_args() - generate(args.output_dir, args.seed) - - -if __name__ == "__main__": - main() diff --git a/test/vpto/cases/micro-op/vector-load-store/vlds-brc-b8-f32/kernel.pto b/test/vpto/cases/micro-op/vector-load-store/vlds-brc-b8-f32/kernel.pto deleted file mode 100644 index fac3510de..000000000 --- a/test/vpto/cases/micro-op/vector-load-store/vlds-brc-b8-f32/kernel.pto +++ /dev/null @@ -1,47 +0,0 @@ -// ----------------------------------------------------------------------------- -// case: micro-op/vector-load-store/vlds-brc-b8-f32 -// family: vector-load-store -// target_ops: pto.vlds -// scenarios: core-f32, full-mask, aligned, dist-brc-b8, width-agnostic-dist -// ----------------------------------------------------------------------------- -module attributes {pto.target_arch = "a5", pto.kernel_kind = #pto.kernel_kind} { - func.func @vlds_brc_b8_f32_kernel(%arg0: !pto.ptr, %arg1: !pto.ptr) attributes {pto.kernel} { - %c0 = arith.constant 0 : index - %c64 = arith.constant 64 : index - %c1024 = arith.constant 1024 : index - %c0_i64 = arith.constant 0 : i64 - %c1_i64 = arith.constant 1 : i64 - %c32_i64 = arith.constant 32 : i64 - %c128_i64 = arith.constant 128 : i64 - %c4096_i64 = arith.constant 4096 : i64 - %c1024_i32 = arith.constant 1024 : i32 - - %ub_in = pto.castptr %c0_i64 : i64 -> !pto.ptr - %ub_out = pto.castptr %c4096_i64 : i64 -> !pto.ptr - - %false = arith.constant false - pto.mte_gm_ub %arg0, %ub_in, %c0_i64, %c128_i64 - nburst(%c32_i64, %c128_i64, %c128_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 - - pto.set_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] - pto.wait_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] - - pto.vecscope { - %_:1 = scf.for %offset = %c0 to %c1024 step %c64 iter_args(%remaining = %c1024_i32) -> (i32) { - %mask, %next_remaining = pto.plt_b32 %remaining : i32 -> !pto.mask, i32 - %out = 
pto.vlds %ub_in[%offset] {dist = "BRC_B8"} : !pto.ptr -> !pto.vreg<64xf32> - pto.vsts %out, %ub_out[%offset], %mask : !pto.vreg<64xf32>, !pto.ptr, !pto.mask - scf.yield %next_remaining : i32 - } - } - - pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.mte_ub_gm %ub_out, %arg1, %c128_i64 - nburst(%c32_i64, %c128_i64, %c128_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64 - pto.barrier #pto.pipe - return - } -} diff --git a/test/vpto/cases/micro-op/vector-load-store/vlds-brc-b8-f32/launch.cpp b/test/vpto/cases/micro-op/vector-load-store/vlds-brc-b8-f32/launch.cpp deleted file mode 100644 index 4628e9dc1..000000000 --- a/test/vpto/cases/micro-op/vector-load-store/vlds-brc-b8-f32/launch.cpp +++ /dev/null @@ -1,45 +0,0 @@ -// Copyright (c) 2026 Huawei Technologies Co., Ltd. -// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). -// Please refer to the License for details. You may not use this file except in compliance with the License. -// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -// See LICENSE in the root of the software repository for the full text of the License. 
- -#ifndef __VEC_SCOPE__ -#define __VEC_SCOPE__ -#endif - -#if defined(__CCE_AICORE__) && defined(__NPU_ARCH__) && (__NPU_ARCH__ == 2201) -typedef struct { unsigned char v; } hifloat8_t; -typedef struct { unsigned char v; } float8_e4m3_t; -typedef struct { unsigned char v; } float8_e5m2_t; -typedef struct { unsigned char v; } float8_e8m0_t; -typedef struct { unsigned char v; } float4_e1m2x2_t; -typedef struct { unsigned char v; } float4_e2m1x2_t; -#endif -#include - -#if defined(__CCE_AICORE__) && defined(PTOAS_ENABLE_CCE_PRINT) -#include -#endif - -#if !defined(__CCE_AICORE__) && !defined(TMRGSORT_HPP) -struct MrgSortExecutedNumList { - uint16_t mrgSortList0; - uint16_t mrgSortList1; - uint16_t mrgSortList2; - uint16_t mrgSortList3; -}; -#endif -#ifndef __CPU_SIM -#include "acl/acl.h" -#endif - -extern "C" __global__ [aicore] void vlds_brc_b8_f32_kernel(__gm__ float *v1, - __gm__ float *v2); - -void LaunchVlds_brc_b8_f32_kernel(float *v1, float *v2, void *stream) { - vlds_brc_b8_f32_kernel<<<1, nullptr, stream>>>((__gm__ float *)v1, (__gm__ float *)v2); -} - diff --git a/test/vpto/cases/micro-op/vector-load-store/vlds-brc-b8-f32/main.cpp b/test/vpto/cases/micro-op/vector-load-store/vlds-brc-b8-f32/main.cpp deleted file mode 100644 index bf2d99510..000000000 --- a/test/vpto/cases/micro-op/vector-load-store/vlds-brc-b8-f32/main.cpp +++ /dev/null @@ -1,122 +0,0 @@ -// Copyright (c) 2026 Huawei Technologies Co., Ltd. -// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). -// Please refer to the License for details. You may not use this file except in compliance with the License. -// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. 
-// See LICENSE in the root of the software repository for the full text of the License. - -/** -Copyright (c) 2025 Huawei Technologies Co., Ltd. -This program is free software, you can redistribute it and/or modify it under the terms and conditions of -CANN Open Software License Agreement Version 2.0 (the "License"). -Please refer to the License for details. You may not use this file except in compliance with the License. -THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -See LICENSE in the root of the software repository for the full text of the License. -*/ - -#include "test_common.h" -#include "acl/acl.h" -#include -#include -#include - -using namespace PtoTestCommon; - -#ifndef TMRGSORT_HPP -struct MrgSortExecutedNumList { - uint16_t mrgSortList0; - uint16_t mrgSortList1; - uint16_t mrgSortList2; - uint16_t mrgSortList3; -}; -#endif - -#define ACL_CHECK(expr) \ - do { \ - const aclError _ret = (expr); \ - if (_ret != ACL_SUCCESS) { \ - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", #expr, (int)_ret, __FILE__, __LINE__); \ - const char *_recent = aclGetRecentErrMsg(); \ - if (_recent != nullptr && _recent[0] != '\0') { \ - std::fprintf(stderr, "[ERROR] RecentErrMsg: %s\n", _recent); \ - } \ - rc = 1; \ - goto cleanup; \ - } \ - } while (0) - -void LaunchVlds_brc_b8_f32_kernel(float *v1, float *v2, void *stream); - -int main() { - size_t elemCount_v1 = 1024; - size_t fileSize_v1 = elemCount_v1 * sizeof(float); - size_t elemCount_v2 = 1024; - size_t fileSize_v2 = elemCount_v2 * sizeof(float); - float *v1Host = nullptr; - float *v1Device = nullptr; - float *v2Host = nullptr; - float *v2Device = nullptr; - - int rc = 0; - bool aclInited = false; - bool deviceSet = false; - int deviceId = 0; - aclrtStream stream = nullptr; - - ACL_CHECK(aclInit(nullptr)); - aclInited = true; - if (const char 
*envDevice = std::getenv("ACL_DEVICE_ID")) { - deviceId = std::atoi(envDevice); - } - ACL_CHECK(aclrtSetDevice(deviceId)); - deviceSet = true; - ACL_CHECK(aclrtCreateStream(&stream)); - - ACL_CHECK(aclrtMallocHost((void **)(&v1Host), fileSize_v1)); - ACL_CHECK(aclrtMallocHost((void **)(&v2Host), fileSize_v2)); - ACL_CHECK(aclrtMalloc((void **)&v1Device, fileSize_v1, ACL_MEM_MALLOC_HUGE_FIRST)); - ACL_CHECK(aclrtMalloc((void **)&v2Device, fileSize_v2, ACL_MEM_MALLOC_HUGE_FIRST)); - - ReadFile("./v1.bin", fileSize_v1, v1Host, fileSize_v1); - ReadFile("./v2.bin", fileSize_v2, v2Host, fileSize_v2); - ACL_CHECK(aclrtMemcpy(v1Device, fileSize_v1, v1Host, fileSize_v1, ACL_MEMCPY_HOST_TO_DEVICE)); - ACL_CHECK(aclrtMemcpy(v2Device, fileSize_v2, v2Host, fileSize_v2, ACL_MEMCPY_HOST_TO_DEVICE)); - LaunchVlds_brc_b8_f32_kernel(v1Device, v2Device, stream); - - ACL_CHECK(aclrtSynchronizeStream(stream)); - ACL_CHECK(aclrtMemcpy(v2Host, fileSize_v2, v2Device, fileSize_v2, ACL_MEMCPY_DEVICE_TO_HOST)); - - WriteFile("./v2.bin", v2Host, fileSize_v2); - -cleanup: - aclrtFree(v1Device); - aclrtFree(v2Device); - aclrtFreeHost(v1Host); - aclrtFreeHost(v2Host); - if (stream != nullptr) { - const aclError _ret = aclrtDestroyStream(stream); - if (_ret != ACL_SUCCESS) { - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", - "aclrtDestroyStream(stream)", (int)_ret, __FILE__, __LINE__); - } - stream = nullptr; - } - if (deviceSet) { - const aclError _ret = aclrtResetDevice(deviceId); - if (_ret != ACL_SUCCESS) { - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", - "aclrtResetDevice(deviceId)", (int)_ret, __FILE__, __LINE__); - } - } - if (aclInited) { - const aclError _ret = aclFinalize(); - if (_ret != ACL_SUCCESS) { - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", - "aclFinalize()", (int)_ret, __FILE__, __LINE__); - } - } - - return rc; -} diff --git a/test/vpto/cases/micro-op/vector-load-store/vlds-dma-loop/compare.py 
b/test/vpto/cases/micro-op/vector-load-store/vlds-dma-loop/compare.py deleted file mode 100644 index 81cfc5edb..000000000 --- a/test/vpto/cases/micro-op/vector-load-store/vlds-dma-loop/compare.py +++ /dev/null @@ -1,53 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) 2026 Huawei Technologies Co., Ltd. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. - -import os -import sys - -import numpy as np - - -def compare_bin(golden_path, output_path, dtype, eps): - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - dtype_np = np.dtype(dtype) - golden = np.fromfile(golden_path, dtype=dtype_np) - output = np.fromfile(output_path, dtype=dtype_np) - if golden.shape != output.shape: - print(f"[ERROR] Shape mismatch: {golden.shape} vs {output.shape}") - return False - if not np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True): - abs_diff = np.abs(golden.astype(np.float64) - output.astype(np.float64)) - idx = int(np.argmax(abs_diff)) - print( - f"[ERROR] Mismatch at idx={idx}: golden={golden[idx]}, out={output[idx]}, " - f"diff={abs_diff[idx]}" - ) - return False - return True - - -def main(): - strict = os.getenv("COMPARE_STRICT", "1") != "0" - ok = compare_bin("golden_v2.bin", "v2.bin", np.float32, 1e-4) - if not ok: - if strict: - print("[ERROR] compare failed") - sys.exit(2) - 
print("[WARN] compare failed (non-gating)") - return - print("[INFO] compare passed") - - -if __name__ == "__main__": - main() diff --git a/test/vpto/cases/micro-op/vector-load-store/vlds-dma-loop/golden.py b/test/vpto/cases/micro-op/vector-load-store/vlds-dma-loop/golden.py deleted file mode 100644 index ee6929e7f..000000000 --- a/test/vpto/cases/micro-op/vector-load-store/vlds-dma-loop/golden.py +++ /dev/null @@ -1,47 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) 2026 Huawei Technologies Co., Ltd. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. - -import argparse -from pathlib import Path - -import numpy as np - - -ROWS = 8 -INPUT_COLS = 56 -OUTPUT_COLS = 64 -PAD_VALUE = 1.0 -SEED = 19 - - -def generate(output_dir: Path, seed: int) -> None: - rng = np.random.default_rng(seed) - v1 = rng.uniform(-8.0, 8.0, size=(ROWS, INPUT_COLS)).astype(np.float32) - v2 = np.zeros((ROWS, OUTPUT_COLS), dtype=np.float32) - golden_v2 = np.full((ROWS, OUTPUT_COLS), PAD_VALUE, dtype=np.float32) - golden_v2[:, :INPUT_COLS] = v1 - - output_dir.mkdir(parents=True, exist_ok=True) - v1.reshape(-1).tofile(output_dir / "v1.bin") - v2.reshape(-1).tofile(output_dir / "v2.bin") - golden_v2.reshape(-1).tofile(output_dir / "golden_v2.bin") - - -def main() -> None: - parser = argparse.ArgumentParser( - description="Generate inputs/golden for VPTO micro-op vlds dma loop validation." 
- ) - parser.add_argument("--output-dir", type=Path, default=Path(".")) - parser.add_argument("--seed", type=int, default=SEED) - args = parser.parse_args() - generate(args.output_dir, args.seed) - - -if __name__ == "__main__": - main() diff --git a/test/vpto/cases/micro-op/vector-load-store/vlds-dma-loop/kernel.pto b/test/vpto/cases/micro-op/vector-load-store/vlds-dma-loop/kernel.pto deleted file mode 100644 index e452ea596..000000000 --- a/test/vpto/cases/micro-op/vector-load-store/vlds-dma-loop/kernel.pto +++ /dev/null @@ -1,67 +0,0 @@ -// ----------------------------------------------------------------------------- -// case: micro-op/vector-load-store/vlds-dma-loop -// family: vector-load-store -// target_ops: pto.vlds -// scenarios: core-f32, dma-loop-load-store, sw-loop-plus-hw-loop, full-mask, aligned, dist-norm -// ----------------------------------------------------------------------------- - -module attributes {pto.target_arch = "a5", pto.kernel_kind = #pto.kernel_kind} { - func.func @vlds_dma_loop_kernel(%arg0: !pto.ptr, - %arg1: !pto.ptr) attributes {pto.kernel} { - %c0 = arith.constant 0 : index - %c1 = arith.constant 1 : index - %c64 = arith.constant 64 : index - %c512 = arith.constant 512 : index - %c0_i64 = arith.constant 0 : i64 - %c1_i64 = arith.constant 1 : i64 - %c2_i64 = arith.constant 2 : i64 - %c8_i64 = arith.constant 8 : i64 - %c224_i64 = arith.constant 224 : i64 - %c256_i64 = arith.constant 256 : i64 - %c448_i64 = arith.constant 448 : i64 - %c512_i64 = arith.constant 512 : i64 - %c896_i64 = arith.constant 896 : i64 - %c1024_i64 = arith.constant 1024 : i64 - %c2048_i64 = arith.constant 2048 : i64 - %c512_i32 = arith.constant 512 : i32 - %pad = arith.constant 1.000000e+00 : f32 - - %ub_in = pto.castptr %c0_i64 : i64 -> !pto.ptr - %ub_out = pto.castptr %c2048_i64 : i64 -> !pto.ptr - - pto.mte_gm_ub %arg0, %ub_in, %c0_i64, %c224_i64 - nburst(%c1_i64, %c224_i64, %c256_i64) - loop(%c2_i64, %c224_i64, %c256_i64) - loop(%c2_i64, %c448_i64, 
%c512_i64) - loop(%c2_i64, %c896_i64, %c1024_i64) - pad(%pad, %c0_i64, %c8_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64, - loop i64, i64, i64, loop i64, i64, i64, loop i64, i64, i64, pad f32, i64, i64 - - pto.set_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] - pto.wait_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] - - pto.vecscope { - %_:1 = scf.for %offset = %c0 to %c512 step %c64 iter_args(%remaining = %c512_i32) -> (i32) { - %mask, %next_remaining = pto.plt_b32 %remaining : i32 -> !pto.mask, i32 - %value = pto.vlds %ub_in[%offset] {dist = "NORM"} : !pto.ptr -> !pto.vreg<64xf32> - pto.vsts %value, %ub_out[%offset], %mask : !pto.vreg<64xf32>, !pto.ptr, !pto.mask - scf.yield %next_remaining : i32 - } - } - - pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - - pto.mte_ub_gm %ub_out, %arg1, %c256_i64 - nburst(%c1_i64, %c256_i64, %c256_i64) - loop(%c2_i64, %c256_i64, %c256_i64) - loop(%c2_i64, %c512_i64, %c512_i64) - loop(%c2_i64, %c1024_i64, %c1024_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64, - loop i64, i64, i64, loop i64, i64, i64, loop i64, i64, i64 - - pto.barrier #pto.pipe - return - } -} diff --git a/test/vpto/cases/micro-op/vector-load-store/vlds-dma-loop/launch.cpp b/test/vpto/cases/micro-op/vector-load-store/vlds-dma-loop/launch.cpp deleted file mode 100644 index 3f59702eb..000000000 --- a/test/vpto/cases/micro-op/vector-load-store/vlds-dma-loop/launch.cpp +++ /dev/null @@ -1,41 +0,0 @@ -// Copyright (c) 2026 Huawei Technologies Co., Ltd. -// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). -// Please refer to the License for details. You may not use this file except in compliance with the License. 
-// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -// See LICENSE in the root of the software repository for the full text of the License. - -#ifndef __VEC_SCOPE__ -#define __VEC_SCOPE__ -#endif - -#if defined(__CCE_AICORE__) && defined(__NPU_ARCH__) && (__NPU_ARCH__ == 2201) -typedef struct { unsigned char v; } hifloat8_t; -typedef struct { unsigned char v; } float8_e4m3_t; -typedef struct { unsigned char v; } float8_e5m2_t; -typedef struct { unsigned char v; } float8_e8m0_t; -typedef struct { unsigned char v; } float4_e1m2x2_t; -typedef struct { unsigned char v; } float4_e2m1x2_t; -#endif -#include - -#if !defined(__CCE_AICORE__) && !defined(TMRGSORT_HPP) -struct MrgSortExecutedNumList { - uint16_t mrgSortList0; - uint16_t mrgSortList1; - uint16_t mrgSortList2; - uint16_t mrgSortList3; -}; -#endif -#ifndef __CPU_SIM -#include "acl/acl.h" -#endif - -extern "C" __global__ [aicore] void vlds_dma_loop_kernel(__gm__ float *v1, - __gm__ float *v2); - -void LaunchVlds_dma_loop_kernel(float *v1, float *v2, void *stream) { - vlds_dma_loop_kernel<<<1, nullptr, stream>>>((__gm__ float *)v1, - (__gm__ float *)v2); -} diff --git a/test/vpto/cases/micro-op/vector-load-store/vlds-dma-loop/main.cpp b/test/vpto/cases/micro-op/vector-load-store/vlds-dma-loop/main.cpp deleted file mode 100644 index 42d510325..000000000 --- a/test/vpto/cases/micro-op/vector-load-store/vlds-dma-loop/main.cpp +++ /dev/null @@ -1,103 +0,0 @@ -// Copyright (c) 2026 Huawei Technologies Co., Ltd. -// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). -// Please refer to the License for details. You may not use this file except in compliance with the License. 
-// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -// See LICENSE in the root of the software repository for the full text of the License. - -#include "test_common.h" -#include "acl/acl.h" -#include -#include - -using namespace PtoTestCommon; - -#define ACL_CHECK(expr) \ - do { \ - const aclError _ret = (expr); \ - if (_ret != ACL_SUCCESS) { \ - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", #expr, (int)_ret, __FILE__, __LINE__); \ - const char *_recent = aclGetRecentErrMsg(); \ - if (_recent != nullptr && _recent[0] != '\0') { \ - std::fprintf(stderr, "[ERROR] RecentErrMsg: %s\n", _recent); \ - } \ - rc = 1; \ - goto cleanup; \ - } \ - } while (0) - -void LaunchVlds_dma_loop_kernel(float *v1, float *v2, void *stream); - -int main() { - size_t elemCount_v1 = 448; - size_t fileSize_v1 = elemCount_v1 * sizeof(float); - size_t elemCount_v2 = 512; - size_t fileSize_v2 = elemCount_v2 * sizeof(float); - float *v1Host = nullptr; - float *v1Device = nullptr; - float *v2Host = nullptr; - float *v2Device = nullptr; - - int rc = 0; - bool aclInited = false; - bool deviceSet = false; - int deviceId = 0; - aclrtStream stream = nullptr; - - ACL_CHECK(aclInit(nullptr)); - aclInited = true; - if (const char *envDevice = std::getenv("ACL_DEVICE_ID")) { - deviceId = std::atoi(envDevice); - } - ACL_CHECK(aclrtSetDevice(deviceId)); - deviceSet = true; - ACL_CHECK(aclrtCreateStream(&stream)); - - ACL_CHECK(aclrtMallocHost((void **)(&v1Host), fileSize_v1)); - ACL_CHECK(aclrtMallocHost((void **)(&v2Host), fileSize_v2)); - ACL_CHECK(aclrtMalloc((void **)&v1Device, fileSize_v1, ACL_MEM_MALLOC_HUGE_FIRST)); - ACL_CHECK(aclrtMalloc((void **)&v2Device, fileSize_v2, ACL_MEM_MALLOC_HUGE_FIRST)); - - ReadFile("./v1.bin", fileSize_v1, v1Host, fileSize_v1); - ReadFile("./v2.bin", fileSize_v2, v2Host, fileSize_v2); - 
ACL_CHECK(aclrtMemcpy(v1Device, fileSize_v1, v1Host, fileSize_v1, ACL_MEMCPY_HOST_TO_DEVICE)); - ACL_CHECK(aclrtMemcpy(v2Device, fileSize_v2, v2Host, fileSize_v2, ACL_MEMCPY_HOST_TO_DEVICE)); - - LaunchVlds_dma_loop_kernel(v1Device, v2Device, stream); - - ACL_CHECK(aclrtSynchronizeStream(stream)); - ACL_CHECK(aclrtMemcpy(v2Host, fileSize_v2, v2Device, fileSize_v2, ACL_MEMCPY_DEVICE_TO_HOST)); - - WriteFile("./v2.bin", v2Host, fileSize_v2); - -cleanup: - aclrtFree(v1Device); - aclrtFree(v2Device); - aclrtFreeHost(v1Host); - aclrtFreeHost(v2Host); - if (stream != nullptr) { - const aclError _ret = aclrtDestroyStream(stream); - if (_ret != ACL_SUCCESS) { - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", - "aclrtDestroyStream(stream)", (int)_ret, __FILE__, __LINE__); - } - stream = nullptr; - } - if (deviceSet) { - const aclError _ret = aclrtResetDevice(deviceId); - if (_ret != ACL_SUCCESS) { - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", - "aclrtResetDevice(deviceId)", (int)_ret, __FILE__, __LINE__); - } - } - if (aclInited) { - const aclError _ret = aclFinalize(); - if (_ret != ACL_SUCCESS) { - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", - "aclFinalize()", (int)_ret, __FILE__, __LINE__); - } - } - - return rc; -} diff --git a/test/vpto/cases/micro-op/vector-load-store/vlds-ds-b16/compare.py b/test/vpto/cases/micro-op/vector-load-store/vlds-ds-b16/compare.py deleted file mode 100755 index e558d22f2..000000000 --- a/test/vpto/cases/micro-op/vector-load-store/vlds-ds-b16/compare.py +++ /dev/null @@ -1,210 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) 2026 Huawei Technologies Co., Ltd. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. 
-# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. - -# case: micro-op/vector-load-store/vlds-ds-b16 -# family: vector-load-store -# target_ops: pto.vlds -# scenarios: core-i16, full-mask, aligned, dist-ds-b16 -# NOTE: bulk-generated coverage skeleton. -# coding=utf-8 - -import os -import sys -import numpy as np - - -REPEAT_BYTES = 256 -PREFIX_ELEMS = 1024 - - -def _ceil_div(x, y): - return (x + y - 1) // y - - -def _packed_pred_storage_bytes(logical_elems, src_elem_bytes): - logical_elems = int(logical_elems) - src_elem_bytes = int(src_elem_bytes) - if logical_elems <= 0: - raise ValueError(f"logical_elems must be > 0, got {logical_elems}") - if src_elem_bytes not in (1, 2, 4): - raise ValueError(f"unsupported packed predicate source size: {src_elem_bytes}") - - repeat_elems = REPEAT_BYTES // src_elem_bytes - if src_elem_bytes == 4: - repeat_times = _ceil_div(logical_elems, repeat_elems) + 1 - loop_count = repeat_times // 2 - return loop_count * 16 - - repeat_times = _ceil_div(logical_elems, repeat_elems) - return repeat_times * (repeat_elems // 8) - - -def compare_bin(golden_path, output_path, dtype, eps): - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - dtype_np = np.dtype(dtype) - golden = np.fromfile(golden_path, dtype=dtype_np) - output = np.fromfile(output_path, dtype=dtype_np) - if golden.shape != output.shape: - print(f"[ERROR] Shape mismatch: {golden_path} {golden.shape} vs {output_path} {output.shape}") - return False - if not np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True): - if golden.size: - if np.issubdtype(dtype_np, np.floating): - 
g = golden.astype(np.float64, copy=False) - o = output.astype(np.float64, copy=False) - elif np.issubdtype(dtype_np, np.integer) or np.issubdtype(dtype_np, np.unsignedinteger): - g = golden.astype(np.int64, copy=False) - o = output.astype(np.int64, copy=False) - else: - g = golden.astype(np.float64, copy=False) - o = output.astype(np.float64, copy=False) - abs_diff = np.abs(g - o) - idx = int(np.argmax(abs_diff)) - diff = float(abs_diff[idx]) - print( - f"[ERROR] Mismatch: {golden_path} vs {output_path}, max diff={diff} at idx={idx} " - f"(golden={g[idx]}, out={o[idx]}, dtype={dtype_np})" - ) - else: - print(f"[ERROR] Mismatch: {golden_path} vs {output_path}, empty buffers, dtype={dtype_np}") - return False - return True - - -def compare_bin_prefix(golden_path, output_path, dtype, eps, count): - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - try: - count = int(count) - except Exception: - print(f"[ERROR] Invalid prefix count: {count}") - return False - if count <= 0: - print(f"[ERROR] Invalid prefix count: {count}") - return False - - dtype_np = np.dtype(dtype) - golden = np.fromfile(golden_path, dtype=dtype_np, count=count) - output = np.fromfile(output_path, dtype=dtype_np, count=count) - - if golden.size != count or output.size != count: - print( - f"[ERROR] Prefix read too small: need={count} elems, " - f"golden={golden.size}, out={output.size}" - ) - return False - - if not np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True): - if golden.size: - if np.issubdtype(dtype_np, np.floating): - g = golden.astype(np.float64, copy=False) - o = output.astype(np.float64, copy=False) - elif np.issubdtype(dtype_np, np.integer) or np.issubdtype(dtype_np, np.unsignedinteger): - g = golden.astype(np.int64, copy=False) - o = output.astype(np.int64, copy=False) - else: - g = golden.astype(np.float64, 
copy=False) - o = output.astype(np.float64, copy=False) - abs_diff = np.abs(g - o) - idx = int(np.argmax(abs_diff)) - diff = float(abs_diff[idx]) - print( - f"[ERROR] Mismatch (prefix): {golden_path} vs {output_path}, max diff={diff} at idx={idx} " - f"(golden={g[idx]}, out={o[idx]}, dtype={dtype_np}, count={count})" - ) - else: - print(f"[ERROR] Mismatch (prefix): {golden_path} vs {output_path}, empty buffers, dtype={dtype_np}") - return False - return True - - -def compare_packed_pred_mask(golden_path, output_path, logical_elems, src_elem_bytes): - """ - Compare outputs of pto.tcmp / pto.tcmps. - - PTO-ISA stores packed predicate results as a linear PK byte stream via - `psts`, with the exact written prefix length determined by the typed - TCMP/TCMPS repeat schedule. Compare only that semantic prefix. - """ - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - try: - logical_elems = int(logical_elems) - src_elem_bytes = int(src_elem_bytes) - except Exception: - print( - "[ERROR] Invalid packed mask compare arguments: " - f"logical_elems={logical_elems} src_elem_bytes={src_elem_bytes}" - ) - return False - if logical_elems <= 0 or src_elem_bytes <= 0: - print( - "[ERROR] Invalid packed mask compare arguments: " - f"logical_elems={logical_elems} src_elem_bytes={src_elem_bytes}" - ) - return False - - golden = np.fromfile(golden_path, dtype=np.uint8) - output = np.fromfile(output_path, dtype=np.uint8) - try: - prefix_bytes = _packed_pred_storage_bytes(logical_elems, src_elem_bytes) - except ValueError as exc: - print(f"[ERROR] {exc}") - return False - - if golden.size < prefix_bytes or output.size < prefix_bytes: - print( - f"[ERROR] Packed mask buffer too small: need={prefix_bytes} bytes, " - f"golden={golden.size}, out={output.size}" - ) - return False - - golden_sel = golden[:prefix_bytes] - output_sel = 
output[:prefix_bytes] - - if not np.array_equal(golden_sel, output_sel): - diff = np.nonzero(golden_sel != output_sel)[0] - idx = int(diff[0]) if diff.size else 0 - print( - f"[ERROR] Mismatch (packed mask): {golden_path} vs {output_path}, first diff at idx={idx} " - f"(golden={int(golden_sel[idx])}, out={int(output_sel[idx])}, " - f"logical_elems={logical_elems}, src_elem_bytes={src_elem_bytes}, prefix_bytes={prefix_bytes})" - ) - return False - return True - - -def main(): - strict = os.getenv("COMPARE_STRICT", "1") != "0" - ok = True - ok = compare_bin_prefix("golden_v2.bin", "v2.bin", np.int16, 0.0, PREFIX_ELEMS) and ok - if not ok: - if strict: - print("[ERROR] compare failed") - sys.exit(2) - print("[WARN] compare failed (non-gating)") - return - print("[INFO] compare passed") - - -if __name__ == "__main__": - main() diff --git a/test/vpto/cases/micro-op/vector-load-store/vlds-ds-b16/golden.py b/test/vpto/cases/micro-op/vector-load-store/vlds-ds-b16/golden.py deleted file mode 100755 index 63da9f605..000000000 --- a/test/vpto/cases/micro-op/vector-load-store/vlds-ds-b16/golden.py +++ /dev/null @@ -1,55 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) 2026 Huawei Technologies Co., Ltd. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. 
- -# case: micro-op/vector-load-store/vlds-ds-b16 -# family: vector-load-store -# target_ops: pto.vlds -# scenarios: core-i16, full-mask, aligned, dist-ds-b16 -# NOTE: DS on b16 keeps every other i16 element from a 256-element source window. -# coding=utf-8 - -import argparse -from pathlib import Path - -import numpy as np - - -ELEMENTS = 2048 -ACTIVE_ELEMS = 1024 -LANES = 128 -SOURCE_WINDOW = 256 -SEED = 19 - - -def generate(output_dir: Path, seed: int) -> None: - rng = np.random.default_rng(seed) - v1 = rng.integers(-(2**15), 2**15, size=(ELEMENTS,), dtype=np.int16) - v2 = np.zeros((ELEMENTS,), dtype=np.int16) - golden_v2 = np.zeros((ELEMENTS,), dtype=np.int16) - for offset in range(0, ACTIVE_ELEMS, LANES): - golden_v2[offset : offset + LANES] = v1[offset : offset + SOURCE_WINDOW : 2][:LANES] - - output_dir.mkdir(parents=True, exist_ok=True) - v1.tofile(output_dir / "v1.bin") - v2.tofile(output_dir / "v2.bin") - golden_v2.tofile(output_dir / "golden_v2.bin") - - -def main() -> None: - parser = argparse.ArgumentParser( - description="Generate numpy-based inputs/golden for VPTO micro-op vlds b16 downsample validation." 
- ) - parser.add_argument("--output-dir", type=Path, default=Path(".")) - parser.add_argument("--seed", type=int, default=SEED) - args = parser.parse_args() - generate(args.output_dir, args.seed) - - -if __name__ == "__main__": - main() diff --git a/test/vpto/cases/micro-op/vector-load-store/vlds-ds-b16/kernel.pto b/test/vpto/cases/micro-op/vector-load-store/vlds-ds-b16/kernel.pto deleted file mode 100644 index c9dedb3c7..000000000 --- a/test/vpto/cases/micro-op/vector-load-store/vlds-ds-b16/kernel.pto +++ /dev/null @@ -1,50 +0,0 @@ -// ----------------------------------------------------------------------------- -// case: micro-op/vector-load-store/vlds-ds-b16 -// family: vector-load-store -// target_ops: pto.vlds -// scenarios: core-i16, full-mask, aligned, dist-ds-b16 -// ----------------------------------------------------------------------------- -// Validate one representative `DS_B16` load on `b16`. - -module attributes {pto.target_arch = "a5", pto.kernel_kind = #pto.kernel_kind} { - func.func @vabs_kernel_2d(%arg0: !pto.ptr, %arg1: !pto.ptr) attributes {pto.kernel} { - %c0 = arith.constant 0 : index - %c1 = arith.constant 1 : index - %c128 = arith.constant 128 : index - %c1024 = arith.constant 1024 : index - %c0_i64 = arith.constant 0 : i64 - %c1_i64 = arith.constant 1 : i64 - %c32_i64 = arith.constant 32 : i64 - %c128_i64 = arith.constant 128 : i64 - %c4096_i64 = arith.constant 4096 : i64 - %c1024_i32 = arith.constant 1024 : i32 - - %ub_in = pto.castptr %c0_i64 : i64 -> !pto.ptr - %ub_out = pto.castptr %c4096_i64 : i64 -> !pto.ptr - - %false = arith.constant false - pto.mte_gm_ub %arg0, %ub_in, %c0_i64, %c128_i64 - nburst(%c32_i64, %c128_i64, %c128_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 - - pto.set_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] - pto.wait_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] - - pto.vecscope { - %_:1 = scf.for %offset = %c0 to %c1024 step %c128 iter_args(%remaining = %c1024_i32) -> (i32) { - %mask, %next_remaining = pto.plt_b16 
%remaining : i32 -> !pto.mask, i32 - %out = pto.vlds %ub_in[%offset] {dist = "DS_B16"} : !pto.ptr -> !pto.vreg<128xi16> - pto.vsts %out, %ub_out[%offset], %mask : !pto.vreg<128xi16>, !pto.ptr, !pto.mask - scf.yield %next_remaining : i32 - } - } - - pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.mte_ub_gm %ub_out, %arg1, %c128_i64 - nburst(%c32_i64, %c128_i64, %c128_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64 - pto.barrier #pto.pipe - return - } -} diff --git a/test/vpto/cases/micro-op/vector-load-store/vlds-ds-b16/launch.cpp b/test/vpto/cases/micro-op/vector-load-store/vlds-ds-b16/launch.cpp deleted file mode 100644 index 07ccb8b8d..000000000 --- a/test/vpto/cases/micro-op/vector-load-store/vlds-ds-b16/launch.cpp +++ /dev/null @@ -1,70 +0,0 @@ -// Copyright (c) 2026 Huawei Technologies Co., Ltd. -// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). -// Please refer to the License for details. You may not use this file except in compliance with the License. -// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -// See LICENSE in the root of the software repository for the full text of the License. - -// ----------------------------------------------------------------------------- -// case: micro-op/vector-load-store/vlds-ds-b16 -// family: vector-load-store -// target_ops: pto.vlds -// scenarios: core-i16, full-mask, aligned, dist-ds-b16 -// NOTE: bulk-generated coverage skeleton. Parser/verifier/lowering failure is -// still a valid test conclusion in the current coverage-first phase. 
-// ----------------------------------------------------------------------------- -// --------------------------------------------------------------------------- -// PTOAS compatibility layer -// -// The upstream pto-isa headers reference some FP8/FP4 types and the -// __VEC_SCOPE__ marker that are not available on every AICore arch/toolchain -// combination (e.g. __NPU_ARCH__==2201). -// -// For our PTOAS-generated kernels we don't rely on these types today, but the -// headers still mention them in templates/static_asserts. Provide minimal -// fallbacks to keep compilation working on dav-c220. -// --------------------------------------------------------------------------- -#ifndef __VEC_SCOPE__ -#define __VEC_SCOPE__ -#endif - -#if defined(__CCE_AICORE__) && defined(__NPU_ARCH__) && (__NPU_ARCH__ == 2201) -typedef struct { unsigned char v; } hifloat8_t; -typedef struct { unsigned char v; } float8_e4m3_t; -typedef struct { unsigned char v; } float8_e5m2_t; -typedef struct { unsigned char v; } float8_e8m0_t; -typedef struct { unsigned char v; } float4_e1m2x2_t; -typedef struct { unsigned char v; } float4_e2m1x2_t; -#endif -#include - -// AICore printf support is gated behind `--cce-enable-print` on some -// toolchains. When enabled, include the CCE print header so `cce::printf` -// resolves in device compilation. -#if defined(__CCE_AICORE__) && defined(PTOAS_ENABLE_CCE_PRINT) -#include -#endif - -// Some PTO-ISA types are only available in the __CCE_AICORE__ compilation -// path, but `bisheng -xcce` still performs a host-side parse pass. -// Provide minimal fallbacks only when the corresponding header wasn't -// pulled in by the selected arch implementation. 
-#if !defined(__CCE_AICORE__) && !defined(TMRGSORT_HPP) -struct MrgSortExecutedNumList { - uint16_t mrgSortList0; - uint16_t mrgSortList1; - uint16_t mrgSortList2; - uint16_t mrgSortList3; -}; -#endif -#ifndef __CPU_SIM -#include "acl/acl.h" -#endif - -extern "C" __global__ [aicore] void vabs_kernel_2d(__gm__ float *v1, - __gm__ float *v2); - -void LaunchVabs_kernel_2d(float *v1, float *v2, void *stream) { - vabs_kernel_2d<<<1, nullptr, stream>>>((__gm__ float *)v1, (__gm__ float *)v2); -} diff --git a/test/vpto/cases/micro-op/vector-load-store/vlds-ds-b16/main.cpp b/test/vpto/cases/micro-op/vector-load-store/vlds-ds-b16/main.cpp deleted file mode 100644 index 951256acb..000000000 --- a/test/vpto/cases/micro-op/vector-load-store/vlds-ds-b16/main.cpp +++ /dev/null @@ -1,130 +0,0 @@ -// Copyright (c) 2026 Huawei Technologies Co., Ltd. -// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). -// Please refer to the License for details. You may not use this file except in compliance with the License. -// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -// See LICENSE in the root of the software repository for the full text of the License. - -// ----------------------------------------------------------------------------- -// case: micro-op/vector-load-store/vlds-ds-b16 -// family: vector-load-store -// target_ops: pto.vlds -// scenarios: core-i16, full-mask, aligned, dist-ds-b16 -// NOTE: bulk-generated coverage skeleton. Parser/verifier/lowering failure is -// still a valid test conclusion in the current coverage-first phase. -// ----------------------------------------------------------------------------- -/** -Copyright (c) 2025 Huawei Technologies Co., Ltd. 
-This program is free software, you can redistribute it and/or modify it under the terms and conditions of -CANN Open Software License Agreement Version 2.0 (the "License"). -Please refer to the License for details. You may not use this file except in compliance with the License. -THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -See LICENSE in the root of the software repository for the full text of the License. -*/ - -#include "test_common.h" -#include "acl/acl.h" -#include -#include -#include - -using namespace PtoTestCommon; - -#ifndef TMRGSORT_HPP -struct MrgSortExecutedNumList { - uint16_t mrgSortList0; - uint16_t mrgSortList1; - uint16_t mrgSortList2; - uint16_t mrgSortList3; -}; -#endif - -#define ACL_CHECK(expr) \ - do { \ - const aclError _ret = (expr); \ - if (_ret != ACL_SUCCESS) { \ - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", #expr, (int)_ret, __FILE__, __LINE__); \ - const char *_recent = aclGetRecentErrMsg(); \ - if (_recent != nullptr && _recent[0] != '\0') { \ - std::fprintf(stderr, "[ERROR] RecentErrMsg: %s\n", _recent); \ - } \ - rc = 1; \ - goto cleanup; \ - } \ - } while (0) - -void LaunchVabs_kernel_2d(float *v1, float *v2, void *stream); - -int main() { - size_t elemCount_v1 = 1024; - size_t fileSize_v1 = elemCount_v1 * sizeof(float); - size_t elemCount_v2 = 1024; - size_t fileSize_v2 = elemCount_v2 * sizeof(float); - float *v1Host = nullptr; - float *v1Device = nullptr; - float *v2Host = nullptr; - float *v2Device = nullptr; - - int rc = 0; - bool aclInited = false; - bool deviceSet = false; - int deviceId = 0; - aclrtStream stream = nullptr; - - ACL_CHECK(aclInit(nullptr)); - aclInited = true; - if (const char *envDevice = std::getenv("ACL_DEVICE_ID")) { - deviceId = std::atoi(envDevice); - } - ACL_CHECK(aclrtSetDevice(deviceId)); - deviceSet = true; - 
ACL_CHECK(aclrtCreateStream(&stream)); - - ACL_CHECK(aclrtMallocHost((void **)(&v1Host), fileSize_v1)); - ACL_CHECK(aclrtMallocHost((void **)(&v2Host), fileSize_v2)); - ACL_CHECK(aclrtMalloc((void **)&v1Device, fileSize_v1, ACL_MEM_MALLOC_HUGE_FIRST)); - ACL_CHECK(aclrtMalloc((void **)&v2Device, fileSize_v2, ACL_MEM_MALLOC_HUGE_FIRST)); - - ReadFile("./v1.bin", fileSize_v1, v1Host, fileSize_v1); - ReadFile("./v2.bin", fileSize_v2, v2Host, fileSize_v2); - ACL_CHECK(aclrtMemcpy(v1Device, fileSize_v1, v1Host, fileSize_v1, ACL_MEMCPY_HOST_TO_DEVICE)); - ACL_CHECK(aclrtMemcpy(v2Device, fileSize_v2, v2Host, fileSize_v2, ACL_MEMCPY_HOST_TO_DEVICE)); - LaunchVabs_kernel_2d(v1Device, v2Device, stream); - - ACL_CHECK(aclrtSynchronizeStream(stream)); - ACL_CHECK(aclrtMemcpy(v2Host, fileSize_v2, v2Device, fileSize_v2, ACL_MEMCPY_DEVICE_TO_HOST)); - - WriteFile("./v2.bin", v2Host, fileSize_v2); - -cleanup: - aclrtFree(v1Device); - aclrtFree(v2Device); - aclrtFreeHost(v1Host); - aclrtFreeHost(v2Host); - if (stream != nullptr) { - const aclError _ret = aclrtDestroyStream(stream); - if (_ret != ACL_SUCCESS) { - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", - "aclrtDestroyStream(stream)", (int)_ret, __FILE__, __LINE__); - } - stream = nullptr; - } - if (deviceSet) { - const aclError _ret = aclrtResetDevice(deviceId); - if (_ret != ACL_SUCCESS) { - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", - "aclrtResetDevice(deviceId)", (int)_ret, __FILE__, __LINE__); - } - } - if (aclInited) { - const aclError _ret = aclFinalize(); - if (_ret != ACL_SUCCESS) { - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", - "aclFinalize()", (int)_ret, __FILE__, __LINE__); - } - } - - return rc; -} diff --git a/test/vpto/cases/micro-op/vector-load-store/vlds-post-update/kernel.pto b/test/vpto/cases/micro-op/vector-load-store/vlds-post-update/kernel.pto index 412096fe8..d762cbdf0 100644 --- a/test/vpto/cases/micro-op/vector-load-store/vlds-post-update/kernel.pto +++ 
b/test/vpto/cases/micro-op/vector-load-store/vlds-post-update/kernel.pto @@ -50,6 +50,87 @@ module attributes {pto.target_arch = "a5", pto.kernel_kind = #pto.kernel_kind, !pto.ptr, i64, i64, i64, i64 pto.barrier #pto.pipe + %__case_merge_guard = arith.constant false + + // inactive cross-case merge from micro-op/vector-load-store/vsstb-post-update + scf.if %__case_merge_guard { + + %c0_cmpost_1 = arith.constant 0 : index + %c0_i64_cmpost_1 = arith.constant 0 : i64 + %c2_i16_cmpost_1 = arith.constant 2 : i16 + %c4_i16_cmpost_1 = arith.constant 4 : i16 + %c32_i64_cmpost_1 = arith.constant 32 : i64 + %c64_i32_cmpost_1 = arith.constant 64 : i32 + %c128_i64_cmpost_1 = arith.constant 128 : i64 + %c4096_i64_cmpost_1 = arith.constant 4096 : i64 + + %ub_in_cmpost_1 = pto.castptr %c0_i64_cmpost_1 : i64 -> !pto.ptr + %ub_out_cmpost_1 = pto.castptr %c4096_i64_cmpost_1 : i64 -> !pto.ptr + + pto.mte_gm_ub %input, %ub_in_cmpost_1, %c0_i64_cmpost_1, %c128_i64_cmpost_1 + nburst(%c32_i64_cmpost_1, %c128_i64_cmpost_1, %c128_i64_cmpost_1) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 + + pto.set_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] + pto.wait_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] + + pto.vecscope { + %mask_cmpost_1, %next_remaining_cmpost_1 = pto.plt_b32 %c64_i32_cmpost_1 : i32 -> !pto.mask, i32 + %value_cmpost_1 = pto.vlds %ub_in_cmpost_1[%c0_cmpost_1] : !pto.ptr -> !pto.vreg<64xf32> + %updated_cmpost_1 = pto.vsstb %value_cmpost_1, %ub_out_cmpost_1, %c2_i16_cmpost_1, %c4_i16_cmpost_1, %mask_cmpost_1 : !pto.vreg<64xf32>, !pto.ptr, i16, i16, !pto.mask -> !pto.ptr + pto.mem_bar "VST_VLD" + %roundtrip_cmpost_1 = pto.vsldb %updated_cmpost_1, %c2_i16_cmpost_1, %c4_i16_cmpost_1, %mask_cmpost_1 : !pto.ptr, i16, i16, !pto.mask -> !pto.vreg<64xf32> + pto.vsts %roundtrip_cmpost_1, %ub_in_cmpost_1[%c0_cmpost_1], %mask_cmpost_1 : !pto.vreg<64xf32>, !pto.ptr, !pto.mask + } + + pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] + pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] + 
pto.mte_ub_gm %ub_in_cmpost_1, %output, %c128_i64_cmpost_1 + nburst(%c32_i64_cmpost_1, %c128_i64_cmpost_1, %c128_i64_cmpost_1) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64 + pto.barrier #pto.pipe + } + + // inactive cross-case merge from micro-op/vector-load-store/vsts-post-update + scf.if %__case_merge_guard { + + %c0_cmpost_2 = arith.constant 0 : index + %c64_cmpost_2 = arith.constant 64 : index + %c1024_cmpost_2 = arith.constant 1024 : index + %c0_i64_cmpost_2 = arith.constant 0 : i64 + %c32_i64_cmpost_2 = arith.constant 32 : i64 + %c128_i64_cmpost_2 = arith.constant 128 : i64 + %c4096_i64_cmpost_2 = arith.constant 4096 : i64 + %c1024_i32_cmpost_2 = arith.constant 1024 : i32 + + %ub_in_cmpost_2 = pto.castptr %c0_i64_cmpost_2 : i64 -> !pto.ptr + %ub_out_cmpost_2 = pto.castptr %c4096_i64_cmpost_2 : i64 -> !pto.ptr + + pto.mte_gm_ub %input, %ub_in_cmpost_2, %c0_i64_cmpost_2, %c128_i64_cmpost_2 + nburst(%c32_i64_cmpost_2, %c128_i64_cmpost_2, %c128_i64_cmpost_2) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 + + pto.set_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] + pto.wait_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] + + pto.vecscope { + %__cmpost_2:2 = scf.for %offset_cmpost_2 = %c0_cmpost_2 to %c1024_cmpost_2 step %c64_cmpost_2 + iter_args(%remaining_cmpost_2 = %c1024_i32_cmpost_2, %dst_cmpost_2 = %ub_out_cmpost_2) + -> (i32, !pto.ptr) { + %mask_cmpost_2, %next_remaining_cmpost_2 = pto.plt_b32 %remaining_cmpost_2 : i32 -> !pto.mask, i32 + %vec_cmpost_2 = pto.vlds %ub_in_cmpost_2[%offset_cmpost_2] : !pto.ptr -> !pto.vreg<64xf32> + %updated_cmpost_2 = pto.vsts %vec_cmpost_2, %dst_cmpost_2[%c64_cmpost_2], %mask_cmpost_2 {dist = "NORM_B32"} : !pto.vreg<64xf32>, !pto.ptr, !pto.mask -> !pto.ptr + scf.yield %next_remaining_cmpost_2, %updated_cmpost_2 : i32, !pto.ptr + } + } + + pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] + pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] + pto.mte_ub_gm %ub_out_cmpost_2, %output, %c128_i64_cmpost_2 + nburst(%c32_i64_cmpost_2, 
%c128_i64_cmpost_2, %c128_i64_cmpost_2) + : !pto.ptr, !pto.ptr, i64, i64, i64, i64 + pto.barrier #pto.pipe + } return } } diff --git a/test/vpto/cases/micro-op/vector-load-store/vlds-tail/compare.py b/test/vpto/cases/micro-op/vector-load-store/vlds-tail/compare.py deleted file mode 100755 index 1f3503d9b..000000000 --- a/test/vpto/cases/micro-op/vector-load-store/vlds-tail/compare.py +++ /dev/null @@ -1,210 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) 2026 Huawei Technologies Co., Ltd. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. - -# case: micro-op/vector-load-store/vlds-tail -# family: vector-load-store -# target_ops: pto.vlds -# scenarios: core-f32, contiguous, tail-mask, aligned, dist-norm -# NOTE: bulk-generated coverage skeleton. 
-# coding=utf-8 - -import os -import sys -import numpy as np - - -REPEAT_BYTES = 256 -PREFIX_ELEMS = 1000 - - -def _ceil_div(x, y): - return (x + y - 1) // y - - -def _packed_pred_storage_bytes(logical_elems, src_elem_bytes): - logical_elems = int(logical_elems) - src_elem_bytes = int(src_elem_bytes) - if logical_elems <= 0: - raise ValueError(f"logical_elems must be > 0, got {logical_elems}") - if src_elem_bytes not in (1, 2, 4): - raise ValueError(f"unsupported packed predicate source size: {src_elem_bytes}") - - repeat_elems = REPEAT_BYTES // src_elem_bytes - if src_elem_bytes == 4: - repeat_times = _ceil_div(logical_elems, repeat_elems) + 1 - loop_count = repeat_times // 2 - return loop_count * 16 - - repeat_times = _ceil_div(logical_elems, repeat_elems) - return repeat_times * (repeat_elems // 8) - - -def compare_bin(golden_path, output_path, dtype, eps): - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - dtype_np = np.dtype(dtype) - golden = np.fromfile(golden_path, dtype=dtype_np) - output = np.fromfile(output_path, dtype=dtype_np) - if golden.shape != output.shape: - print(f"[ERROR] Shape mismatch: {golden_path} {golden.shape} vs {output_path} {output.shape}") - return False - if not np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True): - if golden.size: - if np.issubdtype(dtype_np, np.floating): - g = golden.astype(np.float64, copy=False) - o = output.astype(np.float64, copy=False) - elif np.issubdtype(dtype_np, np.integer) or np.issubdtype(dtype_np, np.unsignedinteger): - g = golden.astype(np.int64, copy=False) - o = output.astype(np.int64, copy=False) - else: - g = golden.astype(np.float64, copy=False) - o = output.astype(np.float64, copy=False) - abs_diff = np.abs(g - o) - idx = int(np.argmax(abs_diff)) - diff = float(abs_diff[idx]) - print( - f"[ERROR] Mismatch: {golden_path} vs 
{output_path}, max diff={diff} at idx={idx} " - f"(golden={g[idx]}, out={o[idx]}, dtype={dtype_np})" - ) - else: - print(f"[ERROR] Mismatch: {golden_path} vs {output_path}, empty buffers, dtype={dtype_np}") - return False - return True - - -def compare_bin_prefix(golden_path, output_path, dtype, eps, count): - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - try: - count = int(count) - except Exception: - print(f"[ERROR] Invalid prefix count: {count}") - return False - if count <= 0: - print(f"[ERROR] Invalid prefix count: {count}") - return False - - dtype_np = np.dtype(dtype) - golden = np.fromfile(golden_path, dtype=dtype_np, count=count) - output = np.fromfile(output_path, dtype=dtype_np, count=count) - - if golden.size != count or output.size != count: - print( - f"[ERROR] Prefix read too small: need={count} elems, " - f"golden={golden.size}, out={output.size}" - ) - return False - - if not np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True): - if golden.size: - if np.issubdtype(dtype_np, np.floating): - g = golden.astype(np.float64, copy=False) - o = output.astype(np.float64, copy=False) - elif np.issubdtype(dtype_np, np.integer) or np.issubdtype(dtype_np, np.unsignedinteger): - g = golden.astype(np.int64, copy=False) - o = output.astype(np.int64, copy=False) - else: - g = golden.astype(np.float64, copy=False) - o = output.astype(np.float64, copy=False) - abs_diff = np.abs(g - o) - idx = int(np.argmax(abs_diff)) - diff = float(abs_diff[idx]) - print( - f"[ERROR] Mismatch (prefix): {golden_path} vs {output_path}, max diff={diff} at idx={idx} " - f"(golden={g[idx]}, out={o[idx]}, dtype={dtype_np}, count={count})" - ) - else: - print(f"[ERROR] Mismatch (prefix): {golden_path} vs {output_path}, empty buffers, dtype={dtype_np}") - return False - return True - - -def 
compare_packed_pred_mask(golden_path, output_path, logical_elems, src_elem_bytes): - """ - Compare outputs of pto.tcmp / pto.tcmps. - - PTO-ISA stores packed predicate results as a linear PK byte stream via - `psts`, with the exact written prefix length determined by the typed - TCMP/TCMPS repeat schedule. Compare only that semantic prefix. - """ - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - try: - logical_elems = int(logical_elems) - src_elem_bytes = int(src_elem_bytes) - except Exception: - print( - "[ERROR] Invalid packed mask compare arguments: " - f"logical_elems={logical_elems} src_elem_bytes={src_elem_bytes}" - ) - return False - if logical_elems <= 0 or src_elem_bytes <= 0: - print( - "[ERROR] Invalid packed mask compare arguments: " - f"logical_elems={logical_elems} src_elem_bytes={src_elem_bytes}" - ) - return False - - golden = np.fromfile(golden_path, dtype=np.uint8) - output = np.fromfile(output_path, dtype=np.uint8) - try: - prefix_bytes = _packed_pred_storage_bytes(logical_elems, src_elem_bytes) - except ValueError as exc: - print(f"[ERROR] {exc}") - return False - - if golden.size < prefix_bytes or output.size < prefix_bytes: - print( - f"[ERROR] Packed mask buffer too small: need={prefix_bytes} bytes, " - f"golden={golden.size}, out={output.size}" - ) - return False - - golden_sel = golden[:prefix_bytes] - output_sel = output[:prefix_bytes] - - if not np.array_equal(golden_sel, output_sel): - diff = np.nonzero(golden_sel != output_sel)[0] - idx = int(diff[0]) if diff.size else 0 - print( - f"[ERROR] Mismatch (packed mask): {golden_path} vs {output_path}, first diff at idx={idx} " - f"(golden={int(golden_sel[idx])}, out={int(output_sel[idx])}, " - f"logical_elems={logical_elems}, src_elem_bytes={src_elem_bytes}, prefix_bytes={prefix_bytes})" - ) - return False - return True - - -def main(): 
- strict = os.getenv("COMPARE_STRICT", "1") != "0" - ok = True - ok = compare_bin_prefix("golden_v2.bin", "v2.bin", np.float32, 0.0001, PREFIX_ELEMS) and ok - if not ok: - if strict: - print("[ERROR] compare failed") - sys.exit(2) - print("[WARN] compare failed (non-gating)") - return - print("[INFO] compare passed") - - -if __name__ == "__main__": - main() diff --git a/test/vpto/cases/micro-op/vector-load-store/vlds-tail/golden.py b/test/vpto/cases/micro-op/vector-load-store/vlds-tail/golden.py deleted file mode 100755 index 7eca6a3ad..000000000 --- a/test/vpto/cases/micro-op/vector-load-store/vlds-tail/golden.py +++ /dev/null @@ -1,53 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) 2026 Huawei Technologies Co., Ltd. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. - -# case: micro-op/vector-load-store/vlds-tail -# family: vector-load-store -# target_ops: pto.vlds -# scenarios: core-f32, contiguous, tail-mask, aligned, dist-norm -# NOTE: tail-mask case writes the first 1000 f32 lanes and leaves the -# remaining lanes zero. 
-# coding=utf-8 - -import argparse -from pathlib import Path - -import numpy as np - - -ELEMENTS = 1024 -LOGICAL_ELEMS = 1000 -SEED = 19 - - -def generate(output_dir: Path, seed: int) -> None: - rng = np.random.default_rng(seed) - v1 = rng.uniform(-8.0, 8.0, size=(ELEMENTS,)).astype(np.float32) - v2 = np.zeros((ELEMENTS,), dtype=np.float32) - golden_v2 = np.zeros((ELEMENTS,), dtype=np.float32) - golden_v2[:LOGICAL_ELEMS] = v1[:LOGICAL_ELEMS] - - output_dir.mkdir(parents=True, exist_ok=True) - v1.tofile(output_dir / "v1.bin") - v2.tofile(output_dir / "v2.bin") - golden_v2.tofile(output_dir / "golden_v2.bin") - - -def main() -> None: - parser = argparse.ArgumentParser( - description="Generate numpy-based inputs/golden for VPTO micro-op vlds tail validation." - ) - parser.add_argument("--output-dir", type=Path, default=Path(".")) - parser.add_argument("--seed", type=int, default=SEED) - args = parser.parse_args() - generate(args.output_dir, args.seed) - - -if __name__ == "__main__": - main() diff --git a/test/vpto/cases/micro-op/vector-load-store/vlds-tail/kernel.pto b/test/vpto/cases/micro-op/vector-load-store/vlds-tail/kernel.pto deleted file mode 100644 index 8627110bb..000000000 --- a/test/vpto/cases/micro-op/vector-load-store/vlds-tail/kernel.pto +++ /dev/null @@ -1,70 +0,0 @@ -// ----------------------------------------------------------------------------- -// case: micro-op/vector-load-store/vlds-tail -// family: vector-load-store -// target_ops: pto.vlds -// scenarios: core-f32, contiguous, tail-mask, aligned, dist-norm -// NOTE: bulk-generated coverage skeleton. Parser/verifier/lowering failure is -// still a valid test conclusion in the current coverage-first phase. 
-// ----------------------------------------------------------------------------- -// ============================================================================= -// abs_kernel_2d: Element-wise absolute value on a 32x32 f32 tile -// ============================================================================= -// This kernel computes abs(input) for a 32x32 float32 matrix (1024 elements). -// -// Memory Layout: -// - Input: arg0 -> GM (Global Memory) -// - Output: arg1 -> GM (Global Memory) -// - UB (Unified Buffer) at offset 0: input tile (4096 bytes = 32*32*4) -// - UB at offset 4096: output tile (4096 bytes = 32*32*4) -// -// Pipeline: -// 1. DMA: GM -> UB (MTE2 pipe) - copy input tile to UB -// 2. Sync: wait for MTE2 -> V pipe handoff -// 3. Compute: vabs on 64-element vectors (V pipe) - 16 iterations for 1024 elements -// 4. Sync: wait for V -> MTE3 pipe handoff -// 5. DMA: UB -> GM (MTE3 pipe) - copy result tile back to GM -// ============================================================================= - -module attributes {pto.target_arch = "a5", pto.kernel_kind = #pto.kernel_kind} { - func.func @vabs_kernel_2d(%arg0: !pto.ptr, %arg1: !pto.ptr) attributes {pto.kernel} { - %c0 = arith.constant 0 : index - %c1 = arith.constant 1 : index - %c64 = arith.constant 64 : index - %c1024 = arith.constant 1024 : index - %c0_i64 = arith.constant 0 : i64 - %c1_i64 = arith.constant 1 : i64 - %c32_i64 = arith.constant 32 : i64 - %c128_i64 = arith.constant 128 : i64 - %c4096_i64 = arith.constant 4096 : i64 - %c1000_i32 = arith.constant 1000 : i32 - %c1024_i32 = arith.constant 1024 : i32 - - %ub_in = pto.castptr %c0_i64 : i64 -> !pto.ptr - %ub_out = pto.castptr %c4096_i64 : i64 -> !pto.ptr - - %false = arith.constant false - pto.mte_gm_ub %arg0, %ub_in, %c0_i64, %c128_i64 - nburst(%c32_i64, %c128_i64, %c128_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 - - pto.set_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] - pto.wait_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] - - 
pto.vecscope { - %_:1 = scf.for %offset = %c0 to %c1024 step %c64 iter_args(%remaining = %c1000_i32) -> (i32) { - %mask, %next_remaining = pto.plt_b32 %remaining : i32 -> !pto.mask, i32 - %out = pto.vlds %ub_in[%offset] {dist = "NORM"} : !pto.ptr -> !pto.vreg<64xf32> - pto.vsts %out, %ub_out[%offset], %mask : !pto.vreg<64xf32>, !pto.ptr, !pto.mask - scf.yield %next_remaining : i32 - } - } - - pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.mte_ub_gm %ub_out, %arg1, %c128_i64 - nburst(%c32_i64, %c128_i64, %c128_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64 - pto.barrier #pto.pipe - return - } -} diff --git a/test/vpto/cases/micro-op/vector-load-store/vlds-tail/launch.cpp b/test/vpto/cases/micro-op/vector-load-store/vlds-tail/launch.cpp deleted file mode 100644 index dfbca2f61..000000000 --- a/test/vpto/cases/micro-op/vector-load-store/vlds-tail/launch.cpp +++ /dev/null @@ -1,70 +0,0 @@ -// Copyright (c) 2026 Huawei Technologies Co., Ltd. -// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). -// Please refer to the License for details. You may not use this file except in compliance with the License. -// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -// See LICENSE in the root of the software repository for the full text of the License. - -// ----------------------------------------------------------------------------- -// case: micro-op/vector-load-store/vlds-tail -// family: vector-load-store -// target_ops: pto.vlds -// scenarios: core-f32, contiguous, tail-mask, aligned, dist-norm -// NOTE: bulk-generated coverage skeleton. 
Parser/verifier/lowering failure is -// still a valid test conclusion in the current coverage-first phase. -// ----------------------------------------------------------------------------- -// --------------------------------------------------------------------------- -// PTOAS compatibility layer -// -// The upstream pto-isa headers reference some FP8/FP4 types and the -// __VEC_SCOPE__ marker that are not available on every AICore arch/toolchain -// combination (e.g. __NPU_ARCH__==2201). -// -// For our PTOAS-generated kernels we don't rely on these types today, but the -// headers still mention them in templates/static_asserts. Provide minimal -// fallbacks to keep compilation working on dav-c220. -// --------------------------------------------------------------------------- -#ifndef __VEC_SCOPE__ -#define __VEC_SCOPE__ -#endif - -#if defined(__CCE_AICORE__) && defined(__NPU_ARCH__) && (__NPU_ARCH__ == 2201) -typedef struct { unsigned char v; } hifloat8_t; -typedef struct { unsigned char v; } float8_e4m3_t; -typedef struct { unsigned char v; } float8_e5m2_t; -typedef struct { unsigned char v; } float8_e8m0_t; -typedef struct { unsigned char v; } float4_e1m2x2_t; -typedef struct { unsigned char v; } float4_e2m1x2_t; -#endif -#include - -// AICore printf support is gated behind `--cce-enable-print` on some -// toolchains. When enabled, include the CCE print header so `cce::printf` -// resolves in device compilation. -#if defined(__CCE_AICORE__) && defined(PTOAS_ENABLE_CCE_PRINT) -#include -#endif - -// Some PTO-ISA types are only available in the __CCE_AICORE__ compilation -// path, but `bisheng -xcce` still performs a host-side parse pass. -// Provide minimal fallbacks only when the corresponding header wasn't -// pulled in by the selected arch implementation. 
-#if !defined(__CCE_AICORE__) && !defined(TMRGSORT_HPP) -struct MrgSortExecutedNumList { - uint16_t mrgSortList0; - uint16_t mrgSortList1; - uint16_t mrgSortList2; - uint16_t mrgSortList3; -}; -#endif -#ifndef __CPU_SIM -#include "acl/acl.h" -#endif - -extern "C" __global__ [aicore] void vabs_kernel_2d(__gm__ float *v1, - __gm__ float *v2); - -void LaunchVabs_kernel_2d(float *v1, float *v2, void *stream) { - vabs_kernel_2d<<<1, nullptr, stream>>>((__gm__ float *)v1, (__gm__ float *)v2); -} diff --git a/test/vpto/cases/micro-op/vector-load-store/vlds-tail/main.cpp b/test/vpto/cases/micro-op/vector-load-store/vlds-tail/main.cpp deleted file mode 100644 index a9f049135..000000000 --- a/test/vpto/cases/micro-op/vector-load-store/vlds-tail/main.cpp +++ /dev/null @@ -1,130 +0,0 @@ -// Copyright (c) 2026 Huawei Technologies Co., Ltd. -// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). -// Please refer to the License for details. You may not use this file except in compliance with the License. -// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -// See LICENSE in the root of the software repository for the full text of the License. - -// ----------------------------------------------------------------------------- -// case: micro-op/vector-load-store/vlds-tail -// family: vector-load-store -// target_ops: pto.vlds -// scenarios: core-f32, contiguous, tail-mask, aligned, dist-norm -// NOTE: bulk-generated coverage skeleton. Parser/verifier/lowering failure is -// still a valid test conclusion in the current coverage-first phase. -// ----------------------------------------------------------------------------- -/** -Copyright (c) 2025 Huawei Technologies Co., Ltd. 
-This program is free software, you can redistribute it and/or modify it under the terms and conditions of -CANN Open Software License Agreement Version 2.0 (the "License"). -Please refer to the License for details. You may not use this file except in compliance with the License. -THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -See LICENSE in the root of the software repository for the full text of the License. -*/ - -#include "test_common.h" -#include "acl/acl.h" -#include -#include -#include - -using namespace PtoTestCommon; - -#ifndef TMRGSORT_HPP -struct MrgSortExecutedNumList { - uint16_t mrgSortList0; - uint16_t mrgSortList1; - uint16_t mrgSortList2; - uint16_t mrgSortList3; -}; -#endif - -#define ACL_CHECK(expr) \ - do { \ - const aclError _ret = (expr); \ - if (_ret != ACL_SUCCESS) { \ - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", #expr, (int)_ret, __FILE__, __LINE__); \ - const char *_recent = aclGetRecentErrMsg(); \ - if (_recent != nullptr && _recent[0] != '\0') { \ - std::fprintf(stderr, "[ERROR] RecentErrMsg: %s\n", _recent); \ - } \ - rc = 1; \ - goto cleanup; \ - } \ - } while (0) - -void LaunchVabs_kernel_2d(float *v1, float *v2, void *stream); - -int main() { - size_t elemCount_v1 = 1024; - size_t fileSize_v1 = elemCount_v1 * sizeof(float); - size_t elemCount_v2 = 1024; - size_t fileSize_v2 = elemCount_v2 * sizeof(float); - float *v1Host = nullptr; - float *v1Device = nullptr; - float *v2Host = nullptr; - float *v2Device = nullptr; - - int rc = 0; - bool aclInited = false; - bool deviceSet = false; - int deviceId = 0; - aclrtStream stream = nullptr; - - ACL_CHECK(aclInit(nullptr)); - aclInited = true; - if (const char *envDevice = std::getenv("ACL_DEVICE_ID")) { - deviceId = std::atoi(envDevice); - } - ACL_CHECK(aclrtSetDevice(deviceId)); - deviceSet = true; - 
ACL_CHECK(aclrtCreateStream(&stream)); - - ACL_CHECK(aclrtMallocHost((void **)(&v1Host), fileSize_v1)); - ACL_CHECK(aclrtMallocHost((void **)(&v2Host), fileSize_v2)); - ACL_CHECK(aclrtMalloc((void **)&v1Device, fileSize_v1, ACL_MEM_MALLOC_HUGE_FIRST)); - ACL_CHECK(aclrtMalloc((void **)&v2Device, fileSize_v2, ACL_MEM_MALLOC_HUGE_FIRST)); - - ReadFile("./v1.bin", fileSize_v1, v1Host, fileSize_v1); - ReadFile("./v2.bin", fileSize_v2, v2Host, fileSize_v2); - ACL_CHECK(aclrtMemcpy(v1Device, fileSize_v1, v1Host, fileSize_v1, ACL_MEMCPY_HOST_TO_DEVICE)); - ACL_CHECK(aclrtMemcpy(v2Device, fileSize_v2, v2Host, fileSize_v2, ACL_MEMCPY_HOST_TO_DEVICE)); - LaunchVabs_kernel_2d(v1Device, v2Device, stream); - - ACL_CHECK(aclrtSynchronizeStream(stream)); - ACL_CHECK(aclrtMemcpy(v2Host, fileSize_v2, v2Device, fileSize_v2, ACL_MEMCPY_DEVICE_TO_HOST)); - - WriteFile("./v2.bin", v2Host, fileSize_v2); - -cleanup: - aclrtFree(v1Device); - aclrtFree(v2Device); - aclrtFreeHost(v1Host); - aclrtFreeHost(v2Host); - if (stream != nullptr) { - const aclError _ret = aclrtDestroyStream(stream); - if (_ret != ACL_SUCCESS) { - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", - "aclrtDestroyStream(stream)", (int)_ret, __FILE__, __LINE__); - } - stream = nullptr; - } - if (deviceSet) { - const aclError _ret = aclrtResetDevice(deviceId); - if (_ret != ACL_SUCCESS) { - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", - "aclrtResetDevice(deviceId)", (int)_ret, __FILE__, __LINE__); - } - } - if (aclInited) { - const aclError _ret = aclFinalize(); - if (_ret != ACL_SUCCESS) { - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", - "aclFinalize()", (int)_ret, __FILE__, __LINE__); - } - } - - return rc; -} diff --git a/test/vpto/cases/micro-op/vector-load-store/vlds-us-b16/compare.py b/test/vpto/cases/micro-op/vector-load-store/vlds-us-b16/compare.py deleted file mode 100755 index 5ccc50a39..000000000 --- a/test/vpto/cases/micro-op/vector-load-store/vlds-us-b16/compare.py +++ /dev/null 
@@ -1,210 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) 2026 Huawei Technologies Co., Ltd. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. - -# case: micro-op/vector-load-store/vlds-us-b16 -# family: vector-load-store -# target_ops: pto.vlds -# scenarios: core-i16, full-mask, aligned, dist-us-b16 -# NOTE: bulk-generated coverage skeleton. -# coding=utf-8 - -import os -import sys -import numpy as np - - -REPEAT_BYTES = 256 -PREFIX_ELEMS = 1024 - - -def _ceil_div(x, y): - return (x + y - 1) // y - - -def _packed_pred_storage_bytes(logical_elems, src_elem_bytes): - logical_elems = int(logical_elems) - src_elem_bytes = int(src_elem_bytes) - if logical_elems <= 0: - raise ValueError(f"logical_elems must be > 0, got {logical_elems}") - if src_elem_bytes not in (1, 2, 4): - raise ValueError(f"unsupported packed predicate source size: {src_elem_bytes}") - - repeat_elems = REPEAT_BYTES // src_elem_bytes - if src_elem_bytes == 4: - repeat_times = _ceil_div(logical_elems, repeat_elems) + 1 - loop_count = repeat_times // 2 - return loop_count * 16 - - repeat_times = _ceil_div(logical_elems, repeat_elems) - return repeat_times * (repeat_elems // 8) - - -def compare_bin(golden_path, output_path, dtype, eps): - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - dtype_np = np.dtype(dtype) - 
golden = np.fromfile(golden_path, dtype=dtype_np) - output = np.fromfile(output_path, dtype=dtype_np) - if golden.shape != output.shape: - print(f"[ERROR] Shape mismatch: {golden_path} {golden.shape} vs {output_path} {output.shape}") - return False - if not np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True): - if golden.size: - if np.issubdtype(dtype_np, np.floating): - g = golden.astype(np.float64, copy=False) - o = output.astype(np.float64, copy=False) - elif np.issubdtype(dtype_np, np.integer) or np.issubdtype(dtype_np, np.unsignedinteger): - g = golden.astype(np.int64, copy=False) - o = output.astype(np.int64, copy=False) - else: - g = golden.astype(np.float64, copy=False) - o = output.astype(np.float64, copy=False) - abs_diff = np.abs(g - o) - idx = int(np.argmax(abs_diff)) - diff = float(abs_diff[idx]) - print( - f"[ERROR] Mismatch: {golden_path} vs {output_path}, max diff={diff} at idx={idx} " - f"(golden={g[idx]}, out={o[idx]}, dtype={dtype_np})" - ) - else: - print(f"[ERROR] Mismatch: {golden_path} vs {output_path}, empty buffers, dtype={dtype_np}") - return False - return True - - -def compare_bin_prefix(golden_path, output_path, dtype, eps, count): - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - try: - count = int(count) - except Exception: - print(f"[ERROR] Invalid prefix count: {count}") - return False - if count <= 0: - print(f"[ERROR] Invalid prefix count: {count}") - return False - - dtype_np = np.dtype(dtype) - golden = np.fromfile(golden_path, dtype=dtype_np, count=count) - output = np.fromfile(output_path, dtype=dtype_np, count=count) - - if golden.size != count or output.size != count: - print( - f"[ERROR] Prefix read too small: need={count} elems, " - f"golden={golden.size}, out={output.size}" - ) - return False - - if not np.allclose(golden, output, atol=eps, rtol=eps, 
equal_nan=True): - if golden.size: - if np.issubdtype(dtype_np, np.floating): - g = golden.astype(np.float64, copy=False) - o = output.astype(np.float64, copy=False) - elif np.issubdtype(dtype_np, np.integer) or np.issubdtype(dtype_np, np.unsignedinteger): - g = golden.astype(np.int64, copy=False) - o = output.astype(np.int64, copy=False) - else: - g = golden.astype(np.float64, copy=False) - o = output.astype(np.float64, copy=False) - abs_diff = np.abs(g - o) - idx = int(np.argmax(abs_diff)) - diff = float(abs_diff[idx]) - print( - f"[ERROR] Mismatch (prefix): {golden_path} vs {output_path}, max diff={diff} at idx={idx} " - f"(golden={g[idx]}, out={o[idx]}, dtype={dtype_np}, count={count})" - ) - else: - print(f"[ERROR] Mismatch (prefix): {golden_path} vs {output_path}, empty buffers, dtype={dtype_np}") - return False - return True - - -def compare_packed_pred_mask(golden_path, output_path, logical_elems, src_elem_bytes): - """ - Compare outputs of pto.tcmp / pto.tcmps. - - PTO-ISA stores packed predicate results as a linear PK byte stream via - `psts`, with the exact written prefix length determined by the typed - TCMP/TCMPS repeat schedule. Compare only that semantic prefix. 
- """ - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - try: - logical_elems = int(logical_elems) - src_elem_bytes = int(src_elem_bytes) - except Exception: - print( - "[ERROR] Invalid packed mask compare arguments: " - f"logical_elems={logical_elems} src_elem_bytes={src_elem_bytes}" - ) - return False - if logical_elems <= 0 or src_elem_bytes <= 0: - print( - "[ERROR] Invalid packed mask compare arguments: " - f"logical_elems={logical_elems} src_elem_bytes={src_elem_bytes}" - ) - return False - - golden = np.fromfile(golden_path, dtype=np.uint8) - output = np.fromfile(output_path, dtype=np.uint8) - try: - prefix_bytes = _packed_pred_storage_bytes(logical_elems, src_elem_bytes) - except ValueError as exc: - print(f"[ERROR] {exc}") - return False - - if golden.size < prefix_bytes or output.size < prefix_bytes: - print( - f"[ERROR] Packed mask buffer too small: need={prefix_bytes} bytes, " - f"golden={golden.size}, out={output.size}" - ) - return False - - golden_sel = golden[:prefix_bytes] - output_sel = output[:prefix_bytes] - - if not np.array_equal(golden_sel, output_sel): - diff = np.nonzero(golden_sel != output_sel)[0] - idx = int(diff[0]) if diff.size else 0 - print( - f"[ERROR] Mismatch (packed mask): {golden_path} vs {output_path}, first diff at idx={idx} " - f"(golden={int(golden_sel[idx])}, out={int(output_sel[idx])}, " - f"logical_elems={logical_elems}, src_elem_bytes={src_elem_bytes}, prefix_bytes={prefix_bytes})" - ) - return False - return True - - -def main(): - strict = os.getenv("COMPARE_STRICT", "1") != "0" - ok = True - ok = compare_bin_prefix("golden_v2.bin", "v2.bin", np.int16, 0.0, PREFIX_ELEMS) and ok - if not ok: - if strict: - print("[ERROR] compare failed") - sys.exit(2) - print("[WARN] compare failed (non-gating)") - return - print("[INFO] compare passed") - - -if __name__ == 
"__main__": - main() diff --git a/test/vpto/cases/micro-op/vector-load-store/vlds-us-b16/golden.py b/test/vpto/cases/micro-op/vector-load-store/vlds-us-b16/golden.py deleted file mode 100755 index 214b0269c..000000000 --- a/test/vpto/cases/micro-op/vector-load-store/vlds-us-b16/golden.py +++ /dev/null @@ -1,55 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) 2026 Huawei Technologies Co., Ltd. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. - -# case: micro-op/vector-load-store/vlds-us-b16 -# family: vector-load-store -# target_ops: pto.vlds -# scenarios: core-i16, full-mask, aligned, dist-us-b16 -# NOTE: US on b16 duplicates each source i16 element into two consecutive lanes. 
-# coding=utf-8 - -import argparse -from pathlib import Path - -import numpy as np - - -ELEMENTS = 2048 -ACTIVE_ELEMS = 1024 -LANES = 128 -SEED = 19 - - -def generate(output_dir: Path, seed: int) -> None: - rng = np.random.default_rng(seed) - v1 = rng.integers(-(2**15), 2**15, size=(ELEMENTS,), dtype=np.int16) - v2 = np.zeros((ELEMENTS,), dtype=np.int16) - golden_v2 = np.zeros((ELEMENTS,), dtype=np.int16) - half_lanes = LANES // 2 - for offset in range(0, ACTIVE_ELEMS, LANES): - golden_v2[offset : offset + LANES] = np.repeat(v1[offset : offset + half_lanes], 2) - - output_dir.mkdir(parents=True, exist_ok=True) - v1.tofile(output_dir / "v1.bin") - v2.tofile(output_dir / "v2.bin") - golden_v2.tofile(output_dir / "golden_v2.bin") - - -def main() -> None: - parser = argparse.ArgumentParser( - description="Generate numpy-based inputs/golden for VPTO micro-op vlds b16 upsample validation." - ) - parser.add_argument("--output-dir", type=Path, default=Path(".")) - parser.add_argument("--seed", type=int, default=SEED) - args = parser.parse_args() - generate(args.output_dir, args.seed) - - -if __name__ == "__main__": - main() diff --git a/test/vpto/cases/micro-op/vector-load-store/vlds-us-b16/kernel.pto b/test/vpto/cases/micro-op/vector-load-store/vlds-us-b16/kernel.pto deleted file mode 100644 index df43097fb..000000000 --- a/test/vpto/cases/micro-op/vector-load-store/vlds-us-b16/kernel.pto +++ /dev/null @@ -1,50 +0,0 @@ -// ----------------------------------------------------------------------------- -// case: micro-op/vector-load-store/vlds-us-b16 -// family: vector-load-store -// target_ops: pto.vlds -// scenarios: core-i16, full-mask, aligned, dist-us-b16 -// ----------------------------------------------------------------------------- -// Validate one representative `US_B16` load on `b16`. 
- -module attributes {pto.target_arch = "a5", pto.kernel_kind = #pto.kernel_kind} { - func.func @vabs_kernel_2d(%arg0: !pto.ptr, %arg1: !pto.ptr) attributes {pto.kernel} { - %c0 = arith.constant 0 : index - %c1 = arith.constant 1 : index - %c128 = arith.constant 128 : index - %c1024 = arith.constant 1024 : index - %c0_i64 = arith.constant 0 : i64 - %c1_i64 = arith.constant 1 : i64 - %c32_i64 = arith.constant 32 : i64 - %c128_i64 = arith.constant 128 : i64 - %c4096_i64 = arith.constant 4096 : i64 - %c1024_i32 = arith.constant 1024 : i32 - - %ub_in = pto.castptr %c0_i64 : i64 -> !pto.ptr - %ub_out = pto.castptr %c4096_i64 : i64 -> !pto.ptr - - %false = arith.constant false - pto.mte_gm_ub %arg0, %ub_in, %c0_i64, %c128_i64 - nburst(%c32_i64, %c128_i64, %c128_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 - - pto.set_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] - pto.wait_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] - - pto.vecscope { - %_:1 = scf.for %offset = %c0 to %c1024 step %c128 iter_args(%remaining = %c1024_i32) -> (i32) { - %mask, %next_remaining = pto.plt_b16 %remaining : i32 -> !pto.mask, i32 - %out = pto.vlds %ub_in[%offset] {dist = "US_B16"} : !pto.ptr -> !pto.vreg<128xi16> - pto.vsts %out, %ub_out[%offset], %mask : !pto.vreg<128xi16>, !pto.ptr, !pto.mask - scf.yield %next_remaining : i32 - } - } - - pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.mte_ub_gm %ub_out, %arg1, %c128_i64 - nburst(%c32_i64, %c128_i64, %c128_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64 - pto.barrier #pto.pipe - return - } -} diff --git a/test/vpto/cases/micro-op/vector-load-store/vlds-us-b16/launch.cpp b/test/vpto/cases/micro-op/vector-load-store/vlds-us-b16/launch.cpp deleted file mode 100644 index 4ecd1586e..000000000 --- a/test/vpto/cases/micro-op/vector-load-store/vlds-us-b16/launch.cpp +++ /dev/null @@ -1,70 +0,0 @@ -// Copyright (c) 2026 Huawei Technologies Co., Ltd. 
-// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). -// Please refer to the License for details. You may not use this file except in compliance with the License. -// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -// See LICENSE in the root of the software repository for the full text of the License. - -// ----------------------------------------------------------------------------- -// case: micro-op/vector-load-store/vlds-us-b16 -// family: vector-load-store -// target_ops: pto.vlds -// scenarios: core-i16, full-mask, aligned, dist-us-b16 -// NOTE: bulk-generated coverage skeleton. Parser/verifier/lowering failure is -// still a valid test conclusion in the current coverage-first phase. -// ----------------------------------------------------------------------------- -// --------------------------------------------------------------------------- -// PTOAS compatibility layer -// -// The upstream pto-isa headers reference some FP8/FP4 types and the -// __VEC_SCOPE__ marker that are not available on every AICore arch/toolchain -// combination (e.g. __NPU_ARCH__==2201). -// -// For our PTOAS-generated kernels we don't rely on these types today, but the -// headers still mention them in templates/static_asserts. Provide minimal -// fallbacks to keep compilation working on dav-c220. 
-// --------------------------------------------------------------------------- -#ifndef __VEC_SCOPE__ -#define __VEC_SCOPE__ -#endif - -#if defined(__CCE_AICORE__) && defined(__NPU_ARCH__) && (__NPU_ARCH__ == 2201) -typedef struct { unsigned char v; } hifloat8_t; -typedef struct { unsigned char v; } float8_e4m3_t; -typedef struct { unsigned char v; } float8_e5m2_t; -typedef struct { unsigned char v; } float8_e8m0_t; -typedef struct { unsigned char v; } float4_e1m2x2_t; -typedef struct { unsigned char v; } float4_e2m1x2_t; -#endif -#include - -// AICore printf support is gated behind `--cce-enable-print` on some -// toolchains. When enabled, include the CCE print header so `cce::printf` -// resolves in device compilation. -#if defined(__CCE_AICORE__) && defined(PTOAS_ENABLE_CCE_PRINT) -#include -#endif - -// Some PTO-ISA types are only available in the __CCE_AICORE__ compilation -// path, but `bisheng -xcce` still performs a host-side parse pass. -// Provide minimal fallbacks only when the corresponding header wasn't -// pulled in by the selected arch implementation. -#if !defined(__CCE_AICORE__) && !defined(TMRGSORT_HPP) -struct MrgSortExecutedNumList { - uint16_t mrgSortList0; - uint16_t mrgSortList1; - uint16_t mrgSortList2; - uint16_t mrgSortList3; -}; -#endif -#ifndef __CPU_SIM -#include "acl/acl.h" -#endif - -extern "C" __global__ [aicore] void vabs_kernel_2d(__gm__ float *v1, - __gm__ float *v2); - -void LaunchVabs_kernel_2d(float *v1, float *v2, void *stream) { - vabs_kernel_2d<<<1, nullptr, stream>>>((__gm__ float *)v1, (__gm__ float *)v2); -} diff --git a/test/vpto/cases/micro-op/vector-load-store/vlds-us-b16/main.cpp b/test/vpto/cases/micro-op/vector-load-store/vlds-us-b16/main.cpp deleted file mode 100644 index 2c4c9b679..000000000 --- a/test/vpto/cases/micro-op/vector-load-store/vlds-us-b16/main.cpp +++ /dev/null @@ -1,130 +0,0 @@ -// Copyright (c) 2026 Huawei Technologies Co., Ltd. 
-// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). -// Please refer to the License for details. You may not use this file except in compliance with the License. -// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -// See LICENSE in the root of the software repository for the full text of the License. - -// ----------------------------------------------------------------------------- -// case: micro-op/vector-load-store/vlds-us-b16 -// family: vector-load-store -// target_ops: pto.vlds -// scenarios: core-i16, full-mask, aligned, dist-us-b16 -// NOTE: bulk-generated coverage skeleton. Parser/verifier/lowering failure is -// still a valid test conclusion in the current coverage-first phase. -// ----------------------------------------------------------------------------- -/** -Copyright (c) 2025 Huawei Technologies Co., Ltd. -This program is free software, you can redistribute it and/or modify it under the terms and conditions of -CANN Open Software License Agreement Version 2.0 (the "License"). -Please refer to the License for details. You may not use this file except in compliance with the License. -THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -See LICENSE in the root of the software repository for the full text of the License. 
-*/ - -#include "test_common.h" -#include "acl/acl.h" -#include -#include -#include - -using namespace PtoTestCommon; - -#ifndef TMRGSORT_HPP -struct MrgSortExecutedNumList { - uint16_t mrgSortList0; - uint16_t mrgSortList1; - uint16_t mrgSortList2; - uint16_t mrgSortList3; -}; -#endif - -#define ACL_CHECK(expr) \ - do { \ - const aclError _ret = (expr); \ - if (_ret != ACL_SUCCESS) { \ - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", #expr, (int)_ret, __FILE__, __LINE__); \ - const char *_recent = aclGetRecentErrMsg(); \ - if (_recent != nullptr && _recent[0] != '\0') { \ - std::fprintf(stderr, "[ERROR] RecentErrMsg: %s\n", _recent); \ - } \ - rc = 1; \ - goto cleanup; \ - } \ - } while (0) - -void LaunchVabs_kernel_2d(float *v1, float *v2, void *stream); - -int main() { - size_t elemCount_v1 = 1024; - size_t fileSize_v1 = elemCount_v1 * sizeof(float); - size_t elemCount_v2 = 1024; - size_t fileSize_v2 = elemCount_v2 * sizeof(float); - float *v1Host = nullptr; - float *v1Device = nullptr; - float *v2Host = nullptr; - float *v2Device = nullptr; - - int rc = 0; - bool aclInited = false; - bool deviceSet = false; - int deviceId = 0; - aclrtStream stream = nullptr; - - ACL_CHECK(aclInit(nullptr)); - aclInited = true; - if (const char *envDevice = std::getenv("ACL_DEVICE_ID")) { - deviceId = std::atoi(envDevice); - } - ACL_CHECK(aclrtSetDevice(deviceId)); - deviceSet = true; - ACL_CHECK(aclrtCreateStream(&stream)); - - ACL_CHECK(aclrtMallocHost((void **)(&v1Host), fileSize_v1)); - ACL_CHECK(aclrtMallocHost((void **)(&v2Host), fileSize_v2)); - ACL_CHECK(aclrtMalloc((void **)&v1Device, fileSize_v1, ACL_MEM_MALLOC_HUGE_FIRST)); - ACL_CHECK(aclrtMalloc((void **)&v2Device, fileSize_v2, ACL_MEM_MALLOC_HUGE_FIRST)); - - ReadFile("./v1.bin", fileSize_v1, v1Host, fileSize_v1); - ReadFile("./v2.bin", fileSize_v2, v2Host, fileSize_v2); - ACL_CHECK(aclrtMemcpy(v1Device, fileSize_v1, v1Host, fileSize_v1, ACL_MEMCPY_HOST_TO_DEVICE)); - ACL_CHECK(aclrtMemcpy(v2Device, 
fileSize_v2, v2Host, fileSize_v2, ACL_MEMCPY_HOST_TO_DEVICE)); - LaunchVabs_kernel_2d(v1Device, v2Device, stream); - - ACL_CHECK(aclrtSynchronizeStream(stream)); - ACL_CHECK(aclrtMemcpy(v2Host, fileSize_v2, v2Device, fileSize_v2, ACL_MEMCPY_DEVICE_TO_HOST)); - - WriteFile("./v2.bin", v2Host, fileSize_v2); - -cleanup: - aclrtFree(v1Device); - aclrtFree(v2Device); - aclrtFreeHost(v1Host); - aclrtFreeHost(v2Host); - if (stream != nullptr) { - const aclError _ret = aclrtDestroyStream(stream); - if (_ret != ACL_SUCCESS) { - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", - "aclrtDestroyStream(stream)", (int)_ret, __FILE__, __LINE__); - } - stream = nullptr; - } - if (deviceSet) { - const aclError _ret = aclrtResetDevice(deviceId); - if (_ret != ACL_SUCCESS) { - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", - "aclrtResetDevice(deviceId)", (int)_ret, __FILE__, __LINE__); - } - } - if (aclInited) { - const aclError _ret = aclFinalize(); - if (_ret != ACL_SUCCESS) { - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", - "aclFinalize()", (int)_ret, __FILE__, __LINE__); - } - } - - return rc; -} diff --git a/test/vpto/cases/micro-op/vector-load-store/vlds/compare.py b/test/vpto/cases/micro-op/vector-load-store/vlds/compare.py deleted file mode 100755 index 1c07e2d7c..000000000 --- a/test/vpto/cases/micro-op/vector-load-store/vlds/compare.py +++ /dev/null @@ -1,209 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) 2026 Huawei Technologies Co., Ltd. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. 
-# See LICENSE in the root of the software repository for the full text of the License. - -# case: micro-op/vector-load-store/vlds -# family: vector-load-store -# target_ops: pto.vlds -# scenarios: core-f32, contiguous, full-mask, aligned, dist-norm -# NOTE: bulk-generated coverage skeleton. -# coding=utf-8 - -import os -import sys -import numpy as np - - -REPEAT_BYTES = 256 - - -def _ceil_div(x, y): - return (x + y - 1) // y - - -def _packed_pred_storage_bytes(logical_elems, src_elem_bytes): - logical_elems = int(logical_elems) - src_elem_bytes = int(src_elem_bytes) - if logical_elems <= 0: - raise ValueError(f"logical_elems must be > 0, got {logical_elems}") - if src_elem_bytes not in (1, 2, 4): - raise ValueError(f"unsupported packed predicate source size: {src_elem_bytes}") - - repeat_elems = REPEAT_BYTES // src_elem_bytes - if src_elem_bytes == 4: - repeat_times = _ceil_div(logical_elems, repeat_elems) + 1 - loop_count = repeat_times // 2 - return loop_count * 16 - - repeat_times = _ceil_div(logical_elems, repeat_elems) - return repeat_times * (repeat_elems // 8) - - -def compare_bin(golden_path, output_path, dtype, eps): - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - dtype_np = np.dtype(dtype) - golden = np.fromfile(golden_path, dtype=dtype_np) - output = np.fromfile(output_path, dtype=dtype_np) - if golden.shape != output.shape: - print(f"[ERROR] Shape mismatch: {golden_path} {golden.shape} vs {output_path} {output.shape}") - return False - if not np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True): - if golden.size: - if np.issubdtype(dtype_np, np.floating): - g = golden.astype(np.float64, copy=False) - o = output.astype(np.float64, copy=False) - elif np.issubdtype(dtype_np, np.integer) or np.issubdtype(dtype_np, np.unsignedinteger): - g = golden.astype(np.int64, copy=False) - o = 
output.astype(np.int64, copy=False) - else: - g = golden.astype(np.float64, copy=False) - o = output.astype(np.float64, copy=False) - abs_diff = np.abs(g - o) - idx = int(np.argmax(abs_diff)) - diff = float(abs_diff[idx]) - print( - f"[ERROR] Mismatch: {golden_path} vs {output_path}, max diff={diff} at idx={idx} " - f"(golden={g[idx]}, out={o[idx]}, dtype={dtype_np})" - ) - else: - print(f"[ERROR] Mismatch: {golden_path} vs {output_path}, empty buffers, dtype={dtype_np}") - return False - return True - - -def compare_bin_prefix(golden_path, output_path, dtype, eps, count): - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - try: - count = int(count) - except Exception: - print(f"[ERROR] Invalid prefix count: {count}") - return False - if count <= 0: - print(f"[ERROR] Invalid prefix count: {count}") - return False - - dtype_np = np.dtype(dtype) - golden = np.fromfile(golden_path, dtype=dtype_np, count=count) - output = np.fromfile(output_path, dtype=dtype_np, count=count) - - if golden.size != count or output.size != count: - print( - f"[ERROR] Prefix read too small: need={count} elems, " - f"golden={golden.size}, out={output.size}" - ) - return False - - if not np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True): - if golden.size: - if np.issubdtype(dtype_np, np.floating): - g = golden.astype(np.float64, copy=False) - o = output.astype(np.float64, copy=False) - elif np.issubdtype(dtype_np, np.integer) or np.issubdtype(dtype_np, np.unsignedinteger): - g = golden.astype(np.int64, copy=False) - o = output.astype(np.int64, copy=False) - else: - g = golden.astype(np.float64, copy=False) - o = output.astype(np.float64, copy=False) - abs_diff = np.abs(g - o) - idx = int(np.argmax(abs_diff)) - diff = float(abs_diff[idx]) - print( - f"[ERROR] Mismatch (prefix): {golden_path} vs {output_path}, max diff={diff} 
at idx={idx} " - f"(golden={g[idx]}, out={o[idx]}, dtype={dtype_np}, count={count})" - ) - else: - print(f"[ERROR] Mismatch (prefix): {golden_path} vs {output_path}, empty buffers, dtype={dtype_np}") - return False - return True - - -def compare_packed_pred_mask(golden_path, output_path, logical_elems, src_elem_bytes): - """ - Compare outputs of pto.tcmp / pto.tcmps. - - PTO-ISA stores packed predicate results as a linear PK byte stream via - `psts`, with the exact written prefix length determined by the typed - TCMP/TCMPS repeat schedule. Compare only that semantic prefix. - """ - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - try: - logical_elems = int(logical_elems) - src_elem_bytes = int(src_elem_bytes) - except Exception: - print( - "[ERROR] Invalid packed mask compare arguments: " - f"logical_elems={logical_elems} src_elem_bytes={src_elem_bytes}" - ) - return False - if logical_elems <= 0 or src_elem_bytes <= 0: - print( - "[ERROR] Invalid packed mask compare arguments: " - f"logical_elems={logical_elems} src_elem_bytes={src_elem_bytes}" - ) - return False - - golden = np.fromfile(golden_path, dtype=np.uint8) - output = np.fromfile(output_path, dtype=np.uint8) - try: - prefix_bytes = _packed_pred_storage_bytes(logical_elems, src_elem_bytes) - except ValueError as exc: - print(f"[ERROR] {exc}") - return False - - if golden.size < prefix_bytes or output.size < prefix_bytes: - print( - f"[ERROR] Packed mask buffer too small: need={prefix_bytes} bytes, " - f"golden={golden.size}, out={output.size}" - ) - return False - - golden_sel = golden[:prefix_bytes] - output_sel = output[:prefix_bytes] - - if not np.array_equal(golden_sel, output_sel): - diff = np.nonzero(golden_sel != output_sel)[0] - idx = int(diff[0]) if diff.size else 0 - print( - f"[ERROR] Mismatch (packed mask): {golden_path} vs {output_path}, 
first diff at idx={idx} " - f"(golden={int(golden_sel[idx])}, out={int(output_sel[idx])}, " - f"logical_elems={logical_elems}, src_elem_bytes={src_elem_bytes}, prefix_bytes={prefix_bytes})" - ) - return False - return True - - -def main(): - strict = os.getenv("COMPARE_STRICT", "1") != "0" - ok = True - ok = compare_bin("golden_v2.bin", "v2.bin", np.float32, 0.0001) and ok - if not ok: - if strict: - print("[ERROR] compare failed") - sys.exit(2) - print("[WARN] compare failed (non-gating)") - return - print("[INFO] compare passed") - - -if __name__ == "__main__": - main() diff --git a/test/vpto/cases/micro-op/vector-load-store/vlds/golden.py b/test/vpto/cases/micro-op/vector-load-store/vlds/golden.py deleted file mode 100755 index 21c58baab..000000000 --- a/test/vpto/cases/micro-op/vector-load-store/vlds/golden.py +++ /dev/null @@ -1,51 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) 2026 Huawei Technologies Co., Ltd. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. - -# case: micro-op/vector-load-store/vlds -# family: vector-load-store -# target_ops: pto.vlds -# scenarios: core-f32, contiguous, full-mask, aligned, dist-norm -# NOTE: bulk-generated coverage skeleton. 
-# coding=utf-8 - -import argparse -from pathlib import Path - -import numpy as np - - -ROWS = 32 -COLS = 32 -SEED = 19 - - -def generate(output_dir: Path, seed: int) -> None: - rng = np.random.default_rng(seed) - v1 = rng.uniform(-8.0, 8.0, size=(ROWS, COLS)).astype(np.float32) - v2 = np.zeros((ROWS, COLS), dtype=np.float32) - golden_v2 = v1.astype(np.float32, copy=False) - - output_dir.mkdir(parents=True, exist_ok=True) - v1.reshape(-1).tofile(output_dir / "v1.bin") - v2.reshape(-1).tofile(output_dir / "v2.bin") - golden_v2.reshape(-1).tofile(output_dir / "golden_v2.bin") - - -def main() -> None: - parser = argparse.ArgumentParser( - description="Generate numpy-based inputs/golden for VPTO micro-op vlds validation." - ) - parser.add_argument("--output-dir", type=Path, default=Path(".")) - parser.add_argument("--seed", type=int, default=SEED) - args = parser.parse_args() - generate(args.output_dir, args.seed) - - -if __name__ == "__main__": - main() diff --git a/test/vpto/cases/micro-op/vector-load-store/vlds/kernel.pto b/test/vpto/cases/micro-op/vector-load-store/vlds/kernel.pto deleted file mode 100644 index 3ba1b9aa5..000000000 --- a/test/vpto/cases/micro-op/vector-load-store/vlds/kernel.pto +++ /dev/null @@ -1,69 +0,0 @@ -// ----------------------------------------------------------------------------- -// case: micro-op/vector-load-store/vlds -// family: vector-load-store -// target_ops: pto.vlds -// scenarios: core-f32, contiguous, full-mask, aligned, dist-norm -// NOTE: bulk-generated coverage skeleton. Parser/verifier/lowering failure is -// still a valid test conclusion in the current coverage-first phase. 
-// ----------------------------------------------------------------------------- -// ============================================================================= -// abs_kernel_2d: Element-wise absolute value on a 32x32 f32 tile -// ============================================================================= -// This kernel computes abs(input) for a 32x32 float32 matrix (1024 elements). -// -// Memory Layout: -// - Input: arg0 -> GM (Global Memory) -// - Output: arg1 -> GM (Global Memory) -// - UB (Unified Buffer) at offset 0: input tile (4096 bytes = 32*32*4) -// - UB at offset 4096: output tile (4096 bytes = 32*32*4) -// -// Pipeline: -// 1. DMA: GM -> UB (MTE2 pipe) - copy input tile to UB -// 2. Sync: wait for MTE2 -> V pipe handoff -// 3. Compute: vabs on 64-element vectors (V pipe) - 16 iterations for 1024 elements -// 4. Sync: wait for V -> MTE3 pipe handoff -// 5. DMA: UB -> GM (MTE3 pipe) - copy result tile back to GM -// ============================================================================= - -module attributes {pto.target_arch = "a5", pto.kernel_kind = #pto.kernel_kind} { - func.func @vabs_kernel_2d(%arg0: !pto.ptr, %arg1: !pto.ptr) attributes {pto.kernel} { - %c0 = arith.constant 0 : index - %c1 = arith.constant 1 : index - %c64 = arith.constant 64 : index - %c1024 = arith.constant 1024 : index - %c0_i64 = arith.constant 0 : i64 - %c1_i64 = arith.constant 1 : i64 - %c32_i64 = arith.constant 32 : i64 - %c128_i64 = arith.constant 128 : i64 - %c4096_i64 = arith.constant 4096 : i64 - %c1024_i32 = arith.constant 1024 : i32 - - %ub_in = pto.castptr %c0_i64 : i64 -> !pto.ptr - %ub_out = pto.castptr %c4096_i64 : i64 -> !pto.ptr - - %false = arith.constant false - pto.mte_gm_ub %arg0, %ub_in, %c0_i64, %c128_i64 - nburst(%c32_i64, %c128_i64, %c128_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 - - pto.set_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] - pto.wait_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] - - pto.vecscope { - %_:1 = scf.for %offset = %c0 
to %c1024 step %c64 iter_args(%remaining = %c1024_i32) -> (i32) { - %mask, %next_remaining = pto.plt_b32 %remaining : i32 -> !pto.mask, i32 - %out = pto.vlds %ub_in[%offset] {dist = "NORM"} : !pto.ptr -> !pto.vreg<64xf32> - pto.vsts %out, %ub_out[%offset], %mask : !pto.vreg<64xf32>, !pto.ptr, !pto.mask - scf.yield %next_remaining : i32 - } - } - - pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.mte_ub_gm %ub_out, %arg1, %c128_i64 - nburst(%c32_i64, %c128_i64, %c128_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64 - pto.barrier #pto.pipe - return - } -} diff --git a/test/vpto/cases/micro-op/vector-load-store/vlds/launch.cpp b/test/vpto/cases/micro-op/vector-load-store/vlds/launch.cpp deleted file mode 100644 index 2e2fa02fb..000000000 --- a/test/vpto/cases/micro-op/vector-load-store/vlds/launch.cpp +++ /dev/null @@ -1,70 +0,0 @@ -// Copyright (c) 2026 Huawei Technologies Co., Ltd. -// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). -// Please refer to the License for details. You may not use this file except in compliance with the License. -// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -// See LICENSE in the root of the software repository for the full text of the License. - -// ----------------------------------------------------------------------------- -// case: micro-op/vector-load-store/vlds -// family: vector-load-store -// target_ops: pto.vlds -// scenarios: core-f32, contiguous, full-mask, aligned, dist-norm -// NOTE: bulk-generated coverage skeleton. Parser/verifier/lowering failure is -// still a valid test conclusion in the current coverage-first phase. 
-// ----------------------------------------------------------------------------- -// --------------------------------------------------------------------------- -// PTOAS compatibility layer -// -// The upstream pto-isa headers reference some FP8/FP4 types and the -// __VEC_SCOPE__ marker that are not available on every AICore arch/toolchain -// combination (e.g. __NPU_ARCH__==2201). -// -// For our PTOAS-generated kernels we don't rely on these types today, but the -// headers still mention them in templates/static_asserts. Provide minimal -// fallbacks to keep compilation working on dav-c220. -// --------------------------------------------------------------------------- -#ifndef __VEC_SCOPE__ -#define __VEC_SCOPE__ -#endif - -#if defined(__CCE_AICORE__) && defined(__NPU_ARCH__) && (__NPU_ARCH__ == 2201) -typedef struct { unsigned char v; } hifloat8_t; -typedef struct { unsigned char v; } float8_e4m3_t; -typedef struct { unsigned char v; } float8_e5m2_t; -typedef struct { unsigned char v; } float8_e8m0_t; -typedef struct { unsigned char v; } float4_e1m2x2_t; -typedef struct { unsigned char v; } float4_e2m1x2_t; -#endif -#include - -// AICore printf support is gated behind `--cce-enable-print` on some -// toolchains. When enabled, include the CCE print header so `cce::printf` -// resolves in device compilation. -#if defined(__CCE_AICORE__) && defined(PTOAS_ENABLE_CCE_PRINT) -#include -#endif - -// Some PTO-ISA types are only available in the __CCE_AICORE__ compilation -// path, but `bisheng -xcce` still performs a host-side parse pass. -// Provide minimal fallbacks only when the corresponding header wasn't -// pulled in by the selected arch implementation. 
-#if !defined(__CCE_AICORE__) && !defined(TMRGSORT_HPP) -struct MrgSortExecutedNumList { - uint16_t mrgSortList0; - uint16_t mrgSortList1; - uint16_t mrgSortList2; - uint16_t mrgSortList3; -}; -#endif -#ifndef __CPU_SIM -#include "acl/acl.h" -#endif - -extern "C" __global__ [aicore] void vabs_kernel_2d(__gm__ float *v1, - __gm__ float *v2); - -void LaunchVabs_kernel_2d(float *v1, float *v2, void *stream) { - vabs_kernel_2d<<<1, nullptr, stream>>>((__gm__ float *)v1, (__gm__ float *)v2); -} diff --git a/test/vpto/cases/micro-op/vector-load-store/vlds/main.cpp b/test/vpto/cases/micro-op/vector-load-store/vlds/main.cpp deleted file mode 100644 index ab816737d..000000000 --- a/test/vpto/cases/micro-op/vector-load-store/vlds/main.cpp +++ /dev/null @@ -1,130 +0,0 @@ -// Copyright (c) 2026 Huawei Technologies Co., Ltd. -// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). -// Please refer to the License for details. You may not use this file except in compliance with the License. -// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -// See LICENSE in the root of the software repository for the full text of the License. - -// ----------------------------------------------------------------------------- -// case: micro-op/vector-load-store/vlds -// family: vector-load-store -// target_ops: pto.vlds -// scenarios: core-f32, contiguous, full-mask, aligned, dist-norm -// NOTE: bulk-generated coverage skeleton. Parser/verifier/lowering failure is -// still a valid test conclusion in the current coverage-first phase. -// ----------------------------------------------------------------------------- -/** -Copyright (c) 2025 Huawei Technologies Co., Ltd. 
-This program is free software, you can redistribute it and/or modify it under the terms and conditions of -CANN Open Software License Agreement Version 2.0 (the "License"). -Please refer to the License for details. You may not use this file except in compliance with the License. -THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -See LICENSE in the root of the software repository for the full text of the License. -*/ - -#include "test_common.h" -#include "acl/acl.h" -#include -#include -#include - -using namespace PtoTestCommon; - -#ifndef TMRGSORT_HPP -struct MrgSortExecutedNumList { - uint16_t mrgSortList0; - uint16_t mrgSortList1; - uint16_t mrgSortList2; - uint16_t mrgSortList3; -}; -#endif - -#define ACL_CHECK(expr) \ - do { \ - const aclError _ret = (expr); \ - if (_ret != ACL_SUCCESS) { \ - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", #expr, (int)_ret, __FILE__, __LINE__); \ - const char *_recent = aclGetRecentErrMsg(); \ - if (_recent != nullptr && _recent[0] != '\0') { \ - std::fprintf(stderr, "[ERROR] RecentErrMsg: %s\n", _recent); \ - } \ - rc = 1; \ - goto cleanup; \ - } \ - } while (0) - -void LaunchVabs_kernel_2d(float *v1, float *v2, void *stream); - -int main() { - size_t elemCount_v1 = 1024; - size_t fileSize_v1 = elemCount_v1 * sizeof(float); - size_t elemCount_v2 = 1024; - size_t fileSize_v2 = elemCount_v2 * sizeof(float); - float *v1Host = nullptr; - float *v1Device = nullptr; - float *v2Host = nullptr; - float *v2Device = nullptr; - - int rc = 0; - bool aclInited = false; - bool deviceSet = false; - int deviceId = 0; - aclrtStream stream = nullptr; - - ACL_CHECK(aclInit(nullptr)); - aclInited = true; - if (const char *envDevice = std::getenv("ACL_DEVICE_ID")) { - deviceId = std::atoi(envDevice); - } - ACL_CHECK(aclrtSetDevice(deviceId)); - deviceSet = true; - 
ACL_CHECK(aclrtCreateStream(&stream)); - - ACL_CHECK(aclrtMallocHost((void **)(&v1Host), fileSize_v1)); - ACL_CHECK(aclrtMallocHost((void **)(&v2Host), fileSize_v2)); - ACL_CHECK(aclrtMalloc((void **)&v1Device, fileSize_v1, ACL_MEM_MALLOC_HUGE_FIRST)); - ACL_CHECK(aclrtMalloc((void **)&v2Device, fileSize_v2, ACL_MEM_MALLOC_HUGE_FIRST)); - - ReadFile("./v1.bin", fileSize_v1, v1Host, fileSize_v1); - ReadFile("./v2.bin", fileSize_v2, v2Host, fileSize_v2); - ACL_CHECK(aclrtMemcpy(v1Device, fileSize_v1, v1Host, fileSize_v1, ACL_MEMCPY_HOST_TO_DEVICE)); - ACL_CHECK(aclrtMemcpy(v2Device, fileSize_v2, v2Host, fileSize_v2, ACL_MEMCPY_HOST_TO_DEVICE)); - LaunchVabs_kernel_2d(v1Device, v2Device, stream); - - ACL_CHECK(aclrtSynchronizeStream(stream)); - ACL_CHECK(aclrtMemcpy(v2Host, fileSize_v2, v2Device, fileSize_v2, ACL_MEMCPY_DEVICE_TO_HOST)); - - WriteFile("./v2.bin", v2Host, fileSize_v2); - -cleanup: - aclrtFree(v1Device); - aclrtFree(v2Device); - aclrtFreeHost(v1Host); - aclrtFreeHost(v2Host); - if (stream != nullptr) { - const aclError _ret = aclrtDestroyStream(stream); - if (_ret != ACL_SUCCESS) { - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", - "aclrtDestroyStream(stream)", (int)_ret, __FILE__, __LINE__); - } - stream = nullptr; - } - if (deviceSet) { - const aclError _ret = aclrtResetDevice(deviceId); - if (_ret != ACL_SUCCESS) { - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", - "aclrtResetDevice(deviceId)", (int)_ret, __FILE__, __LINE__); - } - } - if (aclInited) { - const aclError _ret = aclFinalize(); - if (_ret != ACL_SUCCESS) { - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", - "aclFinalize()", (int)_ret, __FILE__, __LINE__); - } - } - - return rc; -} diff --git a/test/vpto/cases/micro-op/vector-load-store/vldsx2-layout-check/compare.py b/test/vpto/cases/micro-op/vector-load-store/vldsx2-layout-check/compare.py deleted file mode 100755 index a4c5fae81..000000000 --- 
a/test/vpto/cases/micro-op/vector-load-store/vldsx2-layout-check/compare.py +++ /dev/null @@ -1,209 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) 2026 Huawei Technologies Co., Ltd. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. - -# case: micro-op/vector-load-store/vldsx2-layout-check -# family: vector-load-store -# target_ops: pto.vldsx2 -# scenarios: core-f32, full-mask, paired-roundtrip, dintlv, lane-order -# NOTE: bulk-generated coverage skeleton. -# coding=utf-8 - -import os -import sys -import numpy as np - - -REPEAT_BYTES = 256 - - -def _ceil_div(x, y): - return (x + y - 1) // y - - -def _packed_pred_storage_bytes(logical_elems, src_elem_bytes): - logical_elems = int(logical_elems) - src_elem_bytes = int(src_elem_bytes) - if logical_elems <= 0: - raise ValueError(f"logical_elems must be > 0, got {logical_elems}") - if src_elem_bytes not in (1, 2, 4): - raise ValueError(f"unsupported packed predicate source size: {src_elem_bytes}") - - repeat_elems = REPEAT_BYTES // src_elem_bytes - if src_elem_bytes == 4: - repeat_times = _ceil_div(logical_elems, repeat_elems) + 1 - loop_count = repeat_times // 2 - return loop_count * 16 - - repeat_times = _ceil_div(logical_elems, repeat_elems) - return repeat_times * (repeat_elems // 8) - - -def compare_bin(golden_path, output_path, dtype, eps): - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - 
print(f"[ERROR] Golden missing: {golden_path}") - return False - dtype_np = np.dtype(dtype) - golden = np.fromfile(golden_path, dtype=dtype_np) - output = np.fromfile(output_path, dtype=dtype_np) - if golden.shape != output.shape: - print(f"[ERROR] Shape mismatch: {golden_path} {golden.shape} vs {output_path} {output.shape}") - return False - if not np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True): - if golden.size: - if np.issubdtype(dtype_np, np.floating): - g = golden.astype(np.float64, copy=False) - o = output.astype(np.float64, copy=False) - elif np.issubdtype(dtype_np, np.integer) or np.issubdtype(dtype_np, np.unsignedinteger): - g = golden.astype(np.int64, copy=False) - o = output.astype(np.int64, copy=False) - else: - g = golden.astype(np.float64, copy=False) - o = output.astype(np.float64, copy=False) - abs_diff = np.abs(g - o) - idx = int(np.argmax(abs_diff)) - diff = float(abs_diff[idx]) - print( - f"[ERROR] Mismatch: {golden_path} vs {output_path}, max diff={diff} at idx={idx} " - f"(golden={g[idx]}, out={o[idx]}, dtype={dtype_np})" - ) - else: - print(f"[ERROR] Mismatch: {golden_path} vs {output_path}, empty buffers, dtype={dtype_np}") - return False - return True - - -def compare_bin_prefix(golden_path, output_path, dtype, eps, count): - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - try: - count = int(count) - except Exception: - print(f"[ERROR] Invalid prefix count: {count}") - return False - if count <= 0: - print(f"[ERROR] Invalid prefix count: {count}") - return False - - dtype_np = np.dtype(dtype) - golden = np.fromfile(golden_path, dtype=dtype_np, count=count) - output = np.fromfile(output_path, dtype=dtype_np, count=count) - - if golden.size != count or output.size != count: - print( - f"[ERROR] Prefix read too small: need={count} elems, " - f"golden={golden.size}, 
out={output.size}" - ) - return False - - if not np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True): - if golden.size: - if np.issubdtype(dtype_np, np.floating): - g = golden.astype(np.float64, copy=False) - o = output.astype(np.float64, copy=False) - elif np.issubdtype(dtype_np, np.integer) or np.issubdtype(dtype_np, np.unsignedinteger): - g = golden.astype(np.int64, copy=False) - o = output.astype(np.int64, copy=False) - else: - g = golden.astype(np.float64, copy=False) - o = output.astype(np.float64, copy=False) - abs_diff = np.abs(g - o) - idx = int(np.argmax(abs_diff)) - diff = float(abs_diff[idx]) - print( - f"[ERROR] Mismatch (prefix): {golden_path} vs {output_path}, max diff={diff} at idx={idx} " - f"(golden={g[idx]}, out={o[idx]}, dtype={dtype_np}, count={count})" - ) - else: - print(f"[ERROR] Mismatch (prefix): {golden_path} vs {output_path}, empty buffers, dtype={dtype_np}") - return False - return True - - -def compare_packed_pred_mask(golden_path, output_path, logical_elems, src_elem_bytes): - """ - Compare outputs of pto.tcmp / pto.tcmps. - - PTO-ISA stores packed predicate results as a linear PK byte stream via - `psts`, with the exact written prefix length determined by the typed - TCMP/TCMPS repeat schedule. Compare only that semantic prefix. 
- """ - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - try: - logical_elems = int(logical_elems) - src_elem_bytes = int(src_elem_bytes) - except Exception: - print( - "[ERROR] Invalid packed mask compare arguments: " - f"logical_elems={logical_elems} src_elem_bytes={src_elem_bytes}" - ) - return False - if logical_elems <= 0 or src_elem_bytes <= 0: - print( - "[ERROR] Invalid packed mask compare arguments: " - f"logical_elems={logical_elems} src_elem_bytes={src_elem_bytes}" - ) - return False - - golden = np.fromfile(golden_path, dtype=np.uint8) - output = np.fromfile(output_path, dtype=np.uint8) - try: - prefix_bytes = _packed_pred_storage_bytes(logical_elems, src_elem_bytes) - except ValueError as exc: - print(f"[ERROR] {exc}") - return False - - if golden.size < prefix_bytes or output.size < prefix_bytes: - print( - f"[ERROR] Packed mask buffer too small: need={prefix_bytes} bytes, " - f"golden={golden.size}, out={output.size}" - ) - return False - - golden_sel = golden[:prefix_bytes] - output_sel = output[:prefix_bytes] - - if not np.array_equal(golden_sel, output_sel): - diff = np.nonzero(golden_sel != output_sel)[0] - idx = int(diff[0]) if diff.size else 0 - print( - f"[ERROR] Mismatch (packed mask): {golden_path} vs {output_path}, first diff at idx={idx} " - f"(golden={int(golden_sel[idx])}, out={int(output_sel[idx])}, " - f"logical_elems={logical_elems}, src_elem_bytes={src_elem_bytes}, prefix_bytes={prefix_bytes})" - ) - return False - return True - - -def main(): - strict = os.getenv("COMPARE_STRICT", "1") != "0" - ok = True - ok = compare_bin("golden_v2.bin", "v2.bin", np.float32, 0.0001) and ok - if not ok: - if strict: - print("[ERROR] compare failed") - sys.exit(2) - print("[WARN] compare failed (non-gating)") - return - print("[INFO] compare passed") - - -if __name__ == "__main__": - main() 
diff --git a/test/vpto/cases/micro-op/vector-load-store/vldsx2-layout-check/golden.py b/test/vpto/cases/micro-op/vector-load-store/vldsx2-layout-check/golden.py deleted file mode 100755 index 8c481d96b..000000000 --- a/test/vpto/cases/micro-op/vector-load-store/vldsx2-layout-check/golden.py +++ /dev/null @@ -1,62 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) 2026 Huawei Technologies Co., Ltd. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. - -# case: micro-op/vector-load-store/vldsx2-layout-check -# family: vector-load-store -# target_ops: pto.vldsx2 -# scenarios: core-f32, full-mask, dintlv, lane-order, split-observation -# NOTE: bulk-generated coverage skeleton. -# coding=utf-8 - -import argparse -from pathlib import Path - -import numpy as np - - -ROWS = 32 -COLS = 32 -ACTIVE = 128 -SEED = 19 - - -def generate(output_dir: Path, seed: int) -> None: - rng = np.random.default_rng(seed) - v1 = rng.uniform(-8.0, 8.0, size=(ROWS, COLS)).astype(np.float32) - v2 = np.zeros((ROWS, COLS), dtype=np.float32) - golden_v2 = np.zeros((ROWS, COLS), dtype=np.float32).reshape(-1) - flat = v1.reshape(-1) - - # DINTLV_B32 exposes the two deinterleaved 64-lane results independently. 
- # Observe them through two plain NORM_B32 stores: - # low -> output[offset : offset + 64] - # high -> output[offset + 64 : offset + 128] - for base in range(0, ROWS * COLS, ACTIVE): - chunk = flat[base : base + ACTIVE] - golden_v2[base : base + 64] = chunk[0::2] - golden_v2[base + 64 : base + 128] = chunk[1::2] - - output_dir.mkdir(parents=True, exist_ok=True) - v1.reshape(-1).tofile(output_dir / "v1.bin") - v2.reshape(-1).tofile(output_dir / "v2.bin") - golden_v2.tofile(output_dir / "golden_v2.bin") - - -def main() -> None: - parser = argparse.ArgumentParser( - description="Generate numpy-based inputs/golden for VPTO micro-op vldsx2 layout validation." - ) - parser.add_argument("--output-dir", type=Path, default=Path(".")) - parser.add_argument("--seed", type=int, default=SEED) - args = parser.parse_args() - generate(args.output_dir, args.seed) - - -if __name__ == "__main__": - main() diff --git a/test/vpto/cases/micro-op/vector-load-store/vldsx2-layout-check/kernel.pto b/test/vpto/cases/micro-op/vector-load-store/vldsx2-layout-check/kernel.pto deleted file mode 100644 index 3bb2b29c6..000000000 --- a/test/vpto/cases/micro-op/vector-load-store/vldsx2-layout-check/kernel.pto +++ /dev/null @@ -1,57 +0,0 @@ -// ----------------------------------------------------------------------------- -// case: micro-op/vector-load-store/vldsx2-layout-check -// family: vector-load-store -// target_ops: pto.vldsx2 -// scenarios: core-f32, full-mask, dintlv, lane-order, split-observation -// ----------------------------------------------------------------------------- -module attributes {pto.target_arch = "a5", pto.kernel_kind = #pto.kernel_kind} { - func.func @vldx2_layout_check_kernel(%arg0: !pto.ptr, - %arg1: !pto.ptr) attributes {pto.kernel} { - %c0 = arith.constant 0 : index - %c1 = arith.constant 1 : index - %c64 = arith.constant 64 : index - %c8 = arith.constant 8 : index - %c128 = arith.constant 128 : index - %c1_i64 = arith.constant 1 : i64 - %c0_i64 = arith.constant 0 : 
i64 - %c32_i64 = arith.constant 32 : i64 - %c128_i64 = arith.constant 128 : i64 - %c4096_i64 = arith.constant 4096 : i64 - %c64_i32 = arith.constant 64 : i32 - %false = arith.constant false - - %ub_in = pto.castptr %c0_i64 : i64 -> !pto.ptr - %ub_out = pto.castptr %c4096_i64 : i64 -> !pto.ptr - pto.mte_gm_ub %arg0, %ub_in, %c0_i64, %c128_i64 - nburst(%c32_i64, %c128_i64, %c128_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 - - pto.set_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] - pto.wait_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] - - pto.vecscope { - scf.for %group = %c0 to %c8 step %c1 { - %group_base = arith.muli %group, %c128 : index - scf.for %chunk = %c0 to %c128 step %c128 { - %offset = arith.addi %group_base, %chunk : index - %high_offset = arith.addi %offset, %c64 : index - %mask, %remaining = pto.plt_b32 %c64_i32 : i32 -> !pto.mask, i32 - %x, %y = pto.vldsx2 %ub_in[%offset], "DINTLV_B32" - : !pto.ptr, index -> !pto.vreg<64xf32>, !pto.vreg<64xf32> - pto.vsts %x, %ub_out[%offset], %mask {dist = "NORM_B32"} - : !pto.vreg<64xf32>, !pto.ptr, !pto.mask - pto.vsts %y, %ub_out[%high_offset], %mask {dist = "NORM_B32"} - : !pto.vreg<64xf32>, !pto.ptr, !pto.mask - } - } - } - - pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.mte_ub_gm %ub_out, %arg1, %c128_i64 - nburst(%c32_i64, %c128_i64, %c128_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64 - pto.barrier #pto.pipe - return - } -} diff --git a/test/vpto/cases/micro-op/vector-load-store/vldsx2-layout-check/launch.cpp b/test/vpto/cases/micro-op/vector-load-store/vldsx2-layout-check/launch.cpp deleted file mode 100644 index d06dda18c..000000000 --- a/test/vpto/cases/micro-op/vector-load-store/vldsx2-layout-check/launch.cpp +++ /dev/null @@ -1,71 +0,0 @@ -// Copyright (c) 2026 Huawei Technologies Co., Ltd. 
-// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). -// Please refer to the License for details. You may not use this file except in compliance with the License. -// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -// See LICENSE in the root of the software repository for the full text of the License. - -// ----------------------------------------------------------------------------- -// case: micro-op/vector-load-store/vldsx2-layout-check -// family: vector-load-store -// target_ops: pto.vldsx2 -// scenarios: core-f32, full-mask, paired-roundtrip, dintlv, lane-order -// NOTE: bulk-generated coverage skeleton. Parser/verifier/lowering failure is -// still a valid test conclusion in the current coverage-first phase. -// ----------------------------------------------------------------------------- -// --------------------------------------------------------------------------- -// PTOAS compatibility layer -// -// The upstream pto-isa headers reference some FP8/FP4 types and the -// __VEC_SCOPE__ marker that are not available on every AICore arch/toolchain -// combination (e.g. __NPU_ARCH__==2201). -// -// For our PTOAS-generated kernels we don't rely on these types today, but the -// headers still mention them in templates/static_asserts. Provide minimal -// fallbacks to keep compilation working on dav-c220. 
-// --------------------------------------------------------------------------- -#ifndef __VEC_SCOPE__ -#define __VEC_SCOPE__ -#endif - -#if defined(__CCE_AICORE__) && defined(__NPU_ARCH__) && (__NPU_ARCH__ == 2201) -typedef struct { unsigned char v; } hifloat8_t; -typedef struct { unsigned char v; } float8_e4m3_t; -typedef struct { unsigned char v; } float8_e5m2_t; -typedef struct { unsigned char v; } float8_e8m0_t; -typedef struct { unsigned char v; } float4_e1m2x2_t; -typedef struct { unsigned char v; } float4_e2m1x2_t; -#endif -#include - -// AICore printf support is gated behind `--cce-enable-print` on some -// toolchains. When enabled, include the CCE print header so `cce::printf` -// resolves in device compilation. -#if defined(__CCE_AICORE__) && defined(PTOAS_ENABLE_CCE_PRINT) -#include -#endif - -// Some PTO-ISA types are only available in the __CCE_AICORE__ compilation -// path, but `bisheng -xcce` still performs a host-side parse pass. -// Provide minimal fallbacks only when the corresponding header wasn't -// pulled in by the selected arch implementation. -#if !defined(__CCE_AICORE__) && !defined(TMRGSORT_HPP) -struct MrgSortExecutedNumList { - uint16_t mrgSortList0; - uint16_t mrgSortList1; - uint16_t mrgSortList2; - uint16_t mrgSortList3; -}; -#endif -#ifndef __CPU_SIM -#include "acl/acl.h" -#endif - -extern "C" __global__ [aicore] void vldx2_layout_check_kernel(__gm__ float *v1, - __gm__ float *v2); - -void LaunchVldx2_layout_check_kernel(float *v1, float *v2, void *stream) { - vldx2_layout_check_kernel<<<1, nullptr, stream>>>((__gm__ float *)v1, - (__gm__ float *)v2); -} diff --git a/test/vpto/cases/micro-op/vector-load-store/vldsx2-layout-check/main.cpp b/test/vpto/cases/micro-op/vector-load-store/vldsx2-layout-check/main.cpp deleted file mode 100644 index 45e578b57..000000000 --- a/test/vpto/cases/micro-op/vector-load-store/vldsx2-layout-check/main.cpp +++ /dev/null @@ -1,130 +0,0 @@ -// Copyright (c) 2026 Huawei Technologies Co., Ltd. 
-// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). -// Please refer to the License for details. You may not use this file except in compliance with the License. -// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -// See LICENSE in the root of the software repository for the full text of the License. - -// ----------------------------------------------------------------------------- -// case: micro-op/vector-load-store/vldsx2-layout-check -// family: vector-load-store -// target_ops: pto.vldsx2 -// scenarios: core-f32, full-mask, paired-roundtrip, dintlv, lane-order -// NOTE: bulk-generated coverage skeleton. Parser/verifier/lowering failure is -// still a valid test conclusion in the current coverage-first phase. -// ----------------------------------------------------------------------------- -/** -Copyright (c) 2025 Huawei Technologies Co., Ltd. -This program is free software, you can redistribute it and/or modify it under the terms and conditions of -CANN Open Software License Agreement Version 2.0 (the "License"). -Please refer to the License for details. You may not use this file except in compliance with the License. -THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -See LICENSE in the root of the software repository for the full text of the License. 
-*/ - -#include "test_common.h" -#include "acl/acl.h" -#include -#include -#include - -using namespace PtoTestCommon; - -#ifndef TMRGSORT_HPP -struct MrgSortExecutedNumList { - uint16_t mrgSortList0; - uint16_t mrgSortList1; - uint16_t mrgSortList2; - uint16_t mrgSortList3; -}; -#endif - -#define ACL_CHECK(expr) \ - do { \ - const aclError _ret = (expr); \ - if (_ret != ACL_SUCCESS) { \ - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", #expr, (int)_ret, __FILE__, __LINE__); \ - const char *_recent = aclGetRecentErrMsg(); \ - if (_recent != nullptr && _recent[0] != '\0') { \ - std::fprintf(stderr, "[ERROR] RecentErrMsg: %s\n", _recent); \ - } \ - rc = 1; \ - goto cleanup; \ - } \ - } while (0) - -void LaunchVldx2_layout_check_kernel(float *v1, float *v2, void *stream); - -int main() { - size_t elemCount_v1 = 1024; - size_t fileSize_v1 = elemCount_v1 * sizeof(float); - size_t elemCount_v2 = 1024; - size_t fileSize_v2 = elemCount_v2 * sizeof(float); - float *v1Host = nullptr; - float *v1Device = nullptr; - float *v2Host = nullptr; - float *v2Device = nullptr; - - int rc = 0; - bool aclInited = false; - bool deviceSet = false; - int deviceId = 0; - aclrtStream stream = nullptr; - - ACL_CHECK(aclInit(nullptr)); - aclInited = true; - if (const char *envDevice = std::getenv("ACL_DEVICE_ID")) { - deviceId = std::atoi(envDevice); - } - ACL_CHECK(aclrtSetDevice(deviceId)); - deviceSet = true; - ACL_CHECK(aclrtCreateStream(&stream)); - - ACL_CHECK(aclrtMallocHost((void **)(&v1Host), fileSize_v1)); - ACL_CHECK(aclrtMallocHost((void **)(&v2Host), fileSize_v2)); - ACL_CHECK(aclrtMalloc((void **)&v1Device, fileSize_v1, ACL_MEM_MALLOC_HUGE_FIRST)); - ACL_CHECK(aclrtMalloc((void **)&v2Device, fileSize_v2, ACL_MEM_MALLOC_HUGE_FIRST)); - - ReadFile("./v1.bin", fileSize_v1, v1Host, fileSize_v1); - ReadFile("./v2.bin", fileSize_v2, v2Host, fileSize_v2); - ACL_CHECK(aclrtMemcpy(v1Device, fileSize_v1, v1Host, fileSize_v1, ACL_MEMCPY_HOST_TO_DEVICE)); - 
ACL_CHECK(aclrtMemcpy(v2Device, fileSize_v2, v2Host, fileSize_v2, ACL_MEMCPY_HOST_TO_DEVICE)); - LaunchVldx2_layout_check_kernel(v1Device, v2Device, stream); - - ACL_CHECK(aclrtSynchronizeStream(stream)); - ACL_CHECK(aclrtMemcpy(v2Host, fileSize_v2, v2Device, fileSize_v2, ACL_MEMCPY_DEVICE_TO_HOST)); - - WriteFile("./v2.bin", v2Host, fileSize_v2); - -cleanup: - aclrtFree(v1Device); - aclrtFree(v2Device); - aclrtFreeHost(v1Host); - aclrtFreeHost(v2Host); - if (stream != nullptr) { - const aclError _ret = aclrtDestroyStream(stream); - if (_ret != ACL_SUCCESS) { - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", - "aclrtDestroyStream(stream)", (int)_ret, __FILE__, __LINE__); - } - stream = nullptr; - } - if (deviceSet) { - const aclError _ret = aclrtResetDevice(deviceId); - if (_ret != ACL_SUCCESS) { - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", - "aclrtResetDevice(deviceId)", (int)_ret, __FILE__, __LINE__); - } - } - if (aclInited) { - const aclError _ret = aclFinalize(); - if (_ret != ACL_SUCCESS) { - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", - "aclFinalize()", (int)_ret, __FILE__, __LINE__); - } - } - - return rc; -} diff --git a/test/vpto/cases/micro-op/vector-load-store/vldsx2-vstsx2-b8-f32/compare.py b/test/vpto/cases/micro-op/vector-load-store/vldsx2-vstsx2-b8-f32/compare.py deleted file mode 100644 index af950320b..000000000 --- a/test/vpto/cases/micro-op/vector-load-store/vldsx2-vstsx2-b8-f32/compare.py +++ /dev/null @@ -1,201 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) 2026 Huawei Technologies Co., Ltd. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. 
-# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. - -# case: micro-op/vector-load-store/vldsx2-vstsx2-b8-f32 -# family: vector-load-store -# target_ops: pto.vldsx2, pto.vstsx2 -# scenarios: core-f32, full-mask, paired-roundtrip, dintlv-b8-intlv-b8, width-agnostic-dist -# coding=utf-8 - -import os -import sys -import numpy as np - - -REPEAT_BYTES = 256 - - -def _ceil_div(x, y): - return (x + y - 1) // y - - -def _packed_pred_storage_bytes(logical_elems, src_elem_bytes): - logical_elems = int(logical_elems) - src_elem_bytes = int(src_elem_bytes) - if logical_elems <= 0: - raise ValueError(f"logical_elems must be > 0, got {logical_elems}") - if src_elem_bytes not in (1, 2, 4): - raise ValueError(f"unsupported packed predicate source size: {src_elem_bytes}") - - repeat_elems = REPEAT_BYTES // src_elem_bytes - if src_elem_bytes == 4: - repeat_times = _ceil_div(logical_elems, repeat_elems) + 1 - loop_count = repeat_times // 2 - return loop_count * 16 - - repeat_times = _ceil_div(logical_elems, repeat_elems) - return repeat_times * (repeat_elems // 8) - - -def compare_bin(golden_path, output_path, dtype, eps): - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - dtype_np = np.dtype(dtype) - golden = np.fromfile(golden_path, dtype=dtype_np) - output = np.fromfile(output_path, dtype=dtype_np) - if golden.shape != output.shape: - print(f"[ERROR] Shape mismatch: {golden_path} {golden.shape} vs {output_path} {output.shape}") - return False - if not np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True): - if golden.size: - if np.issubdtype(dtype_np, np.floating): - g = 
golden.astype(np.float64, copy=False) - o = output.astype(np.float64, copy=False) - elif np.issubdtype(dtype_np, np.integer) or np.issubdtype(dtype_np, np.unsignedinteger): - g = golden.astype(np.int64, copy=False) - o = output.astype(np.int64, copy=False) - else: - g = golden.astype(np.float64, copy=False) - o = output.astype(np.float64, copy=False) - abs_diff = np.abs(g - o) - idx = int(np.argmax(abs_diff)) - diff = float(abs_diff[idx]) - print( - f"[ERROR] Mismatch: {golden_path} vs {output_path}, max diff={diff} at idx={idx} " - f"(golden={g[idx]}, out={o[idx]}, dtype={dtype_np})" - ) - else: - print(f"[ERROR] Mismatch: {golden_path} vs {output_path}, empty buffers, dtype={dtype_np}") - return False - return True - - -def compare_bin_prefix(golden_path, output_path, dtype, eps, count): - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - try: - count = int(count) - except Exception: - print(f"[ERROR] Invalid prefix count: {count}") - return False - if count <= 0: - print(f"[ERROR] Invalid prefix count: {count}") - return False - - dtype_np = np.dtype(dtype) - golden = np.fromfile(golden_path, dtype=dtype_np, count=count) - output = np.fromfile(output_path, dtype=dtype_np, count=count) - - if golden.size != count or output.size != count: - print( - f"[ERROR] Prefix read too small: need={count} elems, " - f"golden={golden.size}, out={output.size}" - ) - return False - - if not np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True): - if golden.size: - if np.issubdtype(dtype_np, np.floating): - g = golden.astype(np.float64, copy=False) - o = output.astype(np.float64, copy=False) - elif np.issubdtype(dtype_np, np.integer) or np.issubdtype(dtype_np, np.unsignedinteger): - g = golden.astype(np.int64, copy=False) - o = output.astype(np.int64, copy=False) - else: - g = golden.astype(np.float64, copy=False) - 
o = output.astype(np.float64, copy=False) - abs_diff = np.abs(g - o) - idx = int(np.argmax(abs_diff)) - diff = float(abs_diff[idx]) - print( - f"[ERROR] Mismatch (prefix): {golden_path} vs {output_path}, max diff={diff} at idx={idx} " - f"(golden={g[idx]}, out={o[idx]}, dtype={dtype_np}, count={count})" - ) - else: - print(f"[ERROR] Mismatch (prefix): {golden_path} vs {output_path}, empty buffers, dtype={dtype_np}") - return False - return True - - -def compare_packed_pred_mask(golden_path, output_path, logical_elems, src_elem_bytes): - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - try: - logical_elems = int(logical_elems) - src_elem_bytes = int(src_elem_bytes) - except Exception: - print( - "[ERROR] Invalid packed mask compare arguments: " - f"logical_elems={logical_elems} src_elem_bytes={src_elem_bytes}" - ) - return False - if logical_elems <= 0 or src_elem_bytes <= 0: - print( - "[ERROR] Invalid packed mask compare arguments: " - f"logical_elems={logical_elems} src_elem_bytes={src_elem_bytes}" - ) - return False - - golden = np.fromfile(golden_path, dtype=np.uint8) - output = np.fromfile(output_path, dtype=np.uint8) - try: - prefix_bytes = _packed_pred_storage_bytes(logical_elems, src_elem_bytes) - except ValueError as exc: - print(f"[ERROR] {exc}") - return False - - if golden.size < prefix_bytes or output.size < prefix_bytes: - print( - f"[ERROR] Packed mask buffer too small: need={prefix_bytes} bytes, " - f"golden={golden.size}, out={output.size}" - ) - return False - - golden_sel = golden[:prefix_bytes] - output_sel = output[:prefix_bytes] - - if not np.array_equal(golden_sel, output_sel): - diff = np.nonzero(golden_sel != output_sel)[0] - idx = int(diff[0]) if diff.size else 0 - print( - f"[ERROR] Mismatch (packed mask): {golden_path} vs {output_path}, first diff at idx={idx} " - 
f"(golden={int(golden_sel[idx])}, out={int(output_sel[idx])}, " - f"logical_elems={logical_elems}, src_elem_bytes={src_elem_bytes}, prefix_bytes={prefix_bytes})" - ) - return False - return True - - -def main(): - strict = os.getenv("COMPARE_STRICT", "1") != "0" - ok = True - ok = compare_bin("golden_v2.bin", "v2.bin", np.float32, 0.0001) and ok - if not ok: - if strict: - print("[ERROR] compare failed") - sys.exit(2) - print("[WARN] compare failed (non-gating)") - return - print("[INFO] compare passed") - - -if __name__ == "__main__": - main() diff --git a/test/vpto/cases/micro-op/vector-load-store/vldsx2-vstsx2-b8-f32/golden.py b/test/vpto/cases/micro-op/vector-load-store/vldsx2-vstsx2-b8-f32/golden.py deleted file mode 100644 index 6732c8799..000000000 --- a/test/vpto/cases/micro-op/vector-load-store/vldsx2-vstsx2-b8-f32/golden.py +++ /dev/null @@ -1,50 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) 2026 Huawei Technologies Co., Ltd. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. 
- -# case: micro-op/vector-load-store/vldsx2-vstsx2-b8-f32 -# family: vector-load-store -# target_ops: pto.vldsx2, pto.vstsx2 -# scenarios: core-f32, full-mask, paired-roundtrip, dintlv-b8-intlv-b8, width-agnostic-dist -# coding=utf-8 - -import argparse -from pathlib import Path - -import numpy as np - - -ROWS = 32 -COLS = 32 -SEED = 19 - - -def generate(output_dir: Path, seed: int) -> None: - rng = np.random.default_rng(seed) - v1 = rng.uniform(-8.0, 8.0, size=(ROWS, COLS)).astype(np.float32) - v2 = np.zeros((ROWS, COLS), dtype=np.float32) - golden_v2 = np.array(v1, copy=True) - - output_dir.mkdir(parents=True, exist_ok=True) - v1.reshape(-1).tofile(output_dir / "v1.bin") - v2.reshape(-1).tofile(output_dir / "v2.bin") - golden_v2.reshape(-1).tofile(output_dir / "golden_v2.bin") - - -def main() -> None: - parser = argparse.ArgumentParser( - description="Generate numpy-based inputs/golden for VPTO micro-op vldsx2-vstsx2-b8-f32 validation." - ) - parser.add_argument("--output-dir", type=Path, default=Path(".")) - parser.add_argument("--seed", type=int, default=SEED) - args = parser.parse_args() - generate(args.output_dir, args.seed) - - -if __name__ == "__main__": - main() diff --git a/test/vpto/cases/micro-op/vector-load-store/vldsx2-vstsx2-b8-f32/kernel.pto b/test/vpto/cases/micro-op/vector-load-store/vldsx2-vstsx2-b8-f32/kernel.pto deleted file mode 100644 index c4346777f..000000000 --- a/test/vpto/cases/micro-op/vector-load-store/vldsx2-vstsx2-b8-f32/kernel.pto +++ /dev/null @@ -1,54 +0,0 @@ -// ----------------------------------------------------------------------------- -// case: micro-op/vector-load-store/vldsx2-vstsx2-b8-f32 -// family: vector-load-store -// target_ops: pto.vldsx2, pto.vstsx2 -// scenarios: core-f32, full-mask, paired-roundtrip, dintlv-b8-intlv-b8, width-agnostic-dist -// ----------------------------------------------------------------------------- -module attributes {pto.target_arch = "a5", pto.kernel_kind = #pto.kernel_kind} { - func.func 
@vldx2_vstsx2_b8_f32_kernel(%arg0: !pto.ptr, - %arg1: !pto.ptr) attributes {pto.kernel} { - %c0 = arith.constant 0 : index - %c1 = arith.constant 1 : index - %c8 = arith.constant 8 : index - %c128 = arith.constant 128 : index - %c1_i64 = arith.constant 1 : i64 - %c0_i64 = arith.constant 0 : i64 - %c32_i64 = arith.constant 32 : i64 - %c128_i64 = arith.constant 128 : i64 - %c4096_i64 = arith.constant 4096 : i64 - %c64_i32 = arith.constant 64 : i32 - %false = arith.constant false - - %ub_in = pto.castptr %c0_i64 : i64 -> !pto.ptr - %ub_out = pto.castptr %c4096_i64 : i64 -> !pto.ptr - pto.mte_gm_ub %arg0, %ub_in, %c0_i64, %c128_i64 - nburst(%c32_i64, %c128_i64, %c128_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 - - pto.set_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] - pto.wait_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] - - pto.vecscope { - scf.for %group = %c0 to %c8 step %c1 { - %group_base = arith.muli %group, %c128 : index - scf.for %chunk = %c0 to %c128 step %c128 { - %offset = arith.addi %group_base, %chunk : index - %mask, %remaining = pto.plt_b32 %c64_i32 : i32 -> !pto.mask, i32 - %low, %high = pto.vldsx2 %ub_in[%offset], "DINTLV_B8" - : !pto.ptr, index -> !pto.vreg<64xf32>, !pto.vreg<64xf32> - pto.vstsx2 %low, %high, %ub_out[%offset], "INTLV_B8", %mask - : !pto.vreg<64xf32>, !pto.vreg<64xf32>, !pto.ptr, index, - !pto.mask - } - } - } - - pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.mte_ub_gm %ub_out, %arg1, %c128_i64 - nburst(%c32_i64, %c128_i64, %c128_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64 - pto.barrier #pto.pipe - return - } -} diff --git a/test/vpto/cases/micro-op/vector-load-store/vldsx2-vstsx2-b8-f32/launch.cpp b/test/vpto/cases/micro-op/vector-load-store/vldsx2-vstsx2-b8-f32/launch.cpp deleted file mode 100644 index beadc1f7e..000000000 --- a/test/vpto/cases/micro-op/vector-load-store/vldsx2-vstsx2-b8-f32/launch.cpp +++ /dev/null @@ -1,50 +0,0 @@ -// Copyright (c) 2026 Huawei 
Technologies Co., Ltd. -// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). -// Please refer to the License for details. You may not use this file except in compliance with the License. -// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -// See LICENSE in the root of the software repository for the full text of the License. - -// ----------------------------------------------------------------------------- -// case: micro-op/vector-load-store/vldsx2-vstsx2-b8-f32 -// family: vector-load-store -// target_ops: pto.vldsx2, pto.vstsx2 -// scenarios: core-f32, full-mask, paired-roundtrip, dintlv-b8-intlv-b8, width-agnostic-dist -// ----------------------------------------------------------------------------- -#ifndef __VEC_SCOPE__ -#define __VEC_SCOPE__ -#endif - -#if defined(__CCE_AICORE__) && defined(__NPU_ARCH__) && (__NPU_ARCH__ == 2201) -typedef struct { unsigned char v; } hifloat8_t; -typedef struct { unsigned char v; } float8_e4m3_t; -typedef struct { unsigned char v; } float8_e5m2_t; -typedef struct { unsigned char v; } float8_e8m0_t; -typedef struct { unsigned char v; } float4_e1m2x2_t; -typedef struct { unsigned char v; } float4_e2m1x2_t; -#endif -#include - -#if defined(__CCE_AICORE__) && defined(PTOAS_ENABLE_CCE_PRINT) -#include -#endif - -#if !defined(__CCE_AICORE__) && !defined(TMRGSORT_HPP) -struct MrgSortExecutedNumList { - uint16_t mrgSortList0; - uint16_t mrgSortList1; - uint16_t mrgSortList2; - uint16_t mrgSortList3; -}; -#endif -#ifndef __CPU_SIM -#include "acl/acl.h" -#endif - -extern "C" __global__ [aicore] void vldx2_vstsx2_b8_f32_kernel(__gm__ float *v1, - __gm__ float *v2); - -void LaunchVldx2_vstsx2_b8_f32_kernel(float *v1, float *v2, void *stream) { - 
vldx2_vstsx2_b8_f32_kernel<<<1, nullptr, stream>>>((__gm__ float *)v1, (__gm__ float *)v2); -} diff --git a/test/vpto/cases/micro-op/vector-load-store/vldsx2-vstsx2-b8-f32/main.cpp b/test/vpto/cases/micro-op/vector-load-store/vldsx2-vstsx2-b8-f32/main.cpp deleted file mode 100644 index 61686d35b..000000000 --- a/test/vpto/cases/micro-op/vector-load-store/vldsx2-vstsx2-b8-f32/main.cpp +++ /dev/null @@ -1,128 +0,0 @@ -// Copyright (c) 2026 Huawei Technologies Co., Ltd. -// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). -// Please refer to the License for details. You may not use this file except in compliance with the License. -// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -// See LICENSE in the root of the software repository for the full text of the License. - -// ----------------------------------------------------------------------------- -// case: micro-op/vector-load-store/vldsx2-vstsx2-b8-f32 -// family: vector-load-store -// target_ops: pto.vldsx2, pto.vstsx2 -// scenarios: core-f32, full-mask, paired-roundtrip, dintlv-b8-intlv-b8, width-agnostic-dist -// ----------------------------------------------------------------------------- -/** -Copyright (c) 2025 Huawei Technologies Co., Ltd. -This program is free software, you can redistribute it and/or modify it under the terms and conditions of -CANN Open Software License Agreement Version 2.0 (the "License"). -Please refer to the License for details. You may not use this file except in compliance with the License. -THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. 
-See LICENSE in the root of the software repository for the full text of the License. -*/ - -#include "test_common.h" -#include "acl/acl.h" -#include -#include -#include - -using namespace PtoTestCommon; - -#ifndef TMRGSORT_HPP -struct MrgSortExecutedNumList { - uint16_t mrgSortList0; - uint16_t mrgSortList1; - uint16_t mrgSortList2; - uint16_t mrgSortList3; -}; -#endif - -#define ACL_CHECK(expr) \ - do { \ - const aclError _ret = (expr); \ - if (_ret != ACL_SUCCESS) { \ - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", #expr, (int)_ret, __FILE__, __LINE__); \ - const char *_recent = aclGetRecentErrMsg(); \ - if (_recent != nullptr && _recent[0] != '\0') { \ - std::fprintf(stderr, "[ERROR] RecentErrMsg: %s\n", _recent); \ - } \ - rc = 1; \ - goto cleanup; \ - } \ - } while (0) - -void LaunchVldx2_vstsx2_b8_f32_kernel(float *v1, float *v2, void *stream); - -int main() { - size_t elemCount_v1 = 1024; - size_t fileSize_v1 = elemCount_v1 * sizeof(float); - size_t elemCount_v2 = 1024; - size_t fileSize_v2 = elemCount_v2 * sizeof(float); - float *v1Host = nullptr; - float *v1Device = nullptr; - float *v2Host = nullptr; - float *v2Device = nullptr; - - int rc = 0; - bool aclInited = false; - bool deviceSet = false; - int deviceId = 0; - aclrtStream stream = nullptr; - - ACL_CHECK(aclInit(nullptr)); - aclInited = true; - if (const char *envDevice = std::getenv("ACL_DEVICE_ID")) { - deviceId = std::atoi(envDevice); - } - ACL_CHECK(aclrtSetDevice(deviceId)); - deviceSet = true; - ACL_CHECK(aclrtCreateStream(&stream)); - - ACL_CHECK(aclrtMallocHost((void **)(&v1Host), fileSize_v1)); - ACL_CHECK(aclrtMallocHost((void **)(&v2Host), fileSize_v2)); - ACL_CHECK(aclrtMalloc((void **)&v1Device, fileSize_v1, ACL_MEM_MALLOC_HUGE_FIRST)); - ACL_CHECK(aclrtMalloc((void **)&v2Device, fileSize_v2, ACL_MEM_MALLOC_HUGE_FIRST)); - - ReadFile("./v1.bin", fileSize_v1, v1Host, fileSize_v1); - ReadFile("./v2.bin", fileSize_v2, v2Host, fileSize_v2); - ACL_CHECK(aclrtMemcpy(v1Device, 
fileSize_v1, v1Host, fileSize_v1, ACL_MEMCPY_HOST_TO_DEVICE)); - ACL_CHECK(aclrtMemcpy(v2Device, fileSize_v2, v2Host, fileSize_v2, ACL_MEMCPY_HOST_TO_DEVICE)); - LaunchVldx2_vstsx2_b8_f32_kernel(v1Device, v2Device, stream); - - ACL_CHECK(aclrtSynchronizeStream(stream)); - ACL_CHECK(aclrtMemcpy(v2Host, fileSize_v2, v2Device, fileSize_v2, ACL_MEMCPY_DEVICE_TO_HOST)); - - WriteFile("./v2.bin", v2Host, fileSize_v2); - -cleanup: - aclrtFree(v1Device); - aclrtFree(v2Device); - aclrtFreeHost(v1Host); - aclrtFreeHost(v2Host); - if (stream != nullptr) { - const aclError _ret = aclrtDestroyStream(stream); - if (_ret != ACL_SUCCESS) { - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", - "aclrtDestroyStream(stream)", (int)_ret, __FILE__, __LINE__); - } - stream = nullptr; - } - if (deviceSet) { - const aclError _ret = aclrtResetDevice(deviceId); - if (_ret != ACL_SUCCESS) { - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", - "aclrtResetDevice(deviceId)", (int)_ret, __FILE__, __LINE__); - } - } - if (aclInited) { - const aclError _ret = aclFinalize(); - if (_ret != ACL_SUCCESS) { - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", - "aclFinalize()", (int)_ret, __FILE__, __LINE__); - } - } - - return rc; -} diff --git a/test/vpto/cases/micro-op/vector-load-store/vldsx2-vstsx2/compare.py b/test/vpto/cases/micro-op/vector-load-store/vldsx2-vstsx2/compare.py deleted file mode 100755 index b28c98567..000000000 --- a/test/vpto/cases/micro-op/vector-load-store/vldsx2-vstsx2/compare.py +++ /dev/null @@ -1,209 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) 2026 Huawei Technologies Co., Ltd. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. 
-# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. - -# case: micro-op/vector-load-store/vldsx2-vstsx2 -# family: vector-load-store -# target_ops: pto.vldsx2, pto.vstsx2 -# scenarios: core-f32, full-mask, paired-roundtrip, dintlv -# NOTE: bulk-generated coverage skeleton. -# coding=utf-8 - -import os -import sys -import numpy as np - - -REPEAT_BYTES = 256 - - -def _ceil_div(x, y): - return (x + y - 1) // y - - -def _packed_pred_storage_bytes(logical_elems, src_elem_bytes): - logical_elems = int(logical_elems) - src_elem_bytes = int(src_elem_bytes) - if logical_elems <= 0: - raise ValueError(f"logical_elems must be > 0, got {logical_elems}") - if src_elem_bytes not in (1, 2, 4): - raise ValueError(f"unsupported packed predicate source size: {src_elem_bytes}") - - repeat_elems = REPEAT_BYTES // src_elem_bytes - if src_elem_bytes == 4: - repeat_times = _ceil_div(logical_elems, repeat_elems) + 1 - loop_count = repeat_times // 2 - return loop_count * 16 - - repeat_times = _ceil_div(logical_elems, repeat_elems) - return repeat_times * (repeat_elems // 8) - - -def compare_bin(golden_path, output_path, dtype, eps): - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - dtype_np = np.dtype(dtype) - golden = np.fromfile(golden_path, dtype=dtype_np) - output = np.fromfile(output_path, dtype=dtype_np) - if golden.shape != output.shape: - print(f"[ERROR] Shape mismatch: {golden_path} {golden.shape} vs {output_path} {output.shape}") - return False - if not np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True): - if golden.size: - if np.issubdtype(dtype_np, np.floating): - g 
= golden.astype(np.float64, copy=False) - o = output.astype(np.float64, copy=False) - elif np.issubdtype(dtype_np, np.integer) or np.issubdtype(dtype_np, np.unsignedinteger): - g = golden.astype(np.int64, copy=False) - o = output.astype(np.int64, copy=False) - else: - g = golden.astype(np.float64, copy=False) - o = output.astype(np.float64, copy=False) - abs_diff = np.abs(g - o) - idx = int(np.argmax(abs_diff)) - diff = float(abs_diff[idx]) - print( - f"[ERROR] Mismatch: {golden_path} vs {output_path}, max diff={diff} at idx={idx} " - f"(golden={g[idx]}, out={o[idx]}, dtype={dtype_np})" - ) - else: - print(f"[ERROR] Mismatch: {golden_path} vs {output_path}, empty buffers, dtype={dtype_np}") - return False - return True - - -def compare_bin_prefix(golden_path, output_path, dtype, eps, count): - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - try: - count = int(count) - except Exception: - print(f"[ERROR] Invalid prefix count: {count}") - return False - if count <= 0: - print(f"[ERROR] Invalid prefix count: {count}") - return False - - dtype_np = np.dtype(dtype) - golden = np.fromfile(golden_path, dtype=dtype_np, count=count) - output = np.fromfile(output_path, dtype=dtype_np, count=count) - - if golden.size != count or output.size != count: - print( - f"[ERROR] Prefix read too small: need={count} elems, " - f"golden={golden.size}, out={output.size}" - ) - return False - - if not np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True): - if golden.size: - if np.issubdtype(dtype_np, np.floating): - g = golden.astype(np.float64, copy=False) - o = output.astype(np.float64, copy=False) - elif np.issubdtype(dtype_np, np.integer) or np.issubdtype(dtype_np, np.unsignedinteger): - g = golden.astype(np.int64, copy=False) - o = output.astype(np.int64, copy=False) - else: - g = golden.astype(np.float64, copy=False) 
- o = output.astype(np.float64, copy=False) - abs_diff = np.abs(g - o) - idx = int(np.argmax(abs_diff)) - diff = float(abs_diff[idx]) - print( - f"[ERROR] Mismatch (prefix): {golden_path} vs {output_path}, max diff={diff} at idx={idx} " - f"(golden={g[idx]}, out={o[idx]}, dtype={dtype_np}, count={count})" - ) - else: - print(f"[ERROR] Mismatch (prefix): {golden_path} vs {output_path}, empty buffers, dtype={dtype_np}") - return False - return True - - -def compare_packed_pred_mask(golden_path, output_path, logical_elems, src_elem_bytes): - """ - Compare outputs of pto.tcmp / pto.tcmps. - - PTO-ISA stores packed predicate results as a linear PK byte stream via - `psts`, with the exact written prefix length determined by the typed - TCMP/TCMPS repeat schedule. Compare only that semantic prefix. - """ - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - try: - logical_elems = int(logical_elems) - src_elem_bytes = int(src_elem_bytes) - except Exception: - print( - "[ERROR] Invalid packed mask compare arguments: " - f"logical_elems={logical_elems} src_elem_bytes={src_elem_bytes}" - ) - return False - if logical_elems <= 0 or src_elem_bytes <= 0: - print( - "[ERROR] Invalid packed mask compare arguments: " - f"logical_elems={logical_elems} src_elem_bytes={src_elem_bytes}" - ) - return False - - golden = np.fromfile(golden_path, dtype=np.uint8) - output = np.fromfile(output_path, dtype=np.uint8) - try: - prefix_bytes = _packed_pred_storage_bytes(logical_elems, src_elem_bytes) - except ValueError as exc: - print(f"[ERROR] {exc}") - return False - - if golden.size < prefix_bytes or output.size < prefix_bytes: - print( - f"[ERROR] Packed mask buffer too small: need={prefix_bytes} bytes, " - f"golden={golden.size}, out={output.size}" - ) - return False - - golden_sel = golden[:prefix_bytes] - output_sel = 
output[:prefix_bytes] - - if not np.array_equal(golden_sel, output_sel): - diff = np.nonzero(golden_sel != output_sel)[0] - idx = int(diff[0]) if diff.size else 0 - print( - f"[ERROR] Mismatch (packed mask): {golden_path} vs {output_path}, first diff at idx={idx} " - f"(golden={int(golden_sel[idx])}, out={int(output_sel[idx])}, " - f"logical_elems={logical_elems}, src_elem_bytes={src_elem_bytes}, prefix_bytes={prefix_bytes})" - ) - return False - return True - - -def main(): - strict = os.getenv("COMPARE_STRICT", "1") != "0" - ok = True - ok = compare_bin("golden_v2.bin", "v2.bin", np.float32, 0.0001) and ok - if not ok: - if strict: - print("[ERROR] compare failed") - sys.exit(2) - print("[WARN] compare failed (non-gating)") - return - print("[INFO] compare passed") - - -if __name__ == "__main__": - main() diff --git a/test/vpto/cases/micro-op/vector-load-store/vldsx2-vstsx2/golden.py b/test/vpto/cases/micro-op/vector-load-store/vldsx2-vstsx2/golden.py deleted file mode 100755 index 14d41a9a3..000000000 --- a/test/vpto/cases/micro-op/vector-load-store/vldsx2-vstsx2/golden.py +++ /dev/null @@ -1,52 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) 2026 Huawei Technologies Co., Ltd. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. 
- -# case: micro-op/vector-load-store/vldsx2-vstsx2 -# family: vector-load-store -# target_ops: pto.vldsx2, pto.vstsx2 -# scenarios: core-f32, full-mask, paired-roundtrip, dintlv -# NOTE: bulk-generated coverage skeleton. -# coding=utf-8 - -import argparse -from pathlib import Path - -import numpy as np - - -ROWS = 32 -COLS = 32 -ACTIVE = 128 -SEED = 19 - - -def generate(output_dir: Path, seed: int) -> None: - rng = np.random.default_rng(seed) - v1 = rng.uniform(-8.0, 8.0, size=(ROWS, COLS)).astype(np.float32) - v2 = np.zeros((ROWS, COLS), dtype=np.float32) - golden_v2 = np.array(v1, copy=True) - - output_dir.mkdir(parents=True, exist_ok=True) - v1.reshape(-1).tofile(output_dir / "v1.bin") - v2.reshape(-1).tofile(output_dir / "v2.bin") - golden_v2.reshape(-1).tofile(output_dir / "golden_v2.bin") - - -def main() -> None: - parser = argparse.ArgumentParser( - description="Generate numpy-based inputs/golden for VPTO micro-op vldsx2-vstsx2 validation." - ) - parser.add_argument("--output-dir", type=Path, default=Path(".")) - parser.add_argument("--seed", type=int, default=SEED) - args = parser.parse_args() - generate(args.output_dir, args.seed) - - -if __name__ == "__main__": - main() diff --git a/test/vpto/cases/micro-op/vector-load-store/vldsx2-vstsx2/kernel.pto b/test/vpto/cases/micro-op/vector-load-store/vldsx2-vstsx2/kernel.pto deleted file mode 100644 index a51a84105..000000000 --- a/test/vpto/cases/micro-op/vector-load-store/vldsx2-vstsx2/kernel.pto +++ /dev/null @@ -1,54 +0,0 @@ -// ----------------------------------------------------------------------------- -// case: micro-op/vector-load-store/vldsx2-vstsx2 -// family: vector-load-store -// target_ops: pto.vldsx2, pto.vstsx2 -// scenarios: core-f32, full-mask, paired-roundtrip, dintlv -// ----------------------------------------------------------------------------- -module attributes {pto.target_arch = "a5", pto.kernel_kind = #pto.kernel_kind} { - func.func @vldx2_vstsx2_kernel(%arg0: !pto.ptr, - %arg1: 
!pto.ptr) attributes {pto.kernel} { - %c0 = arith.constant 0 : index - %c1 = arith.constant 1 : index - %c8 = arith.constant 8 : index - %c128 = arith.constant 128 : index - %c1_i64 = arith.constant 1 : i64 - %c0_i64 = arith.constant 0 : i64 - %c32_i64 = arith.constant 32 : i64 - %c128_i64 = arith.constant 128 : i64 - %c4096_i64 = arith.constant 4096 : i64 - %c64_i32 = arith.constant 64 : i32 - %false = arith.constant false - - %ub_in = pto.castptr %c0_i64 : i64 -> !pto.ptr - %ub_out = pto.castptr %c4096_i64 : i64 -> !pto.ptr - pto.mte_gm_ub %arg0, %ub_in, %c0_i64, %c128_i64 - nburst(%c32_i64, %c128_i64, %c128_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 - - pto.set_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] - pto.wait_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] - - pto.vecscope { - scf.for %group = %c0 to %c8 step %c1 { - %group_base = arith.muli %group, %c128 : index - scf.for %chunk = %c0 to %c128 step %c128 { - %offset = arith.addi %group_base, %chunk : index - %mask, %remaining = pto.plt_b32 %c64_i32 : i32 -> !pto.mask, i32 - %low, %high = pto.vldsx2 %ub_in[%offset], "DINTLV_B32" - : !pto.ptr, index -> !pto.vreg<64xf32>, !pto.vreg<64xf32> - pto.vstsx2 %low, %high, %ub_out[%offset], "INTLV_B32", %mask - : !pto.vreg<64xf32>, !pto.vreg<64xf32>, !pto.ptr, index, - !pto.mask - } - } - } - - pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.mte_ub_gm %ub_out, %arg1, %c128_i64 - nburst(%c32_i64, %c128_i64, %c128_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64 - pto.barrier #pto.pipe - return - } -} diff --git a/test/vpto/cases/micro-op/vector-load-store/vldsx2-vstsx2/launch.cpp b/test/vpto/cases/micro-op/vector-load-store/vldsx2-vstsx2/launch.cpp deleted file mode 100644 index d7e4c3fed..000000000 --- a/test/vpto/cases/micro-op/vector-load-store/vldsx2-vstsx2/launch.cpp +++ /dev/null @@ -1,71 +0,0 @@ -// Copyright (c) 2026 Huawei Technologies Co., Ltd. 
-// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). -// Please refer to the License for details. You may not use this file except in compliance with the License. -// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -// See LICENSE in the root of the software repository for the full text of the License. - -// ----------------------------------------------------------------------------- -// case: micro-op/vector-load-store/vldsx2-vstsx2 -// family: vector-load-store -// target_ops: pto.vldsx2, pto.vstsx2 -// scenarios: core-f32, full-mask, paired-roundtrip, dintlv -// NOTE: bulk-generated coverage skeleton. Parser/verifier/lowering failure is -// still a valid test conclusion in the current coverage-first phase. -// ----------------------------------------------------------------------------- -// --------------------------------------------------------------------------- -// PTOAS compatibility layer -// -// The upstream pto-isa headers reference some FP8/FP4 types and the -// __VEC_SCOPE__ marker that are not available on every AICore arch/toolchain -// combination (e.g. __NPU_ARCH__==2201). -// -// For our PTOAS-generated kernels we don't rely on these types today, but the -// headers still mention them in templates/static_asserts. Provide minimal -// fallbacks to keep compilation working on dav-c220. 
-// --------------------------------------------------------------------------- -#ifndef __VEC_SCOPE__ -#define __VEC_SCOPE__ -#endif - -#if defined(__CCE_AICORE__) && defined(__NPU_ARCH__) && (__NPU_ARCH__ == 2201) -typedef struct { unsigned char v; } hifloat8_t; -typedef struct { unsigned char v; } float8_e4m3_t; -typedef struct { unsigned char v; } float8_e5m2_t; -typedef struct { unsigned char v; } float8_e8m0_t; -typedef struct { unsigned char v; } float4_e1m2x2_t; -typedef struct { unsigned char v; } float4_e2m1x2_t; -#endif -#include - -// AICore printf support is gated behind `--cce-enable-print` on some -// toolchains. When enabled, include the CCE print header so `cce::printf` -// resolves in device compilation. -#if defined(__CCE_AICORE__) && defined(PTOAS_ENABLE_CCE_PRINT) -#include -#endif - -// Some PTO-ISA types are only available in the __CCE_AICORE__ compilation -// path, but `bisheng -xcce` still performs a host-side parse pass. -// Provide minimal fallbacks only when the corresponding header wasn't -// pulled in by the selected arch implementation. -#if !defined(__CCE_AICORE__) && !defined(TMRGSORT_HPP) -struct MrgSortExecutedNumList { - uint16_t mrgSortList0; - uint16_t mrgSortList1; - uint16_t mrgSortList2; - uint16_t mrgSortList3; -}; -#endif -#ifndef __CPU_SIM -#include "acl/acl.h" -#endif - -extern "C" __global__ [aicore] void vldx2_vstsx2_kernel(__gm__ float *v1, - __gm__ float *v2); - -void LaunchVldx2_vstsx2_kernel(float *v1, float *v2, void *stream) { - vldx2_vstsx2_kernel<<<1, nullptr, stream>>>((__gm__ float *)v1, - (__gm__ float *)v2); -} diff --git a/test/vpto/cases/micro-op/vector-load-store/vldsx2-vstsx2/main.cpp b/test/vpto/cases/micro-op/vector-load-store/vldsx2-vstsx2/main.cpp deleted file mode 100644 index ec0f59491..000000000 --- a/test/vpto/cases/micro-op/vector-load-store/vldsx2-vstsx2/main.cpp +++ /dev/null @@ -1,130 +0,0 @@ -// Copyright (c) 2026 Huawei Technologies Co., Ltd. 
-// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). -// Please refer to the License for details. You may not use this file except in compliance with the License. -// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -// See LICENSE in the root of the software repository for the full text of the License. - -// ----------------------------------------------------------------------------- -// case: micro-op/vector-load-store/vldsx2-vstsx2 -// family: vector-load-store -// target_ops: pto.vldsx2, pto.vstsx2 -// scenarios: core-f32, full-mask, paired-roundtrip, dintlv -// NOTE: bulk-generated coverage skeleton. Parser/verifier/lowering failure is -// still a valid test conclusion in the current coverage-first phase. -// ----------------------------------------------------------------------------- -/** -Copyright (c) 2025 Huawei Technologies Co., Ltd. -This program is free software, you can redistribute it and/or modify it under the terms and conditions of -CANN Open Software License Agreement Version 2.0 (the "License"). -Please refer to the License for details. You may not use this file except in compliance with the License. -THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -See LICENSE in the root of the software repository for the full text of the License. 
-*/ - -#include "test_common.h" -#include "acl/acl.h" -#include -#include -#include - -using namespace PtoTestCommon; - -#ifndef TMRGSORT_HPP -struct MrgSortExecutedNumList { - uint16_t mrgSortList0; - uint16_t mrgSortList1; - uint16_t mrgSortList2; - uint16_t mrgSortList3; -}; -#endif - -#define ACL_CHECK(expr) \ - do { \ - const aclError _ret = (expr); \ - if (_ret != ACL_SUCCESS) { \ - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", #expr, (int)_ret, __FILE__, __LINE__); \ - const char *_recent = aclGetRecentErrMsg(); \ - if (_recent != nullptr && _recent[0] != '\0') { \ - std::fprintf(stderr, "[ERROR] RecentErrMsg: %s\n", _recent); \ - } \ - rc = 1; \ - goto cleanup; \ - } \ - } while (0) - -void LaunchVldx2_vstsx2_kernel(float *v1, float *v2, void *stream); - -int main() { - size_t elemCount_v1 = 1024; - size_t fileSize_v1 = elemCount_v1 * sizeof(float); - size_t elemCount_v2 = 1024; - size_t fileSize_v2 = elemCount_v2 * sizeof(float); - float *v1Host = nullptr; - float *v1Device = nullptr; - float *v2Host = nullptr; - float *v2Device = nullptr; - - int rc = 0; - bool aclInited = false; - bool deviceSet = false; - int deviceId = 0; - aclrtStream stream = nullptr; - - ACL_CHECK(aclInit(nullptr)); - aclInited = true; - if (const char *envDevice = std::getenv("ACL_DEVICE_ID")) { - deviceId = std::atoi(envDevice); - } - ACL_CHECK(aclrtSetDevice(deviceId)); - deviceSet = true; - ACL_CHECK(aclrtCreateStream(&stream)); - - ACL_CHECK(aclrtMallocHost((void **)(&v1Host), fileSize_v1)); - ACL_CHECK(aclrtMallocHost((void **)(&v2Host), fileSize_v2)); - ACL_CHECK(aclrtMalloc((void **)&v1Device, fileSize_v1, ACL_MEM_MALLOC_HUGE_FIRST)); - ACL_CHECK(aclrtMalloc((void **)&v2Device, fileSize_v2, ACL_MEM_MALLOC_HUGE_FIRST)); - - ReadFile("./v1.bin", fileSize_v1, v1Host, fileSize_v1); - ReadFile("./v2.bin", fileSize_v2, v2Host, fileSize_v2); - ACL_CHECK(aclrtMemcpy(v1Device, fileSize_v1, v1Host, fileSize_v1, ACL_MEMCPY_HOST_TO_DEVICE)); - ACL_CHECK(aclrtMemcpy(v2Device, 
fileSize_v2, v2Host, fileSize_v2, ACL_MEMCPY_HOST_TO_DEVICE)); - LaunchVldx2_vstsx2_kernel(v1Device, v2Device, stream); - - ACL_CHECK(aclrtSynchronizeStream(stream)); - ACL_CHECK(aclrtMemcpy(v2Host, fileSize_v2, v2Device, fileSize_v2, ACL_MEMCPY_DEVICE_TO_HOST)); - - WriteFile("./v2.bin", v2Host, fileSize_v2); - -cleanup: - aclrtFree(v1Device); - aclrtFree(v2Device); - aclrtFreeHost(v1Host); - aclrtFreeHost(v2Host); - if (stream != nullptr) { - const aclError _ret = aclrtDestroyStream(stream); - if (_ret != ACL_SUCCESS) { - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", - "aclrtDestroyStream(stream)", (int)_ret, __FILE__, __LINE__); - } - stream = nullptr; - } - if (deviceSet) { - const aclError _ret = aclrtResetDevice(deviceId); - if (_ret != ACL_SUCCESS) { - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", - "aclrtResetDevice(deviceId)", (int)_ret, __FILE__, __LINE__); - } - } - if (aclInited) { - const aclError _ret = aclFinalize(); - if (_ret != ACL_SUCCESS) { - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", - "aclFinalize()", (int)_ret, __FILE__, __LINE__); - } - } - - return rc; -} diff --git a/test/vpto/cases/micro-op/vector-load-store/vsldb/compare.py b/test/vpto/cases/micro-op/vector-load-store/vsldb/compare.py deleted file mode 100755 index c47755b60..000000000 --- a/test/vpto/cases/micro-op/vector-load-store/vsldb/compare.py +++ /dev/null @@ -1,210 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) 2026 Huawei Technologies Co., Ltd. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. 
-# See LICENSE in the root of the software repository for the full text of the License. - -# case: micro-op/vector-load-store/vsldb -# family: vector-load-store -# target_ops: pto.vsldb -# scenarios: core-f32, full-mask, block-strided-load, block-mask -# NOTE: bulk-generated coverage skeleton. -# coding=utf-8 - -import os -import sys -import numpy as np - - -REPEAT_BYTES = 256 -PREFIX_ELEMS = 64 - - -def _ceil_div(x, y): - return (x + y - 1) // y - - -def _packed_pred_storage_bytes(logical_elems, src_elem_bytes): - logical_elems = int(logical_elems) - src_elem_bytes = int(src_elem_bytes) - if logical_elems <= 0: - raise ValueError(f"logical_elems must be > 0, got {logical_elems}") - if src_elem_bytes not in (1, 2, 4): - raise ValueError(f"unsupported packed predicate source size: {src_elem_bytes}") - - repeat_elems = REPEAT_BYTES // src_elem_bytes - if src_elem_bytes == 4: - repeat_times = _ceil_div(logical_elems, repeat_elems) + 1 - loop_count = repeat_times // 2 - return loop_count * 16 - - repeat_times = _ceil_div(logical_elems, repeat_elems) - return repeat_times * (repeat_elems // 8) - - -def compare_bin(golden_path, output_path, dtype, eps): - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - dtype_np = np.dtype(dtype) - golden = np.fromfile(golden_path, dtype=dtype_np) - output = np.fromfile(output_path, dtype=dtype_np) - if golden.shape != output.shape: - print(f"[ERROR] Shape mismatch: {golden_path} {golden.shape} vs {output_path} {output.shape}") - return False - if not np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True): - if golden.size: - if np.issubdtype(dtype_np, np.floating): - g = golden.astype(np.float64, copy=False) - o = output.astype(np.float64, copy=False) - elif np.issubdtype(dtype_np, np.integer) or np.issubdtype(dtype_np, np.unsignedinteger): - g = golden.astype(np.int64, 
copy=False) - o = output.astype(np.int64, copy=False) - else: - g = golden.astype(np.float64, copy=False) - o = output.astype(np.float64, copy=False) - abs_diff = np.abs(g - o) - idx = int(np.argmax(abs_diff)) - diff = float(abs_diff[idx]) - print( - f"[ERROR] Mismatch: {golden_path} vs {output_path}, max diff={diff} at idx={idx} " - f"(golden={g[idx]}, out={o[idx]}, dtype={dtype_np})" - ) - else: - print(f"[ERROR] Mismatch: {golden_path} vs {output_path}, empty buffers, dtype={dtype_np}") - return False - return True - - -def compare_bin_prefix(golden_path, output_path, dtype, eps, count): - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - try: - count = int(count) - except Exception: - print(f"[ERROR] Invalid prefix count: {count}") - return False - if count <= 0: - print(f"[ERROR] Invalid prefix count: {count}") - return False - - dtype_np = np.dtype(dtype) - golden = np.fromfile(golden_path, dtype=dtype_np, count=count) - output = np.fromfile(output_path, dtype=dtype_np, count=count) - - if golden.size != count or output.size != count: - print( - f"[ERROR] Prefix read too small: need={count} elems, " - f"golden={golden.size}, out={output.size}" - ) - return False - - if not np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True): - if golden.size: - if np.issubdtype(dtype_np, np.floating): - g = golden.astype(np.float64, copy=False) - o = output.astype(np.float64, copy=False) - elif np.issubdtype(dtype_np, np.integer) or np.issubdtype(dtype_np, np.unsignedinteger): - g = golden.astype(np.int64, copy=False) - o = output.astype(np.int64, copy=False) - else: - g = golden.astype(np.float64, copy=False) - o = output.astype(np.float64, copy=False) - abs_diff = np.abs(g - o) - idx = int(np.argmax(abs_diff)) - diff = float(abs_diff[idx]) - print( - f"[ERROR] Mismatch (prefix): {golden_path} vs 
{output_path}, max diff={diff} at idx={idx} " - f"(golden={g[idx]}, out={o[idx]}, dtype={dtype_np}, count={count})" - ) - else: - print(f"[ERROR] Mismatch (prefix): {golden_path} vs {output_path}, empty buffers, dtype={dtype_np}") - return False - return True - - -def compare_packed_pred_mask(golden_path, output_path, logical_elems, src_elem_bytes): - """ - Compare outputs of pto.tcmp / pto.tcmps. - - PTO-ISA stores packed predicate results as a linear PK byte stream via - `psts`, with the exact written prefix length determined by the typed - TCMP/TCMPS repeat schedule. Compare only that semantic prefix. - """ - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - try: - logical_elems = int(logical_elems) - src_elem_bytes = int(src_elem_bytes) - except Exception: - print( - "[ERROR] Invalid packed mask compare arguments: " - f"logical_elems={logical_elems} src_elem_bytes={src_elem_bytes}" - ) - return False - if logical_elems <= 0 or src_elem_bytes <= 0: - print( - "[ERROR] Invalid packed mask compare arguments: " - f"logical_elems={logical_elems} src_elem_bytes={src_elem_bytes}" - ) - return False - - golden = np.fromfile(golden_path, dtype=np.uint8) - output = np.fromfile(output_path, dtype=np.uint8) - try: - prefix_bytes = _packed_pred_storage_bytes(logical_elems, src_elem_bytes) - except ValueError as exc: - print(f"[ERROR] {exc}") - return False - - if golden.size < prefix_bytes or output.size < prefix_bytes: - print( - f"[ERROR] Packed mask buffer too small: need={prefix_bytes} bytes, " - f"golden={golden.size}, out={output.size}" - ) - return False - - golden_sel = golden[:prefix_bytes] - output_sel = output[:prefix_bytes] - - if not np.array_equal(golden_sel, output_sel): - diff = np.nonzero(golden_sel != output_sel)[0] - idx = int(diff[0]) if diff.size else 0 - print( - f"[ERROR] Mismatch (packed mask): 
{golden_path} vs {output_path}, first diff at idx={idx} " - f"(golden={int(golden_sel[idx])}, out={int(output_sel[idx])}, " - f"logical_elems={logical_elems}, src_elem_bytes={src_elem_bytes}, prefix_bytes={prefix_bytes})" - ) - return False - return True - - -def main(): - strict = os.getenv("COMPARE_STRICT", "1") != "0" - ok = True - ok = compare_bin_prefix("golden_v2.bin", "v2.bin", np.float32, 0.0001, PREFIX_ELEMS) and ok - if not ok: - if strict: - print("[ERROR] compare failed") - sys.exit(2) - print("[WARN] compare failed (non-gating)") - return - print("[INFO] compare passed") - - -if __name__ == "__main__": - main() diff --git a/test/vpto/cases/micro-op/vector-load-store/vsldb/golden.py b/test/vpto/cases/micro-op/vector-load-store/vsldb/golden.py deleted file mode 100755 index 3d8dad137..000000000 --- a/test/vpto/cases/micro-op/vector-load-store/vsldb/golden.py +++ /dev/null @@ -1,62 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) 2026 Huawei Technologies Co., Ltd. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. - -# case: micro-op/vector-load-store/vsldb -# family: vector-load-store -# target_ops: pto.vsldb -# scenarios: core-f32, full-mask, block-strided-load, block-mask -# NOTE: bulk-generated coverage skeleton. 
-# coding=utf-8 - -import argparse -from pathlib import Path - -import numpy as np - - -ROWS = 32 -COLS = 32 -SEED = 19 -BLOCK_STRIDE = 2 -REPEAT_STRIDE = 4 -BLOCK_ELEMS = 8 -BLOCK_COUNT = 8 - - -def generate(output_dir: Path, seed: int) -> None: - rng = np.random.default_rng(seed) - v1 = rng.uniform(-8.0, 8.0, size=(ROWS, COLS)).astype(np.float32) - v2 = np.zeros((ROWS, COLS), dtype=np.float32) - golden_v2 = np.zeros((ROWS, COLS), dtype=np.float32) - flat_in = v1.reshape(-1) - flat_golden = golden_v2.reshape(-1) - for blk in range(BLOCK_COUNT): - src_blk = REPEAT_STRIDE + blk * BLOCK_STRIDE - flat_golden[blk * BLOCK_ELEMS:(blk + 1) * BLOCK_ELEMS] = flat_in[ - src_blk * BLOCK_ELEMS:(src_blk + 1) * BLOCK_ELEMS - ] - - output_dir.mkdir(parents=True, exist_ok=True) - v1.reshape(-1).tofile(output_dir / "v1.bin") - v2.reshape(-1).tofile(output_dir / "v2.bin") - golden_v2.reshape(-1).tofile(output_dir / "golden_v2.bin") - - -def main() -> None: - parser = argparse.ArgumentParser( - description="Generate numpy-based inputs/golden for VPTO micro-op vsldb validation." 
- ) - parser.add_argument("--output-dir", type=Path, default=Path(".")) - parser.add_argument("--seed", type=int, default=SEED) - args = parser.parse_args() - generate(args.output_dir, args.seed) - - -if __name__ == "__main__": - main() diff --git a/test/vpto/cases/micro-op/vector-load-store/vsldb/kernel.pto b/test/vpto/cases/micro-op/vector-load-store/vsldb/kernel.pto deleted file mode 100644 index 839f56139..000000000 --- a/test/vpto/cases/micro-op/vector-load-store/vsldb/kernel.pto +++ /dev/null @@ -1,47 +0,0 @@ -// ----------------------------------------------------------------------------- -// case: micro-op/vector-load-store/vsldb -// family: vector-load-store -// target_ops: pto.vsldb -// scenarios: core-f32, full-mask, block-strided-load, block-mask -// ----------------------------------------------------------------------------- -module attributes {pto.target_arch = "a5", pto.kernel_kind = #pto.kernel_kind} { - func.func @vsldb_kernel_2d(%arg0: !pto.ptr, %arg1: !pto.ptr) attributes {pto.kernel} { - %c0 = arith.constant 0 : index - %c1_i64 = arith.constant 1 : i64 - %c0_i64 = arith.constant 0 : i64 - %c2_i16 = arith.constant 2 : i16 - %c4_i16 = arith.constant 4 : i16 - %c32_i64 = arith.constant 32 : i64 - %c64_i32 = arith.constant 64 : i32 - %c128_i64 = arith.constant 128 : i64 - %c4096_i64 = arith.constant 4096 : i64 - - %ub_in = pto.castptr %c0_i64 : i64 -> !pto.ptr - %ub_out = pto.castptr %c4096_i64 : i64 -> !pto.ptr - - %false = arith.constant false - pto.mte_gm_ub %arg0, %ub_in, %c0_i64, %c128_i64 - nburst(%c32_i64, %c128_i64, %c128_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 - - pto.set_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] - pto.wait_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] - - %c1 = arith.constant 1 : index - pto.vecscope { - scf.for %iv = %c0 to %c1 step %c1 { - %mask, %next_remaining = pto.plt_b32 %c64_i32 : i32 -> !pto.mask, i32 - %loaded = pto.vsldb %ub_in, %c2_i16, %c4_i16, %mask : !pto.ptr, i16, i16, !pto.mask -> !pto.vreg<64xf32> 
- pto.vsts %loaded, %ub_out[%c0], %mask : !pto.vreg<64xf32>, !pto.ptr, !pto.mask - } - } - - pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.mte_ub_gm %ub_out, %arg1, %c128_i64 - nburst(%c32_i64, %c128_i64, %c128_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64 - pto.barrier #pto.pipe - return - } -} diff --git a/test/vpto/cases/micro-op/vector-load-store/vsldb/launch.cpp b/test/vpto/cases/micro-op/vector-load-store/vsldb/launch.cpp deleted file mode 100644 index fe71cc2b6..000000000 --- a/test/vpto/cases/micro-op/vector-load-store/vsldb/launch.cpp +++ /dev/null @@ -1,70 +0,0 @@ -// Copyright (c) 2026 Huawei Technologies Co., Ltd. -// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). -// Please refer to the License for details. You may not use this file except in compliance with the License. -// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -// See LICENSE in the root of the software repository for the full text of the License. - -// ----------------------------------------------------------------------------- -// case: micro-op/vector-load-store/vsldb -// family: vector-load-store -// target_ops: pto.vsldb -// scenarios: core-f32, full-mask, block-strided-load, block-mask -// NOTE: bulk-generated coverage skeleton. Parser/verifier/lowering failure is -// still a valid test conclusion in the current coverage-first phase. 
-// ----------------------------------------------------------------------------- -// --------------------------------------------------------------------------- -// PTOAS compatibility layer -// -// The upstream pto-isa headers reference some FP8/FP4 types and the -// __VEC_SCOPE__ marker that are not available on every AICore arch/toolchain -// combination (e.g. __NPU_ARCH__==2201). -// -// For our PTOAS-generated kernels we don't rely on these types today, but the -// headers still mention them in templates/static_asserts. Provide minimal -// fallbacks to keep compilation working on dav-c220. -// --------------------------------------------------------------------------- -#ifndef __VEC_SCOPE__ -#define __VEC_SCOPE__ -#endif - -#if defined(__CCE_AICORE__) && defined(__NPU_ARCH__) && (__NPU_ARCH__ == 2201) -typedef struct { unsigned char v; } hifloat8_t; -typedef struct { unsigned char v; } float8_e4m3_t; -typedef struct { unsigned char v; } float8_e5m2_t; -typedef struct { unsigned char v; } float8_e8m0_t; -typedef struct { unsigned char v; } float4_e1m2x2_t; -typedef struct { unsigned char v; } float4_e2m1x2_t; -#endif -#include - -// AICore printf support is gated behind `--cce-enable-print` on some -// toolchains. When enabled, include the CCE print header so `cce::printf` -// resolves in device compilation. -#if defined(__CCE_AICORE__) && defined(PTOAS_ENABLE_CCE_PRINT) -#include -#endif - -// Some PTO-ISA types are only available in the __CCE_AICORE__ compilation -// path, but `bisheng -xcce` still performs a host-side parse pass. -// Provide minimal fallbacks only when the corresponding header wasn't -// pulled in by the selected arch implementation. 
-#if !defined(__CCE_AICORE__) && !defined(TMRGSORT_HPP) -struct MrgSortExecutedNumList { - uint16_t mrgSortList0; - uint16_t mrgSortList1; - uint16_t mrgSortList2; - uint16_t mrgSortList3; -}; -#endif -#ifndef __CPU_SIM -#include "acl/acl.h" -#endif - -extern "C" __global__ [aicore] void vsldb_kernel_2d(__gm__ float *v1, - __gm__ float *v2); - -void LaunchVabs_kernel_2d(float *v1, float *v2, void *stream) { - vsldb_kernel_2d<<<1, nullptr, stream>>>((__gm__ float *)v1, (__gm__ float *)v2); -} diff --git a/test/vpto/cases/micro-op/vector-load-store/vsldb/main.cpp b/test/vpto/cases/micro-op/vector-load-store/vsldb/main.cpp deleted file mode 100644 index f0b21ff83..000000000 --- a/test/vpto/cases/micro-op/vector-load-store/vsldb/main.cpp +++ /dev/null @@ -1,130 +0,0 @@ -// Copyright (c) 2026 Huawei Technologies Co., Ltd. -// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). -// Please refer to the License for details. You may not use this file except in compliance with the License. -// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -// See LICENSE in the root of the software repository for the full text of the License. - -// ----------------------------------------------------------------------------- -// case: micro-op/vector-load-store/vsldb -// family: vector-load-store -// target_ops: pto.vsldb -// scenarios: core-f32, full-mask, block-strided-load, block-mask -// NOTE: bulk-generated coverage skeleton. Parser/verifier/lowering failure is -// still a valid test conclusion in the current coverage-first phase. -// ----------------------------------------------------------------------------- -/** -Copyright (c) 2025 Huawei Technologies Co., Ltd. 
-This program is free software, you can redistribute it and/or modify it under the terms and conditions of -CANN Open Software License Agreement Version 2.0 (the "License"). -Please refer to the License for details. You may not use this file except in compliance with the License. -THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -See LICENSE in the root of the software repository for the full text of the License. -*/ - -#include "test_common.h" -#include "acl/acl.h" -#include -#include -#include - -using namespace PtoTestCommon; - -#ifndef TMRGSORT_HPP -struct MrgSortExecutedNumList { - uint16_t mrgSortList0; - uint16_t mrgSortList1; - uint16_t mrgSortList2; - uint16_t mrgSortList3; -}; -#endif - -#define ACL_CHECK(expr) \ - do { \ - const aclError _ret = (expr); \ - if (_ret != ACL_SUCCESS) { \ - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", #expr, (int)_ret, __FILE__, __LINE__); \ - const char *_recent = aclGetRecentErrMsg(); \ - if (_recent != nullptr && _recent[0] != '\0') { \ - std::fprintf(stderr, "[ERROR] RecentErrMsg: %s\n", _recent); \ - } \ - rc = 1; \ - goto cleanup; \ - } \ - } while (0) - -void LaunchVabs_kernel_2d(float *v1, float *v2, void *stream); - -int main() { - size_t elemCount_v1 = 1024; - size_t fileSize_v1 = elemCount_v1 * sizeof(float); - size_t elemCount_v2 = 1024; - size_t fileSize_v2 = elemCount_v2 * sizeof(float); - float *v1Host = nullptr; - float *v1Device = nullptr; - float *v2Host = nullptr; - float *v2Device = nullptr; - - int rc = 0; - bool aclInited = false; - bool deviceSet = false; - int deviceId = 0; - aclrtStream stream = nullptr; - - ACL_CHECK(aclInit(nullptr)); - aclInited = true; - if (const char *envDevice = std::getenv("ACL_DEVICE_ID")) { - deviceId = std::atoi(envDevice); - } - ACL_CHECK(aclrtSetDevice(deviceId)); - deviceSet = true; - 
ACL_CHECK(aclrtCreateStream(&stream)); - - ACL_CHECK(aclrtMallocHost((void **)(&v1Host), fileSize_v1)); - ACL_CHECK(aclrtMallocHost((void **)(&v2Host), fileSize_v2)); - ACL_CHECK(aclrtMalloc((void **)&v1Device, fileSize_v1, ACL_MEM_MALLOC_HUGE_FIRST)); - ACL_CHECK(aclrtMalloc((void **)&v2Device, fileSize_v2, ACL_MEM_MALLOC_HUGE_FIRST)); - - ReadFile("./v1.bin", fileSize_v1, v1Host, fileSize_v1); - ReadFile("./v2.bin", fileSize_v2, v2Host, fileSize_v2); - ACL_CHECK(aclrtMemcpy(v1Device, fileSize_v1, v1Host, fileSize_v1, ACL_MEMCPY_HOST_TO_DEVICE)); - ACL_CHECK(aclrtMemcpy(v2Device, fileSize_v2, v2Host, fileSize_v2, ACL_MEMCPY_HOST_TO_DEVICE)); - LaunchVabs_kernel_2d(v1Device, v2Device, stream); - - ACL_CHECK(aclrtSynchronizeStream(stream)); - ACL_CHECK(aclrtMemcpy(v2Host, fileSize_v2, v2Device, fileSize_v2, ACL_MEMCPY_DEVICE_TO_HOST)); - - WriteFile("./v2.bin", v2Host, fileSize_v2); - -cleanup: - aclrtFree(v1Device); - aclrtFree(v2Device); - aclrtFreeHost(v1Host); - aclrtFreeHost(v2Host); - if (stream != nullptr) { - const aclError _ret = aclrtDestroyStream(stream); - if (_ret != ACL_SUCCESS) { - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", - "aclrtDestroyStream(stream)", (int)_ret, __FILE__, __LINE__); - } - stream = nullptr; - } - if (deviceSet) { - const aclError _ret = aclrtResetDevice(deviceId); - if (_ret != ACL_SUCCESS) { - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", - "aclrtResetDevice(deviceId)", (int)_ret, __FILE__, __LINE__); - } - } - if (aclInited) { - const aclError _ret = aclFinalize(); - if (_ret != ACL_SUCCESS) { - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", - "aclFinalize()", (int)_ret, __FILE__, __LINE__); - } - } - - return rc; -} diff --git a/test/vpto/cases/micro-op/vector-load-store/vsstb-post-update/compare.py b/test/vpto/cases/micro-op/vector-load-store/vsstb-post-update/compare.py deleted file mode 100644 index f0e4cc2b9..000000000 --- 
a/test/vpto/cases/micro-op/vector-load-store/vsstb-post-update/compare.py +++ /dev/null @@ -1,47 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) 2026 Huawei Technologies Co., Ltd. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. - -import os -import sys - -import numpy as np - - -def main() -> None: - golden_path = "golden_output.bin" - output_path = "output.bin" - strict = os.getenv("COMPARE_STRICT", "1") != "0" - - if not os.path.exists(golden_path) or not os.path.exists(output_path): - print("[ERROR] missing golden_output.bin or output.bin") - sys.exit(2 if strict else 0) - - golden = np.fromfile(golden_path, dtype=np.float32) - output = np.fromfile(output_path, dtype=np.float32) - ok = golden.shape == output.shape and np.allclose( - golden, output, atol=0.0001, rtol=0.0001, equal_nan=True - ) - if not ok: - if golden.shape != output.shape: - print(f"[ERROR] shape mismatch: {golden.shape} vs {output.shape}") - elif golden.size: - diff = np.abs(golden.astype(np.float64) - output.astype(np.float64)) - idx = int(np.argmax(diff)) - print( - f"[ERROR] mismatch at idx={idx}: golden={golden[idx]} " - f"output={output[idx]} diff={diff[idx]}" - ) - if strict: - sys.exit(2) - return - print("[INFO] compare passed") - - -if __name__ == "__main__": - main() diff --git a/test/vpto/cases/micro-op/vector-load-store/vsstb-post-update/golden.py b/test/vpto/cases/micro-op/vector-load-store/vsstb-post-update/golden.py deleted file mode 100644 index 
5a6384a09..000000000 --- a/test/vpto/cases/micro-op/vector-load-store/vsstb-post-update/golden.py +++ /dev/null @@ -1,45 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) 2026 Huawei Technologies Co., Ltd. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. - -import argparse -from pathlib import Path - -import numpy as np - - -ELEMENTS = 1024 -SEED = 19 - - -def generate(output_dir: Path, seed: int) -> None: - rng = np.random.default_rng(seed) - data = rng.uniform(-8.0, 8.0, size=(ELEMENTS,)).astype(np.float32) - output = np.zeros((ELEMENTS,), dtype=np.float32) - golden = np.array(data, copy=True) - golden[:32] = data[32:64] - golden[32:64] = 0.0 - - output_dir.mkdir(parents=True, exist_ok=True) - data.tofile(output_dir / "input.bin") - output.tofile(output_dir / "output.bin") - golden.tofile(output_dir / "golden_output.bin") - - -def main() -> None: - parser = argparse.ArgumentParser( - description="Generate inputs/golden for VPTO vsstb post-update validation." 
- ) - parser.add_argument("--output-dir", type=Path, default=Path(".")) - parser.add_argument("--seed", type=int, default=SEED) - args = parser.parse_args() - generate(args.output_dir, args.seed) - - -if __name__ == "__main__": - main() diff --git a/test/vpto/cases/micro-op/vector-load-store/vsstb-post-update/kernel.pto b/test/vpto/cases/micro-op/vector-load-store/vsstb-post-update/kernel.pto deleted file mode 100644 index 5bf887da1..000000000 --- a/test/vpto/cases/micro-op/vector-load-store/vsstb-post-update/kernel.pto +++ /dev/null @@ -1,52 +0,0 @@ -// Copyright (c) 2026 Huawei Technologies Co., Ltd. -// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). -// Please refer to the License for details. You may not use this file except in compliance with the License. -// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -// See LICENSE in the root of the software repository for the full text of the License. 
- -// case: micro-op/vector-load-store/vsstb-post-update -// family: vector-load-store -// target_ops: pto.vsstb -// scenarios: core-f32, full-mask, block-strided-store, post-update-result - -module attributes {pto.target_arch = "a5", pto.kernel_kind = #pto.kernel_kind} { - func.func @vsstb_post_update_kernel(%input: !pto.ptr, %output: !pto.ptr) attributes {pto.kernel} { - %c0 = arith.constant 0 : index - %c0_i64 = arith.constant 0 : i64 - %c2_i16 = arith.constant 2 : i16 - %c4_i16 = arith.constant 4 : i16 - %c32_i64 = arith.constant 32 : i64 - %c64_i32 = arith.constant 64 : i32 - %c128_i64 = arith.constant 128 : i64 - %c4096_i64 = arith.constant 4096 : i64 - - %ub_in = pto.castptr %c0_i64 : i64 -> !pto.ptr - %ub_out = pto.castptr %c4096_i64 : i64 -> !pto.ptr - - pto.mte_gm_ub %input, %ub_in, %c0_i64, %c128_i64 - nburst(%c32_i64, %c128_i64, %c128_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 - - pto.set_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] - pto.wait_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] - - pto.vecscope { - %mask, %next_remaining = pto.plt_b32 %c64_i32 : i32 -> !pto.mask, i32 - %value = pto.vlds %ub_in[%c0] : !pto.ptr -> !pto.vreg<64xf32> - %updated = pto.vsstb %value, %ub_out, %c2_i16, %c4_i16, %mask : !pto.vreg<64xf32>, !pto.ptr, i16, i16, !pto.mask -> !pto.ptr - pto.mem_bar "VST_VLD" - %roundtrip = pto.vsldb %updated, %c2_i16, %c4_i16, %mask : !pto.ptr, i16, i16, !pto.mask -> !pto.vreg<64xf32> - pto.vsts %roundtrip, %ub_in[%c0], %mask : !pto.vreg<64xf32>, !pto.ptr, !pto.mask - } - - pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.mte_ub_gm %ub_in, %output, %c128_i64 - nburst(%c32_i64, %c128_i64, %c128_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64 - pto.barrier #pto.pipe - return - } -} diff --git a/test/vpto/cases/micro-op/vector-load-store/vsstb-post-update/launch.cpp b/test/vpto/cases/micro-op/vector-load-store/vsstb-post-update/launch.cpp deleted file mode 100644 index 
d3808f3d2..000000000 --- a/test/vpto/cases/micro-op/vector-load-store/vsstb-post-update/launch.cpp +++ /dev/null @@ -1,43 +0,0 @@ -// Copyright (c) 2026 Huawei Technologies Co., Ltd. -// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). -// Please refer to the License for details. You may not use this file except in compliance with the License. -// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -// See LICENSE in the root of the software repository for the full text of the License. - -#ifndef __VEC_SCOPE__ -#define __VEC_SCOPE__ -#endif - -#if defined(__CCE_AICORE__) && defined(__NPU_ARCH__) && (__NPU_ARCH__ == 2201) -typedef struct { unsigned char v; } hifloat8_t; -typedef struct { unsigned char v; } float8_e4m3_t; -typedef struct { unsigned char v; } float8_e5m2_t; -typedef struct { unsigned char v; } float8_e8m0_t; -typedef struct { unsigned char v; } float4_e1m2x2_t; -typedef struct { unsigned char v; } float4_e2m1x2_t; -#endif - -#include - -#if !defined(__CCE_AICORE__) && !defined(TMRGSORT_HPP) -struct MrgSortExecutedNumList { - uint16_t mrgSortList0; - uint16_t mrgSortList1; - uint16_t mrgSortList2; - uint16_t mrgSortList3; -}; -#endif - -#ifndef __CPU_SIM -#include "acl/acl.h" -#endif - -extern "C" __global__ [aicore] void -vsstb_post_update_kernel(__gm__ float *input, __gm__ float *output); - -void LaunchVsstbPostUpdate(float *input, float *output, void *stream) { - vsstb_post_update_kernel<<<1, nullptr, stream>>>((__gm__ float *)input, - (__gm__ float *)output); -} diff --git a/test/vpto/cases/micro-op/vector-load-store/vsstb-post-update/main.cpp b/test/vpto/cases/micro-op/vector-load-store/vsstb-post-update/main.cpp deleted file mode 100644 index ab64837a5..000000000 
--- a/test/vpto/cases/micro-op/vector-load-store/vsstb-post-update/main.cpp +++ /dev/null @@ -1,103 +0,0 @@ -// Copyright (c) 2026 Huawei Technologies Co., Ltd. -// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). -// Please refer to the License for details. You may not use this file except in compliance with the License. -// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -// See LICENSE in the root of the software repository for the full text of the License. - -#include "acl/acl.h" -#include "test_common.h" - -#include -#include - -using namespace PtoTestCommon; - -void LaunchVsstbPostUpdate(float *input, float *output, void *stream); - -namespace { -constexpr size_t kElementCount = 1024; -constexpr size_t kBufferSize = kElementCount * sizeof(float); -} - -#define ACL_CHECK(expr) \ - do { \ - const aclError ret = (expr); \ - if (ret != ACL_SUCCESS) { \ - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", #expr, \ - static_cast(ret), __FILE__, __LINE__); \ - const char *recent = aclGetRecentErrMsg(); \ - if (recent != nullptr && recent[0] != '\0') \ - std::fprintf(stderr, "[ERROR] RecentErrMsg: %s\n", recent); \ - rc = 1; \ - goto cleanup; \ - } \ - } while (0) - -int main() { - float *inputHost = nullptr; - float *outputHost = nullptr; - float *inputDevice = nullptr; - float *outputDevice = nullptr; - aclrtStream stream = nullptr; - int rc = 0; - bool aclInited = false; - bool deviceSet = false; - int deviceId = 0; - size_t inputSize = kBufferSize; - size_t outputSize = kBufferSize; - - ACL_CHECK(aclInit(nullptr)); - aclInited = true; - if (const char *envDevice = std::getenv("ACL_DEVICE_ID")) - deviceId = std::atoi(envDevice); - ACL_CHECK(aclrtSetDevice(deviceId)); - 
deviceSet = true; - ACL_CHECK(aclrtCreateStream(&stream)); - - ACL_CHECK(aclrtMallocHost(reinterpret_cast(&inputHost), kBufferSize)); - ACL_CHECK(aclrtMallocHost(reinterpret_cast(&outputHost), kBufferSize)); - ACL_CHECK(aclrtMalloc(reinterpret_cast(&inputDevice), kBufferSize, - ACL_MEM_MALLOC_HUGE_FIRST)); - ACL_CHECK(aclrtMalloc(reinterpret_cast(&outputDevice), kBufferSize, - ACL_MEM_MALLOC_HUGE_FIRST)); - - ReadFile("./input.bin", inputSize, inputHost, kBufferSize); - ReadFile("./output.bin", outputSize, outputHost, kBufferSize); - ACL_CHECK(aclrtMemcpy(inputDevice, kBufferSize, inputHost, kBufferSize, - ACL_MEMCPY_HOST_TO_DEVICE)); - ACL_CHECK(aclrtMemcpy(outputDevice, kBufferSize, outputHost, kBufferSize, - ACL_MEMCPY_HOST_TO_DEVICE)); - - LaunchVsstbPostUpdate(inputDevice, outputDevice, stream); - ACL_CHECK(aclrtSynchronizeStream(stream)); - ACL_CHECK(aclrtMemcpy(outputHost, kBufferSize, outputDevice, kBufferSize, - ACL_MEMCPY_DEVICE_TO_HOST)); - WriteFile("./output.bin", outputHost, kBufferSize); - -cleanup: - aclrtFree(inputDevice); - aclrtFree(outputDevice); - aclrtFreeHost(inputHost); - aclrtFreeHost(outputHost); - if (stream != nullptr) { - const aclError ret = aclrtDestroyStream(stream); - if (ret != ACL_SUCCESS) - std::fprintf(stderr, "[ERROR] aclrtDestroyStream failed: %d\n", - static_cast(ret)); - } - if (deviceSet) { - const aclError ret = aclrtResetDevice(deviceId); - if (ret != ACL_SUCCESS) - std::fprintf(stderr, "[ERROR] aclrtResetDevice failed: %d\n", - static_cast(ret)); - } - if (aclInited) { - const aclError ret = aclFinalize(); - if (ret != ACL_SUCCESS) - std::fprintf(stderr, "[ERROR] aclFinalize failed: %d\n", - static_cast(ret)); - } - return rc; -} diff --git a/test/vpto/cases/micro-op/vector-load-store/vsstb/compare.py b/test/vpto/cases/micro-op/vector-load-store/vsstb/compare.py deleted file mode 100755 index cffa8ea8b..000000000 --- a/test/vpto/cases/micro-op/vector-load-store/vsstb/compare.py +++ /dev/null @@ -1,209 +0,0 @@ 
-#!/usr/bin/env python3 -# Copyright (c) 2026 Huawei Technologies Co., Ltd. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. - -# case: micro-op/vector-load-store/vsstb -# family: vector-load-store -# target_ops: pto.vsstb -# scenarios: core-f32, full-mask, block-strided-store, block-mask -# NOTE: bulk-generated coverage skeleton. -# coding=utf-8 - -import os -import sys -import numpy as np - - -REPEAT_BYTES = 256 - - -def _ceil_div(x, y): - return (x + y - 1) // y - - -def _packed_pred_storage_bytes(logical_elems, src_elem_bytes): - logical_elems = int(logical_elems) - src_elem_bytes = int(src_elem_bytes) - if logical_elems <= 0: - raise ValueError(f"logical_elems must be > 0, got {logical_elems}") - if src_elem_bytes not in (1, 2, 4): - raise ValueError(f"unsupported packed predicate source size: {src_elem_bytes}") - - repeat_elems = REPEAT_BYTES // src_elem_bytes - if src_elem_bytes == 4: - repeat_times = _ceil_div(logical_elems, repeat_elems) + 1 - loop_count = repeat_times // 2 - return loop_count * 16 - - repeat_times = _ceil_div(logical_elems, repeat_elems) - return repeat_times * (repeat_elems // 8) - - -def compare_bin(golden_path, output_path, dtype, eps): - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - dtype_np = np.dtype(dtype) - golden = np.fromfile(golden_path, 
dtype=dtype_np) - output = np.fromfile(output_path, dtype=dtype_np) - if golden.shape != output.shape: - print(f"[ERROR] Shape mismatch: {golden_path} {golden.shape} vs {output_path} {output.shape}") - return False - if not np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True): - if golden.size: - if np.issubdtype(dtype_np, np.floating): - g = golden.astype(np.float64, copy=False) - o = output.astype(np.float64, copy=False) - elif np.issubdtype(dtype_np, np.integer) or np.issubdtype(dtype_np, np.unsignedinteger): - g = golden.astype(np.int64, copy=False) - o = output.astype(np.int64, copy=False) - else: - g = golden.astype(np.float64, copy=False) - o = output.astype(np.float64, copy=False) - abs_diff = np.abs(g - o) - idx = int(np.argmax(abs_diff)) - diff = float(abs_diff[idx]) - print( - f"[ERROR] Mismatch: {golden_path} vs {output_path}, max diff={diff} at idx={idx} " - f"(golden={g[idx]}, out={o[idx]}, dtype={dtype_np})" - ) - else: - print(f"[ERROR] Mismatch: {golden_path} vs {output_path}, empty buffers, dtype={dtype_np}") - return False - return True - - -def compare_bin_prefix(golden_path, output_path, dtype, eps, count): - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - try: - count = int(count) - except Exception: - print(f"[ERROR] Invalid prefix count: {count}") - return False - if count <= 0: - print(f"[ERROR] Invalid prefix count: {count}") - return False - - dtype_np = np.dtype(dtype) - golden = np.fromfile(golden_path, dtype=dtype_np, count=count) - output = np.fromfile(output_path, dtype=dtype_np, count=count) - - if golden.size != count or output.size != count: - print( - f"[ERROR] Prefix read too small: need={count} elems, " - f"golden={golden.size}, out={output.size}" - ) - return False - - if not np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True): - if golden.size: - 
if np.issubdtype(dtype_np, np.floating): - g = golden.astype(np.float64, copy=False) - o = output.astype(np.float64, copy=False) - elif np.issubdtype(dtype_np, np.integer) or np.issubdtype(dtype_np, np.unsignedinteger): - g = golden.astype(np.int64, copy=False) - o = output.astype(np.int64, copy=False) - else: - g = golden.astype(np.float64, copy=False) - o = output.astype(np.float64, copy=False) - abs_diff = np.abs(g - o) - idx = int(np.argmax(abs_diff)) - diff = float(abs_diff[idx]) - print( - f"[ERROR] Mismatch (prefix): {golden_path} vs {output_path}, max diff={diff} at idx={idx} " - f"(golden={g[idx]}, out={o[idx]}, dtype={dtype_np}, count={count})" - ) - else: - print(f"[ERROR] Mismatch (prefix): {golden_path} vs {output_path}, empty buffers, dtype={dtype_np}") - return False - return True - - -def compare_packed_pred_mask(golden_path, output_path, logical_elems, src_elem_bytes): - """ - Compare outputs of pto.tcmp / pto.tcmps. - - PTO-ISA stores packed predicate results as a linear PK byte stream via - `psts`, with the exact written prefix length determined by the typed - TCMP/TCMPS repeat schedule. Compare only that semantic prefix. 
- """ - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - try: - logical_elems = int(logical_elems) - src_elem_bytes = int(src_elem_bytes) - except Exception: - print( - "[ERROR] Invalid packed mask compare arguments: " - f"logical_elems={logical_elems} src_elem_bytes={src_elem_bytes}" - ) - return False - if logical_elems <= 0 or src_elem_bytes <= 0: - print( - "[ERROR] Invalid packed mask compare arguments: " - f"logical_elems={logical_elems} src_elem_bytes={src_elem_bytes}" - ) - return False - - golden = np.fromfile(golden_path, dtype=np.uint8) - output = np.fromfile(output_path, dtype=np.uint8) - try: - prefix_bytes = _packed_pred_storage_bytes(logical_elems, src_elem_bytes) - except ValueError as exc: - print(f"[ERROR] {exc}") - return False - - if golden.size < prefix_bytes or output.size < prefix_bytes: - print( - f"[ERROR] Packed mask buffer too small: need={prefix_bytes} bytes, " - f"golden={golden.size}, out={output.size}" - ) - return False - - golden_sel = golden[:prefix_bytes] - output_sel = output[:prefix_bytes] - - if not np.array_equal(golden_sel, output_sel): - diff = np.nonzero(golden_sel != output_sel)[0] - idx = int(diff[0]) if diff.size else 0 - print( - f"[ERROR] Mismatch (packed mask): {golden_path} vs {output_path}, first diff at idx={idx} " - f"(golden={int(golden_sel[idx])}, out={int(output_sel[idx])}, " - f"logical_elems={logical_elems}, src_elem_bytes={src_elem_bytes}, prefix_bytes={prefix_bytes})" - ) - return False - return True - - -def main(): - strict = os.getenv("COMPARE_STRICT", "1") != "0" - ok = True - ok = compare_bin("golden_v2.bin", "v2.bin", np.float32, 0.0001) and ok - if not ok: - if strict: - print("[ERROR] compare failed") - sys.exit(2) - print("[WARN] compare failed (non-gating)") - return - print("[INFO] compare passed") - - -if __name__ == "__main__": - main() 
diff --git a/test/vpto/cases/micro-op/vector-load-store/vsstb/golden.py b/test/vpto/cases/micro-op/vector-load-store/vsstb/golden.py deleted file mode 100755 index 033a8b030..000000000 --- a/test/vpto/cases/micro-op/vector-load-store/vsstb/golden.py +++ /dev/null @@ -1,55 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) 2026 Huawei Technologies Co., Ltd. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. - -# case: micro-op/vector-load-store/vsstb -# family: vector-load-store -# target_ops: pto.vsstb -# scenarios: core-f32, full-mask, block-strided-store, block-mask -# NOTE: bulk-generated coverage skeleton. -# coding=utf-8 - -import argparse -from pathlib import Path - -import numpy as np - - -ROWS = 32 -COLS = 32 -SEED = 19 -BLOCK_STRIDE = 2 -REPEAT_STRIDE = 4 -BLOCK_ELEMS = 8 -BLOCK_COUNT = 8 - - -def generate(output_dir: Path, seed: int) -> None: - rng = np.random.default_rng(seed) - v1 = rng.uniform(-8.0, 8.0, size=(ROWS, COLS)).astype(np.float32) - v2 = np.zeros((ROWS, COLS), dtype=np.float32) - golden_v2 = np.array(v1, copy=True) - - output_dir.mkdir(parents=True, exist_ok=True) - v1.reshape(-1).tofile(output_dir / "v1.bin") - v2.reshape(-1).tofile(output_dir / "v2.bin") - golden_v2.reshape(-1).tofile(output_dir / "golden_v2.bin") - - -def main() -> None: - parser = argparse.ArgumentParser( - description="Generate numpy-based inputs/golden for VPTO micro-op vsstb validation." 
- ) - parser.add_argument("--output-dir", type=Path, default=Path(".")) - parser.add_argument("--seed", type=int, default=SEED) - args = parser.parse_args() - generate(args.output_dir, args.seed) - - -if __name__ == "__main__": - main() diff --git a/test/vpto/cases/micro-op/vector-load-store/vsstb/kernel.pto b/test/vpto/cases/micro-op/vector-load-store/vsstb/kernel.pto deleted file mode 100644 index df23d2b44..000000000 --- a/test/vpto/cases/micro-op/vector-load-store/vsstb/kernel.pto +++ /dev/null @@ -1,50 +0,0 @@ -// ----------------------------------------------------------------------------- -// case: micro-op/vector-load-store/vsstb -// family: vector-load-store -// target_ops: pto.vsstb -// scenarios: core-f32, full-mask, block-strided-store, block-mask -// ----------------------------------------------------------------------------- -module attributes {pto.target_arch = "a5", pto.kernel_kind = #pto.kernel_kind} { - func.func @vsstb_kernel_2d(%arg0: !pto.ptr, %arg1: !pto.ptr) attributes {pto.kernel} { - %c0 = arith.constant 0 : index - %c1_i64 = arith.constant 1 : i64 - %c0_i64 = arith.constant 0 : i64 - %c2_i16 = arith.constant 2 : i16 - %c4_i16 = arith.constant 4 : i16 - %c32_i64 = arith.constant 32 : i64 - %c64_i32 = arith.constant 64 : i32 - %c128_i64 = arith.constant 128 : i64 - %c4096_i64 = arith.constant 4096 : i64 - - %ub_in = pto.castptr %c0_i64 : i64 -> !pto.ptr - %ub_out = pto.castptr %c4096_i64 : i64 -> !pto.ptr - - %false = arith.constant false - pto.mte_gm_ub %arg0, %ub_in, %c0_i64, %c128_i64 - nburst(%c32_i64, %c128_i64, %c128_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 - - pto.set_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] - pto.wait_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] - - %c1 = arith.constant 1 : index - pto.vecscope { - scf.for %iv = %c0 to %c1 step %c1 { - %mask, %next_remaining = pto.plt_b32 %c64_i32 : i32 -> !pto.mask, i32 - %value = pto.vlds %ub_in[%c0] : !pto.ptr -> !pto.vreg<64xf32> - pto.vsstb %value, %ub_out, %c2_i16, 
%c4_i16, %mask : !pto.vreg<64xf32>, !pto.ptr, i16, i16, !pto.mask - pto.mem_bar "VST_VLD" - %roundtrip = pto.vsldb %ub_out, %c2_i16, %c4_i16, %mask : !pto.ptr, i16, i16, !pto.mask -> !pto.vreg<64xf32> - pto.vsts %roundtrip, %ub_in[%c0], %mask : !pto.vreg<64xf32>, !pto.ptr, !pto.mask - } - } - - pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.mte_ub_gm %ub_in, %arg1, %c128_i64 - nburst(%c32_i64, %c128_i64, %c128_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64 - pto.barrier #pto.pipe - return - } -} diff --git a/test/vpto/cases/micro-op/vector-load-store/vsstb/launch.cpp b/test/vpto/cases/micro-op/vector-load-store/vsstb/launch.cpp deleted file mode 100644 index 95d2a57bd..000000000 --- a/test/vpto/cases/micro-op/vector-load-store/vsstb/launch.cpp +++ /dev/null @@ -1,70 +0,0 @@ -// Copyright (c) 2026 Huawei Technologies Co., Ltd. -// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). -// Please refer to the License for details. You may not use this file except in compliance with the License. -// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -// See LICENSE in the root of the software repository for the full text of the License. - -// ----------------------------------------------------------------------------- -// case: micro-op/vector-load-store/vsstb -// family: vector-load-store -// target_ops: pto.vsstb -// scenarios: core-f32, full-mask, block-strided-store, block-mask -// NOTE: bulk-generated coverage skeleton. Parser/verifier/lowering failure is -// still a valid test conclusion in the current coverage-first phase. 
-// ----------------------------------------------------------------------------- -// --------------------------------------------------------------------------- -// PTOAS compatibility layer -// -// The upstream pto-isa headers reference some FP8/FP4 types and the -// __VEC_SCOPE__ marker that are not available on every AICore arch/toolchain -// combination (e.g. __NPU_ARCH__==2201). -// -// For our PTOAS-generated kernels we don't rely on these types today, but the -// headers still mention them in templates/static_asserts. Provide minimal -// fallbacks to keep compilation working on dav-c220. -// --------------------------------------------------------------------------- -#ifndef __VEC_SCOPE__ -#define __VEC_SCOPE__ -#endif - -#if defined(__CCE_AICORE__) && defined(__NPU_ARCH__) && (__NPU_ARCH__ == 2201) -typedef struct { unsigned char v; } hifloat8_t; -typedef struct { unsigned char v; } float8_e4m3_t; -typedef struct { unsigned char v; } float8_e5m2_t; -typedef struct { unsigned char v; } float8_e8m0_t; -typedef struct { unsigned char v; } float4_e1m2x2_t; -typedef struct { unsigned char v; } float4_e2m1x2_t; -#endif -#include - -// AICore printf support is gated behind `--cce-enable-print` on some -// toolchains. When enabled, include the CCE print header so `cce::printf` -// resolves in device compilation. -#if defined(__CCE_AICORE__) && defined(PTOAS_ENABLE_CCE_PRINT) -#include -#endif - -// Some PTO-ISA types are only available in the __CCE_AICORE__ compilation -// path, but `bisheng -xcce` still performs a host-side parse pass. -// Provide minimal fallbacks only when the corresponding header wasn't -// pulled in by the selected arch implementation. 
-#if !defined(__CCE_AICORE__) && !defined(TMRGSORT_HPP) -struct MrgSortExecutedNumList { - uint16_t mrgSortList0; - uint16_t mrgSortList1; - uint16_t mrgSortList2; - uint16_t mrgSortList3; -}; -#endif -#ifndef __CPU_SIM -#include "acl/acl.h" -#endif - -extern "C" __global__ [aicore] void vsstb_kernel_2d(__gm__ float *v1, - __gm__ float *v2); - -void LaunchVabs_kernel_2d(float *v1, float *v2, void *stream) { - vsstb_kernel_2d<<<1, nullptr, stream>>>((__gm__ float *)v1, (__gm__ float *)v2); -} diff --git a/test/vpto/cases/micro-op/vector-load-store/vsstb/main.cpp b/test/vpto/cases/micro-op/vector-load-store/vsstb/main.cpp deleted file mode 100644 index 72c683928..000000000 --- a/test/vpto/cases/micro-op/vector-load-store/vsstb/main.cpp +++ /dev/null @@ -1,130 +0,0 @@ -// Copyright (c) 2026 Huawei Technologies Co., Ltd. -// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). -// Please refer to the License for details. You may not use this file except in compliance with the License. -// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -// See LICENSE in the root of the software repository for the full text of the License. - -// ----------------------------------------------------------------------------- -// case: micro-op/vector-load-store/vsstb -// family: vector-load-store -// target_ops: pto.vsstb -// scenarios: core-f32, full-mask, block-strided-store, block-mask -// NOTE: bulk-generated coverage skeleton. Parser/verifier/lowering failure is -// still a valid test conclusion in the current coverage-first phase. -// ----------------------------------------------------------------------------- -/** -Copyright (c) 2025 Huawei Technologies Co., Ltd. 
-This program is free software, you can redistribute it and/or modify it under the terms and conditions of -CANN Open Software License Agreement Version 2.0 (the "License"). -Please refer to the License for details. You may not use this file except in compliance with the License. -THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -See LICENSE in the root of the software repository for the full text of the License. -*/ - -#include "test_common.h" -#include "acl/acl.h" -#include -#include -#include - -using namespace PtoTestCommon; - -#ifndef TMRGSORT_HPP -struct MrgSortExecutedNumList { - uint16_t mrgSortList0; - uint16_t mrgSortList1; - uint16_t mrgSortList2; - uint16_t mrgSortList3; -}; -#endif - -#define ACL_CHECK(expr) \ - do { \ - const aclError _ret = (expr); \ - if (_ret != ACL_SUCCESS) { \ - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", #expr, (int)_ret, __FILE__, __LINE__); \ - const char *_recent = aclGetRecentErrMsg(); \ - if (_recent != nullptr && _recent[0] != '\0') { \ - std::fprintf(stderr, "[ERROR] RecentErrMsg: %s\n", _recent); \ - } \ - rc = 1; \ - goto cleanup; \ - } \ - } while (0) - -void LaunchVabs_kernel_2d(float *v1, float *v2, void *stream); - -int main() { - size_t elemCount_v1 = 1024; - size_t fileSize_v1 = elemCount_v1 * sizeof(float); - size_t elemCount_v2 = 1024; - size_t fileSize_v2 = elemCount_v2 * sizeof(float); - float *v1Host = nullptr; - float *v1Device = nullptr; - float *v2Host = nullptr; - float *v2Device = nullptr; - - int rc = 0; - bool aclInited = false; - bool deviceSet = false; - int deviceId = 0; - aclrtStream stream = nullptr; - - ACL_CHECK(aclInit(nullptr)); - aclInited = true; - if (const char *envDevice = std::getenv("ACL_DEVICE_ID")) { - deviceId = std::atoi(envDevice); - } - ACL_CHECK(aclrtSetDevice(deviceId)); - deviceSet = true; - 
ACL_CHECK(aclrtCreateStream(&stream)); - - ACL_CHECK(aclrtMallocHost((void **)(&v1Host), fileSize_v1)); - ACL_CHECK(aclrtMallocHost((void **)(&v2Host), fileSize_v2)); - ACL_CHECK(aclrtMalloc((void **)&v1Device, fileSize_v1, ACL_MEM_MALLOC_HUGE_FIRST)); - ACL_CHECK(aclrtMalloc((void **)&v2Device, fileSize_v2, ACL_MEM_MALLOC_HUGE_FIRST)); - - ReadFile("./v1.bin", fileSize_v1, v1Host, fileSize_v1); - ReadFile("./v2.bin", fileSize_v2, v2Host, fileSize_v2); - ACL_CHECK(aclrtMemcpy(v1Device, fileSize_v1, v1Host, fileSize_v1, ACL_MEMCPY_HOST_TO_DEVICE)); - ACL_CHECK(aclrtMemcpy(v2Device, fileSize_v2, v2Host, fileSize_v2, ACL_MEMCPY_HOST_TO_DEVICE)); - LaunchVabs_kernel_2d(v1Device, v2Device, stream); - - ACL_CHECK(aclrtSynchronizeStream(stream)); - ACL_CHECK(aclrtMemcpy(v2Host, fileSize_v2, v2Device, fileSize_v2, ACL_MEMCPY_DEVICE_TO_HOST)); - - WriteFile("./v2.bin", v2Host, fileSize_v2); - -cleanup: - aclrtFree(v1Device); - aclrtFree(v2Device); - aclrtFreeHost(v1Host); - aclrtFreeHost(v2Host); - if (stream != nullptr) { - const aclError _ret = aclrtDestroyStream(stream); - if (_ret != ACL_SUCCESS) { - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", - "aclrtDestroyStream(stream)", (int)_ret, __FILE__, __LINE__); - } - stream = nullptr; - } - if (deviceSet) { - const aclError _ret = aclrtResetDevice(deviceId); - if (_ret != ACL_SUCCESS) { - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", - "aclrtResetDevice(deviceId)", (int)_ret, __FILE__, __LINE__); - } - } - if (aclInited) { - const aclError _ret = aclFinalize(); - if (_ret != ACL_SUCCESS) { - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", - "aclFinalize()", (int)_ret, __FILE__, __LINE__); - } - } - - return rc; -} diff --git a/test/vpto/cases/micro-op/vector-load-store/vstar/compare.py b/test/vpto/cases/micro-op/vector-load-store/vstar/compare.py deleted file mode 100755 index 3f233f6e6..000000000 --- a/test/vpto/cases/micro-op/vector-load-store/vstar/compare.py +++ /dev/null @@ -1,236 +0,0 @@ 
-#!/usr/bin/env python3 -# Copyright (c) 2026 Huawei Technologies Co., Ltd. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. - -# case: micro-op/vector-load-store/vstar -# family: vector-load-store -# target_ops: pto.vstar -# scenarios: core-f32, full-mask, aligned, state-update -# NOTE: bulk-generated coverage skeleton. -# coding=utf-8 - -import os -import sys -import numpy as np - - -REPEAT_BYTES = 256 -CHECK_OFFSET = 1 -CHECK_COUNT = 8 - - -def _ceil_div(x, y): - return (x + y - 1) // y - - -def _packed_pred_storage_bytes(logical_elems, src_elem_bytes): - logical_elems = int(logical_elems) - src_elem_bytes = int(src_elem_bytes) - if logical_elems <= 0: - raise ValueError(f"logical_elems must be > 0, got {logical_elems}") - if src_elem_bytes not in (1, 2, 4): - raise ValueError(f"unsupported packed predicate source size: {src_elem_bytes}") - - repeat_elems = REPEAT_BYTES // src_elem_bytes - if src_elem_bytes == 4: - repeat_times = _ceil_div(logical_elems, repeat_elems) + 1 - loop_count = repeat_times // 2 - return loop_count * 16 - - repeat_times = _ceil_div(logical_elems, repeat_elems) - return repeat_times * (repeat_elems // 8) - - -def compare_bin(golden_path, output_path, dtype, eps): - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - dtype_np = np.dtype(dtype) - golden = 
np.fromfile(golden_path, dtype=dtype_np) - output = np.fromfile(output_path, dtype=dtype_np) - if golden.shape != output.shape: - print(f"[ERROR] Shape mismatch: {golden_path} {golden.shape} vs {output_path} {output.shape}") - return False - if not np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True): - if golden.size: - if np.issubdtype(dtype_np, np.floating): - g = golden.astype(np.float64, copy=False) - o = output.astype(np.float64, copy=False) - elif np.issubdtype(dtype_np, np.integer) or np.issubdtype(dtype_np, np.unsignedinteger): - g = golden.astype(np.int64, copy=False) - o = output.astype(np.int64, copy=False) - else: - g = golden.astype(np.float64, copy=False) - o = output.astype(np.float64, copy=False) - abs_diff = np.abs(g - o) - idx = int(np.argmax(abs_diff)) - diff = float(abs_diff[idx]) - print( - f"[ERROR] Mismatch: {golden_path} vs {output_path}, max diff={diff} at idx={idx} " - f"(golden={g[idx]}, out={o[idx]}, dtype={dtype_np})" - ) - else: - print(f"[ERROR] Mismatch: {golden_path} vs {output_path}, empty buffers, dtype={dtype_np}") - return False - return True - - -def compare_bin_prefix(golden_path, output_path, dtype, eps, count): - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - try: - count = int(count) - except Exception: - print(f"[ERROR] Invalid prefix count: {count}") - return False - if count <= 0: - print(f"[ERROR] Invalid prefix count: {count}") - return False - - dtype_np = np.dtype(dtype) - golden = np.fromfile(golden_path, dtype=dtype_np, count=count) - output = np.fromfile(output_path, dtype=dtype_np, count=count) - - if golden.size != count or output.size != count: - print( - f"[ERROR] Prefix read too small: need={count} elems, " - f"golden={golden.size}, out={output.size}" - ) - return False - - if not np.allclose(golden, output, atol=eps, rtol=eps, 
equal_nan=True): - if golden.size: - if np.issubdtype(dtype_np, np.floating): - g = golden.astype(np.float64, copy=False) - o = output.astype(np.float64, copy=False) - elif np.issubdtype(dtype_np, np.integer) or np.issubdtype(dtype_np, np.unsignedinteger): - g = golden.astype(np.int64, copy=False) - o = output.astype(np.int64, copy=False) - else: - g = golden.astype(np.float64, copy=False) - o = output.astype(np.float64, copy=False) - abs_diff = np.abs(g - o) - idx = int(np.argmax(abs_diff)) - diff = float(abs_diff[idx]) - print( - f"[ERROR] Mismatch (prefix): {golden_path} vs {output_path}, max diff={diff} at idx={idx} " - f"(golden={g[idx]}, out={o[idx]}, dtype={dtype_np}, count={count})" - ) - else: - print(f"[ERROR] Mismatch (prefix): {golden_path} vs {output_path}, empty buffers, dtype={dtype_np}") - return False - return True - - -def compare_packed_pred_mask(golden_path, output_path, logical_elems, src_elem_bytes): - """ - Compare outputs of pto.tcmp / pto.tcmps. - - PTO-ISA stores packed predicate results as a linear PK byte stream via - `psts`, with the exact written prefix length determined by the typed - TCMP/TCMPS repeat schedule. Compare only that semantic prefix. 
- """ - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - try: - logical_elems = int(logical_elems) - src_elem_bytes = int(src_elem_bytes) - except Exception: - print( - "[ERROR] Invalid packed mask compare arguments: " - f"logical_elems={logical_elems} src_elem_bytes={src_elem_bytes}" - ) - return False - if logical_elems <= 0 or src_elem_bytes <= 0: - print( - "[ERROR] Invalid packed mask compare arguments: " - f"logical_elems={logical_elems} src_elem_bytes={src_elem_bytes}" - ) - return False - - golden = np.fromfile(golden_path, dtype=np.uint8) - output = np.fromfile(output_path, dtype=np.uint8) - try: - prefix_bytes = _packed_pred_storage_bytes(logical_elems, src_elem_bytes) - except ValueError as exc: - print(f"[ERROR] {exc}") - return False - - if golden.size < prefix_bytes or output.size < prefix_bytes: - print( - f"[ERROR] Packed mask buffer too small: need={prefix_bytes} bytes, " - f"golden={golden.size}, out={output.size}" - ) - return False - - golden_sel = golden[:prefix_bytes] - output_sel = output[:prefix_bytes] - - if not np.array_equal(golden_sel, output_sel): - diff = np.nonzero(golden_sel != output_sel)[0] - idx = int(diff[0]) if diff.size else 0 - print( - f"[ERROR] Mismatch (packed mask): {golden_path} vs {output_path}, first diff at idx={idx} " - f"(golden={int(golden_sel[idx])}, out={int(output_sel[idx])}, " - f"logical_elems={logical_elems}, src_elem_bytes={src_elem_bytes}, prefix_bytes={prefix_bytes})" - ) - return False - return True - - -def main(): - strict = os.getenv("COMPARE_STRICT", "1") != "0" - ok = True - golden = np.fromfile("golden_v2.bin", dtype=np.float32) if os.path.exists("golden_v2.bin") else None - output = np.fromfile("v2.bin", dtype=np.float32) if os.path.exists("v2.bin") else None - lo = CHECK_OFFSET - hi = CHECK_OFFSET + CHECK_COUNT - if output is None: - ok = False 
- print("[ERROR] Output missing: v2.bin") - elif golden is None: - ok = False - print("[ERROR] Golden missing: golden_v2.bin") - elif golden.size < hi or output.size < hi: - ok = False - print( - f"[ERROR] Flush slice too small: need={hi} elems, " - f"golden={golden.size}, out={output.size}" - ) - elif not np.allclose(golden[lo:hi], output[lo:hi], atol=0.0001, rtol=0.0001, equal_nan=True): - g = golden[lo:hi].astype(np.float64, copy=False) - o = output[lo:hi].astype(np.float64, copy=False) - abs_diff = np.abs(g - o) - idx = int(np.argmax(abs_diff)) - ok = False - print( - f"[ERROR] Mismatch (flush slice): golden_v2.bin vs v2.bin, max diff={float(abs_diff[idx])} " - f"at idx={lo + idx} (golden={g[idx]}, out={o[idx]}, dtype=float32)" - ) - if not ok: - if strict: - print("[ERROR] compare failed") - sys.exit(2) - print("[WARN] compare failed (non-gating)") - return - print("[INFO] compare passed") - - -if __name__ == "__main__": - main() diff --git a/test/vpto/cases/micro-op/vector-load-store/vstar/golden.py b/test/vpto/cases/micro-op/vector-load-store/vstar/golden.py deleted file mode 100755 index d1a1054ba..000000000 --- a/test/vpto/cases/micro-op/vector-load-store/vstar/golden.py +++ /dev/null @@ -1,51 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) 2026 Huawei Technologies Co., Ltd. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. 
- -# case: micro-op/vector-load-store/vstar -# family: vector-load-store -# target_ops: pto.vstar -# scenarios: core-f32, predicate-squeezed, unaligned, state-update -# coding=utf-8 - -import argparse -from pathlib import Path - -import numpy as np - - -ROWS = 32 -COLS = 32 -SEED = 19 - - -def generate(output_dir: Path, seed: int) -> None: - rng = np.random.default_rng(seed) - v1 = rng.uniform(-8.0, 8.0, size=(ROWS, COLS)).astype(np.float32) - v2 = np.zeros((ROWS, COLS), dtype=np.float32) - golden_v2 = np.zeros((ROWS, COLS), dtype=np.float32) - golden_v2.reshape(-1)[1:9] = v1.reshape(-1)[:8] - - output_dir.mkdir(parents=True, exist_ok=True) - v1.reshape(-1).tofile(output_dir / "v1.bin") - v2.reshape(-1).tofile(output_dir / "v2.bin") - golden_v2.reshape(-1).tofile(output_dir / "golden_v2.bin") - - -def main() -> None: - parser = argparse.ArgumentParser( - description="Generate numpy-based inputs/golden for VPTO micro-op vstar validation." - ) - parser.add_argument("--output-dir", type=Path, default=Path(".")) - parser.add_argument("--seed", type=int, default=SEED) - args = parser.parse_args() - generate(args.output_dir, args.seed) - - -if __name__ == "__main__": - main() diff --git a/test/vpto/cases/micro-op/vector-load-store/vstar/kernel.pto b/test/vpto/cases/micro-op/vector-load-store/vstar/kernel.pto deleted file mode 100644 index 8fa41490c..000000000 --- a/test/vpto/cases/micro-op/vector-load-store/vstar/kernel.pto +++ /dev/null @@ -1,63 +0,0 @@ -// ----------------------------------------------------------------------------- -// case: micro-op/vector-load-store/vstar -// family: vector-load-store -// target_ops: pto.vstar -// scenarios: core-f32, predicate-squeezed, unaligned, state-update -// ----------------------------------------------------------------------------- -// Validate the final flush step of a stateful store chain. -// The case keeps `pto.vstar` as the target op and uses the minimal required -// setup: -// 1. 
load one aligned vector from `%ub_in` -// 2. squeeze a small active prefix to prime `SPR SQZN` -// 3. prime one store-state carrier from unaligned `%ub_out` -// 4. issue one `pto.vstur ... "POST_UPDATE"` to create residual state -// 5. flush that residual state with `pto.vstar` -// This makes the observable payload come from `vstar` while keeping the chain -// contract valid per docs. - -module attributes {pto.target_arch = "a5", pto.kernel_kind = #pto.kernel_kind} { - func.func @vstar_kernel_2d(%arg0: !pto.ptr, %arg1: !pto.ptr) attributes {pto.kernel} { - %c0 = arith.constant 0 : index - %c1 = arith.constant 1 : index - %c0_i64 = arith.constant 0 : i64 - %c1_i64 = arith.constant 1 : i64 - %c8_i32 = arith.constant 8 : i32 - %c32_i64 = arith.constant 32 : i64 - %c128_i64 = arith.constant 128 : i64 - %c4096_i64 = arith.constant 4096 : i64 - %c1_elem = arith.constant 1 : index - - %ub_in = pto.castptr %c0_i64 : i64 -> !pto.ptr - %ub_out = pto.castptr %c4096_i64 : i64 -> !pto.ptr - %ub_out1 = pto.addptr %ub_out, %c1_elem : !pto.ptr -> !pto.ptr - - %false = arith.constant false - pto.mte_gm_ub %arg0, %ub_in, %c0_i64, %c128_i64 - nburst(%c32_i64, %c128_i64, %c128_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 - - pto.set_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] - pto.wait_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] - - pto.vecscope { - pto.sprclr "AR" - scf.for %iter = %c0 to %c1 step %c1 { - %vec = pto.vlds %ub_in[%c0] : !pto.ptr -> !pto.vreg<64xf32> - %mask, %unused = pto.plt_b32 %c8_i32 : i32 -> !pto.mask, i32 - %sqz = pto.vsqz %vec, %mask : !pto.vreg<64xf32>, !pto.mask -> !pto.vreg<64xf32> - %align0 = pto.init_align : !pto.align - %align1 = pto.vstur %align0, %sqz, %ub_out1, "POST_UPDATE" - : !pto.align, !pto.vreg<64xf32>, !pto.ptr -> !pto.align - pto.vstar %align1, %ub_out1 : !pto.align, !pto.ptr - } - } - - pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.mte_ub_gm %ub_out, %arg1, %c128_i64 - 
nburst(%c32_i64, %c128_i64, %c128_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64 - pto.barrier #pto.pipe - return - } -} diff --git a/test/vpto/cases/micro-op/vector-load-store/vstar/launch.cpp b/test/vpto/cases/micro-op/vector-load-store/vstar/launch.cpp deleted file mode 100644 index 6d4789d7e..000000000 --- a/test/vpto/cases/micro-op/vector-load-store/vstar/launch.cpp +++ /dev/null @@ -1,70 +0,0 @@ -// Copyright (c) 2026 Huawei Technologies Co., Ltd. -// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). -// Please refer to the License for details. You may not use this file except in compliance with the License. -// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -// See LICENSE in the root of the software repository for the full text of the License. - -// ----------------------------------------------------------------------------- -// case: micro-op/vector-load-store/vstar -// family: vector-load-store -// target_ops: pto.vstar -// scenarios: core-f32, full-mask, aligned, state-update -// NOTE: bulk-generated coverage skeleton. Parser/verifier/lowering failure is -// still a valid test conclusion in the current coverage-first phase. -// ----------------------------------------------------------------------------- -// --------------------------------------------------------------------------- -// PTOAS compatibility layer -// -// The upstream pto-isa headers reference some FP8/FP4 types and the -// __VEC_SCOPE__ marker that are not available on every AICore arch/toolchain -// combination (e.g. __NPU_ARCH__==2201). -// -// For our PTOAS-generated kernels we don't rely on these types today, but the -// headers still mention them in templates/static_asserts. 
Provide minimal -// fallbacks to keep compilation working on dav-c220. -// --------------------------------------------------------------------------- -#ifndef __VEC_SCOPE__ -#define __VEC_SCOPE__ -#endif - -#if defined(__CCE_AICORE__) && defined(__NPU_ARCH__) && (__NPU_ARCH__ == 2201) -typedef struct { unsigned char v; } hifloat8_t; -typedef struct { unsigned char v; } float8_e4m3_t; -typedef struct { unsigned char v; } float8_e5m2_t; -typedef struct { unsigned char v; } float8_e8m0_t; -typedef struct { unsigned char v; } float4_e1m2x2_t; -typedef struct { unsigned char v; } float4_e2m1x2_t; -#endif -#include - -// AICore printf support is gated behind `--cce-enable-print` on some -// toolchains. When enabled, include the CCE print header so `cce::printf` -// resolves in device compilation. -#if defined(__CCE_AICORE__) && defined(PTOAS_ENABLE_CCE_PRINT) -#include -#endif - -// Some PTO-ISA types are only available in the __CCE_AICORE__ compilation -// path, but `bisheng -xcce` still performs a host-side parse pass. -// Provide minimal fallbacks only when the corresponding header wasn't -// pulled in by the selected arch implementation. -#if !defined(__CCE_AICORE__) && !defined(TMRGSORT_HPP) -struct MrgSortExecutedNumList { - uint16_t mrgSortList0; - uint16_t mrgSortList1; - uint16_t mrgSortList2; - uint16_t mrgSortList3; -}; -#endif -#ifndef __CPU_SIM -#include "acl/acl.h" -#endif - -extern "C" __global__ [aicore] void vstar_kernel_2d(__gm__ float *v1, - __gm__ float *v2); - -void LaunchVstar_kernel_2d(float *v1, float *v2, void *stream) { - vstar_kernel_2d<<<1, nullptr, stream>>>((__gm__ float *)v1, (__gm__ float *)v2); -} diff --git a/test/vpto/cases/micro-op/vector-load-store/vstar/main.cpp b/test/vpto/cases/micro-op/vector-load-store/vstar/main.cpp deleted file mode 100644 index 0f316a695..000000000 --- a/test/vpto/cases/micro-op/vector-load-store/vstar/main.cpp +++ /dev/null @@ -1,130 +0,0 @@ -// Copyright (c) 2026 Huawei Technologies Co., Ltd. 
-// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). -// Please refer to the License for details. You may not use this file except in compliance with the License. -// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -// See LICENSE in the root of the software repository for the full text of the License. - -// ----------------------------------------------------------------------------- -// case: micro-op/vector-load-store/vstar -// family: vector-load-store -// target_ops: pto.vstar -// scenarios: core-f32, full-mask, aligned, state-update -// NOTE: bulk-generated coverage skeleton. Parser/verifier/lowering failure is -// still a valid test conclusion in the current coverage-first phase. -// ----------------------------------------------------------------------------- -/** -Copyright (c) 2025 Huawei Technologies Co., Ltd. -This program is free software, you can redistribute it and/or modify it under the terms and conditions of -CANN Open Software License Agreement Version 2.0 (the "License"). -Please refer to the License for details. You may not use this file except in compliance with the License. -THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -See LICENSE in the root of the software repository for the full text of the License. 
-*/ - -#include "test_common.h" -#include "acl/acl.h" -#include -#include -#include - -using namespace PtoTestCommon; - -#ifndef TMRGSORT_HPP -struct MrgSortExecutedNumList { - uint16_t mrgSortList0; - uint16_t mrgSortList1; - uint16_t mrgSortList2; - uint16_t mrgSortList3; -}; -#endif - -#define ACL_CHECK(expr) \ - do { \ - const aclError _ret = (expr); \ - if (_ret != ACL_SUCCESS) { \ - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", #expr, (int)_ret, __FILE__, __LINE__); \ - const char *_recent = aclGetRecentErrMsg(); \ - if (_recent != nullptr && _recent[0] != '\0') { \ - std::fprintf(stderr, "[ERROR] RecentErrMsg: %s\n", _recent); \ - } \ - rc = 1; \ - goto cleanup; \ - } \ - } while (0) - -void LaunchVstar_kernel_2d(float *v1, float *v2, void *stream); - -int main() { - size_t elemCount_v1 = 1024; - size_t fileSize_v1 = elemCount_v1 * sizeof(float); - size_t elemCount_v2 = 1024; - size_t fileSize_v2 = elemCount_v2 * sizeof(float); - float *v1Host = nullptr; - float *v1Device = nullptr; - float *v2Host = nullptr; - float *v2Device = nullptr; - - int rc = 0; - bool aclInited = false; - bool deviceSet = false; - int deviceId = 0; - aclrtStream stream = nullptr; - - ACL_CHECK(aclInit(nullptr)); - aclInited = true; - if (const char *envDevice = std::getenv("ACL_DEVICE_ID")) { - deviceId = std::atoi(envDevice); - } - ACL_CHECK(aclrtSetDevice(deviceId)); - deviceSet = true; - ACL_CHECK(aclrtCreateStream(&stream)); - - ACL_CHECK(aclrtMallocHost((void **)(&v1Host), fileSize_v1)); - ACL_CHECK(aclrtMallocHost((void **)(&v2Host), fileSize_v2)); - ACL_CHECK(aclrtMalloc((void **)&v1Device, fileSize_v1, ACL_MEM_MALLOC_HUGE_FIRST)); - ACL_CHECK(aclrtMalloc((void **)&v2Device, fileSize_v2, ACL_MEM_MALLOC_HUGE_FIRST)); - - ReadFile("./v1.bin", fileSize_v1, v1Host, fileSize_v1); - ReadFile("./v2.bin", fileSize_v2, v2Host, fileSize_v2); - ACL_CHECK(aclrtMemcpy(v1Device, fileSize_v1, v1Host, fileSize_v1, ACL_MEMCPY_HOST_TO_DEVICE)); - ACL_CHECK(aclrtMemcpy(v2Device, 
fileSize_v2, v2Host, fileSize_v2, ACL_MEMCPY_HOST_TO_DEVICE)); - LaunchVstar_kernel_2d(v1Device, v2Device, stream); - - ACL_CHECK(aclrtSynchronizeStream(stream)); - ACL_CHECK(aclrtMemcpy(v2Host, fileSize_v2, v2Device, fileSize_v2, ACL_MEMCPY_DEVICE_TO_HOST)); - - WriteFile("./v2.bin", v2Host, fileSize_v2); - -cleanup: - aclrtFree(v1Device); - aclrtFree(v2Device); - aclrtFreeHost(v1Host); - aclrtFreeHost(v2Host); - if (stream != nullptr) { - const aclError _ret = aclrtDestroyStream(stream); - if (_ret != ACL_SUCCESS) { - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", - "aclrtDestroyStream(stream)", (int)_ret, __FILE__, __LINE__); - } - stream = nullptr; - } - if (deviceSet) { - const aclError _ret = aclrtResetDevice(deviceId); - if (_ret != ACL_SUCCESS) { - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", - "aclrtResetDevice(deviceId)", (int)_ret, __FILE__, __LINE__); - } - } - if (aclInited) { - const aclError _ret = aclFinalize(); - if (_ret != ACL_SUCCESS) { - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", - "aclFinalize()", (int)_ret, __FILE__, __LINE__); - } - } - - return rc; -} diff --git a/test/vpto/cases/micro-op/vector-load-store/vstas-vstus-offset-update/compare.py b/test/vpto/cases/micro-op/vector-load-store/vstas-vstus-offset-update/compare.py deleted file mode 100755 index 0916b067e..000000000 --- a/test/vpto/cases/micro-op/vector-load-store/vstas-vstus-offset-update/compare.py +++ /dev/null @@ -1,208 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) 2026 Huawei Technologies Co., Ltd. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. 
-# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. - -# case: micro-op/vector-load-store/vstas-vstus-offset-update -# family: vector-load-store -# target_ops: pto.vstas, pto.vstus -# scenarios: core-f32, full-mask, immediate-offset, state-update -# coding=utf-8 - -import os -import sys -import numpy as np - - -REPEAT_BYTES = 256 - - -def _ceil_div(x, y): - return (x + y - 1) // y - - -def _packed_pred_storage_bytes(logical_elems, src_elem_bytes): - logical_elems = int(logical_elems) - src_elem_bytes = int(src_elem_bytes) - if logical_elems <= 0: - raise ValueError(f"logical_elems must be > 0, got {logical_elems}") - if src_elem_bytes not in (1, 2, 4): - raise ValueError(f"unsupported packed predicate source size: {src_elem_bytes}") - - repeat_elems = REPEAT_BYTES // src_elem_bytes - if src_elem_bytes == 4: - repeat_times = _ceil_div(logical_elems, repeat_elems) + 1 - loop_count = repeat_times // 2 - return loop_count * 16 - - repeat_times = _ceil_div(logical_elems, repeat_elems) - return repeat_times * (repeat_elems // 8) - - -def compare_bin(golden_path, output_path, dtype, eps): - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - dtype_np = np.dtype(dtype) - golden = np.fromfile(golden_path, dtype=dtype_np) - output = np.fromfile(output_path, dtype=dtype_np) - if golden.shape != output.shape: - print(f"[ERROR] Shape mismatch: {golden_path} {golden.shape} vs {output_path} {output.shape}") - return False - if not np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True): - if golden.size: - if np.issubdtype(dtype_np, np.floating): - g = 
golden.astype(np.float64, copy=False) - o = output.astype(np.float64, copy=False) - elif np.issubdtype(dtype_np, np.integer) or np.issubdtype(dtype_np, np.unsignedinteger): - g = golden.astype(np.int64, copy=False) - o = output.astype(np.int64, copy=False) - else: - g = golden.astype(np.float64, copy=False) - o = output.astype(np.float64, copy=False) - abs_diff = np.abs(g - o) - idx = int(np.argmax(abs_diff)) - diff = float(abs_diff[idx]) - print( - f"[ERROR] Mismatch: {golden_path} vs {output_path}, max diff={diff} at idx={idx} " - f"(golden={g[idx]}, out={o[idx]}, dtype={dtype_np})" - ) - else: - print(f"[ERROR] Mismatch: {golden_path} vs {output_path}, empty buffers, dtype={dtype_np}") - return False - return True - - -def compare_bin_prefix(golden_path, output_path, dtype, eps, count): - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - try: - count = int(count) - except Exception: - print(f"[ERROR] Invalid prefix count: {count}") - return False - if count <= 0: - print(f"[ERROR] Invalid prefix count: {count}") - return False - - dtype_np = np.dtype(dtype) - golden = np.fromfile(golden_path, dtype=dtype_np, count=count) - output = np.fromfile(output_path, dtype=dtype_np, count=count) - - if golden.size != count or output.size != count: - print( - f"[ERROR] Prefix read too small: need={count} elems, " - f"golden={golden.size}, out={output.size}" - ) - return False - - if not np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True): - if golden.size: - if np.issubdtype(dtype_np, np.floating): - g = golden.astype(np.float64, copy=False) - o = output.astype(np.float64, copy=False) - elif np.issubdtype(dtype_np, np.integer) or np.issubdtype(dtype_np, np.unsignedinteger): - g = golden.astype(np.int64, copy=False) - o = output.astype(np.int64, copy=False) - else: - g = golden.astype(np.float64, copy=False) - 
o = output.astype(np.float64, copy=False) - abs_diff = np.abs(g - o) - idx = int(np.argmax(abs_diff)) - diff = float(abs_diff[idx]) - print( - f"[ERROR] Mismatch (prefix): {golden_path} vs {output_path}, max diff={diff} at idx={idx} " - f"(golden={g[idx]}, out={o[idx]}, dtype={dtype_np}, count={count})" - ) - else: - print(f"[ERROR] Mismatch (prefix): {golden_path} vs {output_path}, empty buffers, dtype={dtype_np}") - return False - return True - - -def compare_packed_pred_mask(golden_path, output_path, logical_elems, src_elem_bytes): - """ - Compare outputs of pto.tcmp / pto.tcmps. - - PTO-ISA stores packed predicate results as a linear PK byte stream via - `psts`, with the exact written prefix length determined by the typed - TCMP/TCMPS repeat schedule. Compare only that semantic prefix. - """ - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - try: - logical_elems = int(logical_elems) - src_elem_bytes = int(src_elem_bytes) - except Exception: - print( - "[ERROR] Invalid packed mask compare arguments: " - f"logical_elems={logical_elems} src_elem_bytes={src_elem_bytes}" - ) - return False - if logical_elems <= 0 or src_elem_bytes <= 0: - print( - "[ERROR] Invalid packed mask compare arguments: " - f"logical_elems={logical_elems} src_elem_bytes={src_elem_bytes}" - ) - return False - - golden = np.fromfile(golden_path, dtype=np.uint8) - output = np.fromfile(output_path, dtype=np.uint8) - try: - prefix_bytes = _packed_pred_storage_bytes(logical_elems, src_elem_bytes) - except ValueError as exc: - print(f"[ERROR] {exc}") - return False - - if golden.size < prefix_bytes or output.size < prefix_bytes: - print( - f"[ERROR] Packed mask buffer too small: need={prefix_bytes} bytes, " - f"golden={golden.size}, out={output.size}" - ) - return False - - golden_sel = golden[:prefix_bytes] - output_sel = output[:prefix_bytes] 
- - if not np.array_equal(golden_sel, output_sel): - diff = np.nonzero(golden_sel != output_sel)[0] - idx = int(diff[0]) if diff.size else 0 - print( - f"[ERROR] Mismatch (packed mask): {golden_path} vs {output_path}, first diff at idx={idx} " - f"(golden={int(golden_sel[idx])}, out={int(output_sel[idx])}, " - f"logical_elems={logical_elems}, src_elem_bytes={src_elem_bytes}, prefix_bytes={prefix_bytes})" - ) - return False - return True - - -def main(): - strict = os.getenv("COMPARE_STRICT", "1") != "0" - ok = True - ok = compare_bin_prefix("golden_v2.bin", "v2.bin", np.float32, 0.0001, 69) and ok - if not ok: - if strict: - print("[ERROR] compare failed") - sys.exit(2) - print("[WARN] compare failed (non-gating)") - return - print("[INFO] compare passed") - - -if __name__ == "__main__": - main() diff --git a/test/vpto/cases/micro-op/vector-load-store/vstas-vstus-offset-update/golden.py b/test/vpto/cases/micro-op/vector-load-store/vstas-vstus-offset-update/golden.py deleted file mode 100755 index b1b68f800..000000000 --- a/test/vpto/cases/micro-op/vector-load-store/vstas-vstus-offset-update/golden.py +++ /dev/null @@ -1,52 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) 2026 Huawei Technologies Co., Ltd. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. 
- -# case: micro-op/vector-load-store/vstas-vstus-offset-update -# family: vector-load-store -# target_ops: pto.vstas, pto.vstus -# scenarios: core-f32, full-mask, immediate-offset, state-update -# coding=utf-8 - -import argparse -from pathlib import Path - -import numpy as np - - -ELEMENTS = 1024 -VECTOR_LANES = 64 -POST_UPDATE_OFFSET_ELEMENTS = 3 -SEED = 19 - - -def generate(output_dir: Path, seed: int) -> None: - rng = np.random.default_rng(seed) - v1 = rng.uniform(-8.0, 8.0, size=(ELEMENTS,)).astype(np.float32) - v2 = np.zeros((ELEMENTS,), dtype=np.float32) - golden_v2 = np.zeros((ELEMENTS,), dtype=np.float32) - golden_v2[:POST_UPDATE_OFFSET_ELEMENTS] = v1[:POST_UPDATE_OFFSET_ELEMENTS] - - output_dir.mkdir(parents=True, exist_ok=True) - v1.tofile(output_dir / "v1.bin") - v2.tofile(output_dir / "v2.bin") - golden_v2.tofile(output_dir / "golden_v2.bin") - - -def main() -> None: - parser = argparse.ArgumentParser( - description="Generate numpy-based inputs/golden for VPTO micro-op vstas/vstus chain validation." 
- ) - parser.add_argument("--output-dir", type=Path, default=Path(".")) - parser.add_argument("--seed", type=int, default=SEED) - args = parser.parse_args() - generate(args.output_dir, args.seed) - - -if __name__ == "__main__": - main() diff --git a/test/vpto/cases/micro-op/vector-load-store/vstas-vstus-offset-update/kernel.pto b/test/vpto/cases/micro-op/vector-load-store/vstas-vstus-offset-update/kernel.pto deleted file mode 100644 index ac04a7d26..000000000 --- a/test/vpto/cases/micro-op/vector-load-store/vstas-vstus-offset-update/kernel.pto +++ /dev/null @@ -1,61 +0,0 @@ -// ----------------------------------------------------------------------------- -// case: micro-op/vector-load-store/vstas-vstus-offset-update -// family: vector-load-store -// target_ops: pto.vstas, pto.vstus -// scenarios: core-f32, full-mask, immediate-offset, state-update -// ----------------------------------------------------------------------------- -// Validate the state chain required by the plan: -// 1. prime a store-state carrier -// 2. issue one no-post `vstus` with a non-zero explicit offset -// 3. flush the residual state with `vstas` using the same explicit flush point -// The observable effect should match an unaligned store stream where `vstus` -// advances the stream by 3 f32 elements and leaves the buffered tail in -// `!pto.align`, then `vstas` commits that pending tail at the matching flush -// point identified by the original base plus the same scalar offset. 
- -module attributes {pto.target_arch = "a5", pto.kernel_kind = #pto.kernel_kind} { - func.func @vstas_vstus_offset_update_kernel_2d(%arg0: !pto.ptr, %arg1: !pto.ptr) attributes {pto.kernel} { - %c0 = arith.constant 0 : index - %c1 = arith.constant 1 : index - %c64 = arith.constant 64 : index - %c0_i64 = arith.constant 0 : i64 - %c1_i64 = arith.constant 1 : i64 - %c32_i64 = arith.constant 32 : i64 - %c128_i64 = arith.constant 128 : i64 - %c4096_i64 = arith.constant 4096 : i64 - %c3_i32 = arith.constant 3 : i32 - %c0_i32 = arith.constant 0 : i32 - - %ub_in = pto.castptr %c0_i64 : i64 -> !pto.ptr - %ub_out = pto.castptr %c4096_i64 : i64 -> !pto.ptr - - %false = arith.constant false - pto.mte_gm_ub %arg0, %ub_in, %c0_i64, %c128_i64 - nburst(%c32_i64, %c128_i64, %c128_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 - pto.mte_gm_ub %arg1, %ub_out, %c0_i64, %c128_i64 - nburst(%c32_i64, %c128_i64, %c128_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 - - pto.set_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] - pto.wait_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] - - pto.vecscope { - scf.for %offset = %c0 to %c64 step %c64 { - %align0 = pto.init_align : !pto.align - %vec = pto.vlds %ub_in[%c0] : !pto.ptr -> !pto.vreg<64xf32> - %align1 = pto.vstus %align0, %c3_i32, %vec, %ub_out - : !pto.align, i32, !pto.vreg<64xf32>, !pto.ptr -> !pto.align - pto.vstas %align1, %ub_out, %c3_i32 : !pto.align, !pto.ptr, i32 - } - } - - pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.mte_ub_gm %ub_out, %arg1, %c128_i64 - nburst(%c32_i64, %c128_i64, %c128_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64 - pto.barrier #pto.pipe - return - } -} diff --git a/test/vpto/cases/micro-op/vector-load-store/vstas-vstus-offset-update/launch.cpp b/test/vpto/cases/micro-op/vector-load-store/vstas-vstus-offset-update/launch.cpp deleted file mode 100644 index b395937e5..000000000 --- 
a/test/vpto/cases/micro-op/vector-load-store/vstas-vstus-offset-update/launch.cpp +++ /dev/null @@ -1,70 +0,0 @@ -// Copyright (c) 2026 Huawei Technologies Co., Ltd. -// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). -// Please refer to the License for details. You may not use this file except in compliance with the License. -// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -// See LICENSE in the root of the software repository for the full text of the License. - -// ----------------------------------------------------------------------------- -// case: micro-op/vector-load-store/vstas-vstus-offset-update -// family: vector-load-store -// target_ops: pto.vstas, pto.vstus -// scenarios: core-f32, full-mask, immediate-offset, state-update -// NOTE: bulk-generated coverage skeleton. Parser/verifier/lowering failure is -// still a valid test conclusion in the current coverage-first phase. -// ----------------------------------------------------------------------------- -// --------------------------------------------------------------------------- -// PTOAS compatibility layer -// -// The upstream pto-isa headers reference some FP8/FP4 types and the -// __VEC_SCOPE__ marker that are not available on every AICore arch/toolchain -// combination (e.g. __NPU_ARCH__==2201). -// -// For our PTOAS-generated kernels we don't rely on these types today, but the -// headers still mention them in templates/static_asserts. Provide minimal -// fallbacks to keep compilation working on dav-c220. 
-// --------------------------------------------------------------------------- -#ifndef __VEC_SCOPE__ -#define __VEC_SCOPE__ -#endif - -#if defined(__CCE_AICORE__) && defined(__NPU_ARCH__) && (__NPU_ARCH__ == 2201) -typedef struct { unsigned char v; } hifloat8_t; -typedef struct { unsigned char v; } float8_e4m3_t; -typedef struct { unsigned char v; } float8_e5m2_t; -typedef struct { unsigned char v; } float8_e8m0_t; -typedef struct { unsigned char v; } float4_e1m2x2_t; -typedef struct { unsigned char v; } float4_e2m1x2_t; -#endif -#include - -// AICore printf support is gated behind `--cce-enable-print` on some -// toolchains. When enabled, include the CCE print header so `cce::printf` -// resolves in device compilation. -#if defined(__CCE_AICORE__) && defined(PTOAS_ENABLE_CCE_PRINT) -#include -#endif - -// Some PTO-ISA types are only available in the __CCE_AICORE__ compilation -// path, but `bisheng -xcce` still performs a host-side parse pass. -// Provide minimal fallbacks only when the corresponding header wasn't -// pulled in by the selected arch implementation. 
-#if !defined(__CCE_AICORE__) && !defined(TMRGSORT_HPP) -struct MrgSortExecutedNumList { - uint16_t mrgSortList0; - uint16_t mrgSortList1; - uint16_t mrgSortList2; - uint16_t mrgSortList3; -}; -#endif -#ifndef __CPU_SIM -#include "acl/acl.h" -#endif - -extern "C" __global__ [aicore] void vstas_vstus_offset_update_kernel_2d(__gm__ float *v1, - __gm__ float *v2); - -void LaunchVstasVstusOffsetUpdate_kernel_2d(float *v1, float *v2, void *stream) { - vstas_vstus_offset_update_kernel_2d<<<1, nullptr, stream>>>((__gm__ float *)v1, (__gm__ float *)v2); -} diff --git a/test/vpto/cases/micro-op/vector-load-store/vstas-vstus-offset-update/main.cpp b/test/vpto/cases/micro-op/vector-load-store/vstas-vstus-offset-update/main.cpp deleted file mode 100644 index 2d2a9469b..000000000 --- a/test/vpto/cases/micro-op/vector-load-store/vstas-vstus-offset-update/main.cpp +++ /dev/null @@ -1,130 +0,0 @@ -// Copyright (c) 2026 Huawei Technologies Co., Ltd. -// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). -// Please refer to the License for details. You may not use this file except in compliance with the License. -// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -// See LICENSE in the root of the software repository for the full text of the License. - -// ----------------------------------------------------------------------------- -// case: micro-op/vector-load-store/vstas-vstus-offset-update -// family: vector-load-store -// target_ops: pto.vstas, pto.vstus -// scenarios: core-f32, full-mask, immediate-offset, state-update -// NOTE: bulk-generated coverage skeleton. Parser/verifier/lowering failure is -// still a valid test conclusion in the current coverage-first phase. 
-// ----------------------------------------------------------------------------- -/** -Copyright (c) 2025 Huawei Technologies Co., Ltd. -This program is free software, you can redistribute it and/or modify it under the terms and conditions of -CANN Open Software License Agreement Version 2.0 (the "License"). -Please refer to the License for details. You may not use this file except in compliance with the License. -THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -See LICENSE in the root of the software repository for the full text of the License. -*/ - -#include "test_common.h" -#include "acl/acl.h" -#include -#include -#include - -using namespace PtoTestCommon; - -#ifndef TMRGSORT_HPP -struct MrgSortExecutedNumList { - uint16_t mrgSortList0; - uint16_t mrgSortList1; - uint16_t mrgSortList2; - uint16_t mrgSortList3; -}; -#endif - -#define ACL_CHECK(expr) \ - do { \ - const aclError _ret = (expr); \ - if (_ret != ACL_SUCCESS) { \ - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", #expr, (int)_ret, __FILE__, __LINE__); \ - const char *_recent = aclGetRecentErrMsg(); \ - if (_recent != nullptr && _recent[0] != '\0') { \ - std::fprintf(stderr, "[ERROR] RecentErrMsg: %s\n", _recent); \ - } \ - rc = 1; \ - goto cleanup; \ - } \ - } while (0) - -void LaunchVstasVstusOffsetUpdate_kernel_2d(float *v1, float *v2, void *stream); - -int main() { - size_t elemCount_v1 = 1024; - size_t fileSize_v1 = elemCount_v1 * sizeof(float); - size_t elemCount_v2 = 1024; - size_t fileSize_v2 = elemCount_v2 * sizeof(float); - float *v1Host = nullptr; - float *v1Device = nullptr; - float *v2Host = nullptr; - float *v2Device = nullptr; - - int rc = 0; - bool aclInited = false; - bool deviceSet = false; - int deviceId = 0; - aclrtStream stream = nullptr; - - ACL_CHECK(aclInit(nullptr)); - aclInited = true; - if (const char 
*envDevice = std::getenv("ACL_DEVICE_ID")) { - deviceId = std::atoi(envDevice); - } - ACL_CHECK(aclrtSetDevice(deviceId)); - deviceSet = true; - ACL_CHECK(aclrtCreateStream(&stream)); - - ACL_CHECK(aclrtMallocHost((void **)(&v1Host), fileSize_v1)); - ACL_CHECK(aclrtMallocHost((void **)(&v2Host), fileSize_v2)); - ACL_CHECK(aclrtMalloc((void **)&v1Device, fileSize_v1, ACL_MEM_MALLOC_HUGE_FIRST)); - ACL_CHECK(aclrtMalloc((void **)&v2Device, fileSize_v2, ACL_MEM_MALLOC_HUGE_FIRST)); - - ReadFile("./v1.bin", fileSize_v1, v1Host, fileSize_v1); - ReadFile("./v2.bin", fileSize_v2, v2Host, fileSize_v2); - ACL_CHECK(aclrtMemcpy(v1Device, fileSize_v1, v1Host, fileSize_v1, ACL_MEMCPY_HOST_TO_DEVICE)); - ACL_CHECK(aclrtMemcpy(v2Device, fileSize_v2, v2Host, fileSize_v2, ACL_MEMCPY_HOST_TO_DEVICE)); - LaunchVstasVstusOffsetUpdate_kernel_2d(v1Device, v2Device, stream); - - ACL_CHECK(aclrtSynchronizeStream(stream)); - ACL_CHECK(aclrtMemcpy(v2Host, fileSize_v2, v2Device, fileSize_v2, ACL_MEMCPY_DEVICE_TO_HOST)); - - WriteFile("./v2.bin", v2Host, fileSize_v2); - -cleanup: - aclrtFree(v1Device); - aclrtFree(v2Device); - aclrtFreeHost(v1Host); - aclrtFreeHost(v2Host); - if (stream != nullptr) { - const aclError _ret = aclrtDestroyStream(stream); - if (_ret != ACL_SUCCESS) { - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", - "aclrtDestroyStream(stream)", (int)_ret, __FILE__, __LINE__); - } - stream = nullptr; - } - if (deviceSet) { - const aclError _ret = aclrtResetDevice(deviceId); - if (_ret != ACL_SUCCESS) { - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", - "aclrtResetDevice(deviceId)", (int)_ret, __FILE__, __LINE__); - } - } - if (aclInited) { - const aclError _ret = aclFinalize(); - if (_ret != ACL_SUCCESS) { - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", - "aclFinalize()", (int)_ret, __FILE__, __LINE__); - } - } - - return rc; -} diff --git a/test/vpto/cases/micro-op/vector-load-store/vsts-1pt-b16/compare.py 
b/test/vpto/cases/micro-op/vector-load-store/vsts-1pt-b16/compare.py deleted file mode 100755 index d6d773550..000000000 --- a/test/vpto/cases/micro-op/vector-load-store/vsts-1pt-b16/compare.py +++ /dev/null @@ -1,258 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) 2026 Huawei Technologies Co., Ltd. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. - -# case: micro-op/vector-load-store/vsts-1pt-b16 -# family: vector-load-store -# target_ops: pto.vsts -# scenarios: core-i16, full-mask, aligned, dist-1pt-b16 -# coding=utf-8 - -import os -import sys -import numpy as np - - -REPEAT_BYTES = 256 -ACTIVE_ELEMS = 1024 -LANES = 128 - - -def _ceil_div(x, y): - return (x + y - 1) // y - - -def _packed_pred_storage_bytes(logical_elems, src_elem_bytes): - logical_elems = int(logical_elems) - src_elem_bytes = int(src_elem_bytes) - if logical_elems <= 0: - raise ValueError(f"logical_elems must be > 0, got {logical_elems}") - if src_elem_bytes not in (1, 2, 4): - raise ValueError(f"unsupported packed predicate source size: {src_elem_bytes}") - - repeat_elems = REPEAT_BYTES // src_elem_bytes - if src_elem_bytes == 4: - repeat_times = _ceil_div(logical_elems, repeat_elems) + 1 - loop_count = repeat_times // 2 - return loop_count * 16 - - repeat_times = _ceil_div(logical_elems, repeat_elems) - return repeat_times * (repeat_elems // 8) - - -def compare_bin(golden_path, output_path, dtype, eps): - if not os.path.exists(output_path): - print(f"[ERROR] Output 
missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - dtype_np = np.dtype(dtype) - golden = np.fromfile(golden_path, dtype=dtype_np) - output = np.fromfile(output_path, dtype=dtype_np) - if golden.shape != output.shape: - print(f"[ERROR] Shape mismatch: {golden_path} {golden.shape} vs {output_path} {output.shape}") - return False - if not np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True): - if golden.size: - if np.issubdtype(dtype_np, np.floating): - g = golden.astype(np.float64, copy=False) - o = output.astype(np.float64, copy=False) - elif np.issubdtype(dtype_np, np.integer) or np.issubdtype(dtype_np, np.unsignedinteger): - g = golden.astype(np.int64, copy=False) - o = output.astype(np.int64, copy=False) - else: - g = golden.astype(np.float64, copy=False) - o = output.astype(np.float64, copy=False) - abs_diff = np.abs(g - o) - idx = int(np.argmax(abs_diff)) - diff = float(abs_diff[idx]) - print( - f"[ERROR] Mismatch: {golden_path} vs {output_path}, max diff={diff} at idx={idx} " - f"(golden={g[idx]}, out={o[idx]}, dtype={dtype_np})" - ) - else: - print(f"[ERROR] Mismatch: {golden_path} vs {output_path}, empty buffers, dtype={dtype_np}") - return False - return True - - -def compare_bin_prefix(golden_path, output_path, dtype, eps, count): - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - try: - count = int(count) - except Exception: - print(f"[ERROR] Invalid prefix count: {count}") - return False - if count <= 0: - print(f"[ERROR] Invalid prefix count: {count}") - return False - - dtype_np = np.dtype(dtype) - golden = np.fromfile(golden_path, dtype=dtype_np, count=count) - output = np.fromfile(output_path, dtype=dtype_np, count=count) - - if golden.size != count or output.size != count: - print( - 
f"[ERROR] Prefix read too small: need={count} elems, " - f"golden={golden.size}, out={output.size}" - ) - return False - - if not np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True): - if golden.size: - if np.issubdtype(dtype_np, np.floating): - g = golden.astype(np.float64, copy=False) - o = output.astype(np.float64, copy=False) - elif np.issubdtype(dtype_np, np.integer) or np.issubdtype(dtype_np, np.unsignedinteger): - g = golden.astype(np.int64, copy=False) - o = output.astype(np.int64, copy=False) - else: - g = golden.astype(np.float64, copy=False) - o = output.astype(np.float64, copy=False) - abs_diff = np.abs(g - o) - idx = int(np.argmax(abs_diff)) - diff = float(abs_diff[idx]) - print( - f"[ERROR] Mismatch (prefix): {golden_path} vs {output_path}, max diff={diff} at idx={idx} " - f"(golden={g[idx]}, out={o[idx]}, dtype={dtype_np}, count={count})" - ) - else: - print(f"[ERROR] Mismatch (prefix): {golden_path} vs {output_path}, empty buffers, dtype={dtype_np}") - return False - return True - - -def compare_packed_pred_mask(golden_path, output_path, logical_elems, src_elem_bytes): - """ - Compare outputs of pto.tcmp / pto.tcmps. - - PTO-ISA stores packed predicate results as a linear PK byte stream via - `psts`, with the exact written prefix length determined by the typed - TCMP/TCMPS repeat schedule. Compare only that semantic prefix. 
- """ - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - try: - logical_elems = int(logical_elems) - src_elem_bytes = int(src_elem_bytes) - except Exception: - print( - "[ERROR] Invalid packed mask compare arguments: " - f"logical_elems={logical_elems} src_elem_bytes={src_elem_bytes}" - ) - return False - if logical_elems <= 0 or src_elem_bytes <= 0: - print( - "[ERROR] Invalid packed mask compare arguments: " - f"logical_elems={logical_elems} src_elem_bytes={src_elem_bytes}" - ) - return False - - golden = np.fromfile(golden_path, dtype=np.uint8) - output = np.fromfile(output_path, dtype=np.uint8) - try: - prefix_bytes = _packed_pred_storage_bytes(logical_elems, src_elem_bytes) - except ValueError as exc: - print(f"[ERROR] {exc}") - return False - - if golden.size < prefix_bytes or output.size < prefix_bytes: - print( - f"[ERROR] Packed mask buffer too small: need={prefix_bytes} bytes, " - f"golden={golden.size}, out={output.size}" - ) - return False - - golden_sel = golden[:prefix_bytes] - output_sel = output[:prefix_bytes] - - if not np.array_equal(golden_sel, output_sel): - diff = np.nonzero(golden_sel != output_sel)[0] - idx = int(diff[0]) if diff.size else 0 - print( - f"[ERROR] Mismatch (packed mask): {golden_path} vs {output_path}, first diff at idx={idx} " - f"(golden={int(golden_sel[idx])}, out={int(output_sel[idx])}, " - f"logical_elems={logical_elems}, src_elem_bytes={src_elem_bytes}, prefix_bytes={prefix_bytes})" - ) - return False - return True - - -def compare_1pt_positions(golden_path, output_path, dtype, active_elems, lanes): - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - try: - active_elems = int(active_elems) - lanes = int(lanes) - 
except Exception: - print(f"[ERROR] Invalid 1PT compare arguments: active_elems={active_elems} lanes={lanes}") - return False - if active_elems <= 0 or lanes <= 0: - print(f"[ERROR] Invalid 1PT compare arguments: active_elems={active_elems} lanes={lanes}") - return False - - dtype_np = np.dtype(dtype) - golden = np.fromfile(golden_path, dtype=dtype_np) - output = np.fromfile(output_path, dtype=dtype_np) - if golden.shape != output.shape: - print(f"[ERROR] Shape mismatch: {golden.shape} vs {output.shape}") - return False - - positions = np.arange(0, active_elems, lanes, dtype=np.int64) - if positions.size == 0: - print("[ERROR] No 1PT positions selected") - return False - if positions[-1] >= golden.size: - print( - f"[ERROR] 1PT positions out of range: last={int(positions[-1])} size={golden.size}" - ) - return False - - golden_sel = golden[positions] - output_sel = output[positions] - if not np.array_equal(golden_sel, output_sel): - diff = np.nonzero(golden_sel != output_sel)[0] - idx = int(diff[0]) if diff.size else 0 - pos = int(positions[idx]) - print( - f"[ERROR] Mismatch (1PT positions): idx={pos} " - f"golden={int(golden_sel[idx])} out={int(output_sel[idx])}" - ) - return False - return True - - -def main(): - strict = os.getenv("COMPARE_STRICT", "1") != "0" - ok = True - ok = compare_1pt_positions("golden_v2.bin", "v2.bin", np.int16, ACTIVE_ELEMS, LANES) and ok - if not ok: - if strict: - print("[ERROR] compare failed") - sys.exit(2) - print("[WARN] compare failed (non-gating)") - return - print("[INFO] compare passed") - - -if __name__ == "__main__": - main() diff --git a/test/vpto/cases/micro-op/vector-load-store/vsts-1pt-b16/golden.py b/test/vpto/cases/micro-op/vector-load-store/vsts-1pt-b16/golden.py deleted file mode 100755 index 1ed2947c7..000000000 --- a/test/vpto/cases/micro-op/vector-load-store/vsts-1pt-b16/golden.py +++ /dev/null @@ -1,53 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) 2026 Huawei Technologies Co., Ltd. 
-# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. - -# case: micro-op/vector-load-store/vsts-1pt-b16 -# family: vector-load-store -# target_ops: pto.vsts -# scenarios: core-i16, full-mask, aligned, dist-1pt-b16 -# coding=utf-8 - -import argparse -from pathlib import Path - -import numpy as np - - -ELEMENTS = 2048 -ACTIVE_ELEMS = 1024 -LANES = 128 -SEED = 19 - - -def generate(output_dir: Path, seed: int) -> None: - rng = np.random.default_rng(seed) - v1 = rng.integers(-(2**15), 2**15, size=(ELEMENTS,), dtype=np.int16) - v2 = np.zeros((ELEMENTS,), dtype=np.int16) - golden_v2 = np.zeros((ELEMENTS,), dtype=np.int16) - for offset in range(0, ACTIVE_ELEMS, LANES): - golden_v2[offset] = v1[offset] - - output_dir.mkdir(parents=True, exist_ok=True) - v1.tofile(output_dir / "v1.bin") - v2.tofile(output_dir / "v2.bin") - golden_v2.tofile(output_dir / "golden_v2.bin") - - -def main() -> None: - parser = argparse.ArgumentParser( - description="Generate numpy-based inputs/golden for VPTO micro-op vsts 1PT validation." 
- ) - parser.add_argument("--output-dir", type=Path, default=Path(".")) - parser.add_argument("--seed", type=int, default=SEED) - args = parser.parse_args() - generate(args.output_dir, args.seed) - - -if __name__ == "__main__": - main() diff --git a/test/vpto/cases/micro-op/vector-load-store/vsts-1pt-b16/kernel.pto b/test/vpto/cases/micro-op/vector-load-store/vsts-1pt-b16/kernel.pto deleted file mode 100644 index 8bf7ce3f9..000000000 --- a/test/vpto/cases/micro-op/vector-load-store/vsts-1pt-b16/kernel.pto +++ /dev/null @@ -1,50 +0,0 @@ -// ----------------------------------------------------------------------------- -// case: micro-op/vector-load-store/vsts-1pt-b16 -// family: vector-load-store -// target_ops: pto.vsts -// scenarios: core-i16, full-mask, aligned, dist-1pt-b16 -// ----------------------------------------------------------------------------- -// Validate one representative `1PT_B16` store distribution on `b16`. - -module attributes {pto.target_arch = "a5", pto.kernel_kind = #pto.kernel_kind} { - func.func @vsts_1pt_b16_kernel(%arg0: !pto.ptr, %arg1: !pto.ptr) attributes {pto.kernel} { - %c0 = arith.constant 0 : index - %c1 = arith.constant 1 : index - %c128 = arith.constant 128 : index - %c1024 = arith.constant 1024 : index - %c0_i64 = arith.constant 0 : i64 - %c1_i64 = arith.constant 1 : i64 - %c32_i64 = arith.constant 32 : i64 - %c128_i64 = arith.constant 128 : i64 - %c4096_i64 = arith.constant 4096 : i64 - %c1024_i32 = arith.constant 1024 : i32 - - %ub_in = pto.castptr %c0_i64 : i64 -> !pto.ptr - %ub_out = pto.castptr %c4096_i64 : i64 -> !pto.ptr - - %false = arith.constant false - pto.mte_gm_ub %arg0, %ub_in, %c0_i64, %c128_i64 - nburst(%c32_i64, %c128_i64, %c128_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 - - pto.set_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] - pto.wait_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] - - pto.vecscope { - %_:1 = scf.for %offset = %c0 to %c1024 step %c128 iter_args(%remaining = %c1024_i32) -> (i32) { - %mask, 
%next_remaining = pto.plt_b16 %remaining : i32 -> !pto.mask, i32 - %vec = pto.vlds %ub_in[%offset] : !pto.ptr -> !pto.vreg<128xi16> - pto.vsts %vec, %ub_out[%offset], %mask {dist = "1PT_B16"} : !pto.vreg<128xi16>, !pto.ptr, !pto.mask - scf.yield %next_remaining : i32 - } - } - - pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.mte_ub_gm %ub_out, %arg1, %c128_i64 - nburst(%c32_i64, %c128_i64, %c128_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64 - pto.barrier #pto.pipe - return - } -} diff --git a/test/vpto/cases/micro-op/vector-load-store/vsts-1pt-b16/launch.cpp b/test/vpto/cases/micro-op/vector-load-store/vsts-1pt-b16/launch.cpp deleted file mode 100644 index 3514bffb8..000000000 --- a/test/vpto/cases/micro-op/vector-load-store/vsts-1pt-b16/launch.cpp +++ /dev/null @@ -1,71 +0,0 @@ -// Copyright (c) 2026 Huawei Technologies Co., Ltd. -// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). -// Please refer to the License for details. You may not use this file except in compliance with the License. -// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -// See LICENSE in the root of the software repository for the full text of the License. - -// ----------------------------------------------------------------------------- -// case: micro-op/vector-load-store/vsts -// family: vector-load-store -// target_ops: pto.vsts -// scenarios: core-f32, contiguous, full-mask, aligned, dist-norm -// NOTE: bulk-generated coverage skeleton. Parser/verifier/lowering failure is -// still a valid test conclusion in the current coverage-first phase. 
-// ----------------------------------------------------------------------------- -// --------------------------------------------------------------------------- -// PTOAS compatibility layer -// -// The upstream pto-isa headers reference some FP8/FP4 types and the -// __VEC_SCOPE__ marker that are not available on every AICore arch/toolchain -// combination (e.g. __NPU_ARCH__==2201). -// -// For our PTOAS-generated kernels we don't rely on these types today, but the -// headers still mention them in templates/static_asserts. Provide minimal -// fallbacks to keep compilation working on dav-c220. -// --------------------------------------------------------------------------- -#ifndef __VEC_SCOPE__ -#define __VEC_SCOPE__ -#endif - -#if defined(__CCE_AICORE__) && defined(__NPU_ARCH__) && (__NPU_ARCH__ == 2201) -typedef struct { unsigned char v; } hifloat8_t; -typedef struct { unsigned char v; } float8_e4m3_t; -typedef struct { unsigned char v; } float8_e5m2_t; -typedef struct { unsigned char v; } float8_e8m0_t; -typedef struct { unsigned char v; } float4_e1m2x2_t; -typedef struct { unsigned char v; } float4_e2m1x2_t; -#endif -#include - -// AICore printf support is gated behind `--cce-enable-print` on some -// toolchains. When enabled, include the CCE print header so `cce::printf` -// resolves in device compilation. -#if defined(__CCE_AICORE__) && defined(PTOAS_ENABLE_CCE_PRINT) -#include -#endif - -// Some PTO-ISA types are only available in the __CCE_AICORE__ compilation -// path, but `bisheng -xcce` still performs a host-side parse pass. -// Provide minimal fallbacks only when the corresponding header wasn't -// pulled in by the selected arch implementation. 
-#if !defined(__CCE_AICORE__) && !defined(TMRGSORT_HPP) -struct MrgSortExecutedNumList { - uint16_t mrgSortList0; - uint16_t mrgSortList1; - uint16_t mrgSortList2; - uint16_t mrgSortList3; -}; -#endif -#ifndef __CPU_SIM -#include "acl/acl.h" -#endif - -extern "C" __global__ [aicore] void vsts_1pt_b16_kernel(__gm__ float *v1, - __gm__ float *v2); - -void LaunchVabs_kernel_2d(float *v1, float *v2, void *stream) { - vsts_1pt_b16_kernel<<<1, nullptr, stream>>>((__gm__ float *)v1, - (__gm__ float *)v2); -} diff --git a/test/vpto/cases/micro-op/vector-load-store/vsts-1pt-b16/main.cpp b/test/vpto/cases/micro-op/vector-load-store/vsts-1pt-b16/main.cpp deleted file mode 100644 index 6bc7026e2..000000000 --- a/test/vpto/cases/micro-op/vector-load-store/vsts-1pt-b16/main.cpp +++ /dev/null @@ -1,130 +0,0 @@ -// Copyright (c) 2026 Huawei Technologies Co., Ltd. -// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). -// Please refer to the License for details. You may not use this file except in compliance with the License. -// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -// See LICENSE in the root of the software repository for the full text of the License. - -// ----------------------------------------------------------------------------- -// case: micro-op/vector-load-store/vsts -// family: vector-load-store -// target_ops: pto.vsts -// scenarios: core-f32, contiguous, full-mask, aligned, dist-norm -// NOTE: bulk-generated coverage skeleton. Parser/verifier/lowering failure is -// still a valid test conclusion in the current coverage-first phase. -// ----------------------------------------------------------------------------- -/** -Copyright (c) 2025 Huawei Technologies Co., Ltd. 
-This program is free software, you can redistribute it and/or modify it under the terms and conditions of -CANN Open Software License Agreement Version 2.0 (the "License"). -Please refer to the License for details. You may not use this file except in compliance with the License. -THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -See LICENSE in the root of the software repository for the full text of the License. -*/ - -#include "test_common.h" -#include "acl/acl.h" -#include -#include -#include - -using namespace PtoTestCommon; - -#ifndef TMRGSORT_HPP -struct MrgSortExecutedNumList { - uint16_t mrgSortList0; - uint16_t mrgSortList1; - uint16_t mrgSortList2; - uint16_t mrgSortList3; -}; -#endif - -#define ACL_CHECK(expr) \ - do { \ - const aclError _ret = (expr); \ - if (_ret != ACL_SUCCESS) { \ - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", #expr, (int)_ret, __FILE__, __LINE__); \ - const char *_recent = aclGetRecentErrMsg(); \ - if (_recent != nullptr && _recent[0] != '\0') { \ - std::fprintf(stderr, "[ERROR] RecentErrMsg: %s\n", _recent); \ - } \ - rc = 1; \ - goto cleanup; \ - } \ - } while (0) - -void LaunchVabs_kernel_2d(float *v1, float *v2, void *stream); - -int main() { - size_t elemCount_v1 = 1024; - size_t fileSize_v1 = elemCount_v1 * sizeof(float); - size_t elemCount_v2 = 1024; - size_t fileSize_v2 = elemCount_v2 * sizeof(float); - float *v1Host = nullptr; - float *v1Device = nullptr; - float *v2Host = nullptr; - float *v2Device = nullptr; - - int rc = 0; - bool aclInited = false; - bool deviceSet = false; - int deviceId = 0; - aclrtStream stream = nullptr; - - ACL_CHECK(aclInit(nullptr)); - aclInited = true; - if (const char *envDevice = std::getenv("ACL_DEVICE_ID")) { - deviceId = std::atoi(envDevice); - } - ACL_CHECK(aclrtSetDevice(deviceId)); - deviceSet = true; - 
ACL_CHECK(aclrtCreateStream(&stream)); - - ACL_CHECK(aclrtMallocHost((void **)(&v1Host), fileSize_v1)); - ACL_CHECK(aclrtMallocHost((void **)(&v2Host), fileSize_v2)); - ACL_CHECK(aclrtMalloc((void **)&v1Device, fileSize_v1, ACL_MEM_MALLOC_HUGE_FIRST)); - ACL_CHECK(aclrtMalloc((void **)&v2Device, fileSize_v2, ACL_MEM_MALLOC_HUGE_FIRST)); - - ReadFile("./v1.bin", fileSize_v1, v1Host, fileSize_v1); - ReadFile("./v2.bin", fileSize_v2, v2Host, fileSize_v2); - ACL_CHECK(aclrtMemcpy(v1Device, fileSize_v1, v1Host, fileSize_v1, ACL_MEMCPY_HOST_TO_DEVICE)); - ACL_CHECK(aclrtMemcpy(v2Device, fileSize_v2, v2Host, fileSize_v2, ACL_MEMCPY_HOST_TO_DEVICE)); - LaunchVabs_kernel_2d(v1Device, v2Device, stream); - - ACL_CHECK(aclrtSynchronizeStream(stream)); - ACL_CHECK(aclrtMemcpy(v2Host, fileSize_v2, v2Device, fileSize_v2, ACL_MEMCPY_DEVICE_TO_HOST)); - - WriteFile("./v2.bin", v2Host, fileSize_v2); - -cleanup: - aclrtFree(v1Device); - aclrtFree(v2Device); - aclrtFreeHost(v1Host); - aclrtFreeHost(v2Host); - if (stream != nullptr) { - const aclError _ret = aclrtDestroyStream(stream); - if (_ret != ACL_SUCCESS) { - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", - "aclrtDestroyStream(stream)", (int)_ret, __FILE__, __LINE__); - } - stream = nullptr; - } - if (deviceSet) { - const aclError _ret = aclrtResetDevice(deviceId); - if (_ret != ACL_SUCCESS) { - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", - "aclrtResetDevice(deviceId)", (int)_ret, __FILE__, __LINE__); - } - } - if (aclInited) { - const aclError _ret = aclFinalize(); - if (_ret != ACL_SUCCESS) { - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", - "aclFinalize()", (int)_ret, __FILE__, __LINE__); - } - } - - return rc; -} diff --git a/test/vpto/cases/micro-op/vector-load-store/vsts-pk-b16/compare.py b/test/vpto/cases/micro-op/vector-load-store/vsts-pk-b16/compare.py deleted file mode 100755 index 058c478a5..000000000 --- a/test/vpto/cases/micro-op/vector-load-store/vsts-pk-b16/compare.py +++ /dev/null 
@@ -1,91 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) 2026 Huawei Technologies Co., Ltd. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. - -# case: micro-op/vector-load-store/vsts-pk-b16 -# family: vector-load-store -# target_ops: pto.vsts -# scenarios: core-i16, full-mask, aligned, dist-pk-b16 -# coding=utf-8 - -import os -import sys -import numpy as np - -OUTPUT_BUFFER_BYTES = 4096 -# Keep this aligned with kernel.pto loop bound (offset: 0..1024 step 128 on i16). -ACTIVE_ELEMS = 1024 -LANES = 128 -BYTES_PER_ELEM = 2 - - -def build_checked_mask(total_bytes): - # For this case kernel: - # - loop offset: 0..1024 step 128 (i16 elements) - # - dist=PK_B16 stores 1 byte per active i16 element - # So each iteration writes 128 bytes at dst_byte_base = offset * 2. 
- mask = np.zeros((total_bytes,), dtype=bool) - for offset in range(0, ACTIVE_ELEMS, LANES): - dst_byte_base = offset * BYTES_PER_ELEM - mask[dst_byte_base : dst_byte_base + LANES] = True - return mask - - -def compare_bin(golden_path, output_path): - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - - golden = np.fromfile(golden_path, dtype=np.uint8) - output = np.fromfile(output_path, dtype=np.uint8) - if golden.shape != output.shape: - print(f"[ERROR] Shape mismatch: {golden.shape} vs {output.shape}") - return False - - if golden.size != OUTPUT_BUFFER_BYTES: - print( - f"[ERROR] Unexpected byte size for this case: got {golden.size}, expected {OUTPUT_BUFFER_BYTES}" - ) - return False - - checked = build_checked_mask(golden.size) - checked_golden = golden[checked] - checked_output = output[checked] - if not np.array_equal(checked_golden, checked_output): - diff = np.nonzero(checked_golden != checked_output)[0] - idx = int(diff[0]) if diff.size else 0 - global_idx = int(np.nonzero(checked)[0][idx]) if diff.size else 0 - print( - f"[ERROR] Mismatch (checked footprint): {golden_path} vs {output_path}, " - f"first diff at checked_idx={idx}, global_idx={global_idx} " - f"(golden=0x{int(checked_golden[idx]):02x}, out=0x{int(checked_output[idx]):02x})" - ) - return False - print( - f"[INFO] compared writable footprint only: {int(np.count_nonzero(checked))}/{golden.size} bytes" - ) - return True - - -def main(): - strict = os.getenv("COMPARE_STRICT", "1") != "0" - ok = compare_bin("golden_v2.bin", "v2.bin") - if not ok: - if strict: - print("[ERROR] compare failed") - sys.exit(2) - print("[WARN] compare failed (non-gating)") - return - print("[INFO] compare passed") - - -if __name__ == "__main__": - main() diff --git a/test/vpto/cases/micro-op/vector-load-store/vsts-pk-b16/golden.py 
b/test/vpto/cases/micro-op/vector-load-store/vsts-pk-b16/golden.py deleted file mode 100755 index b0ebf667c..000000000 --- a/test/vpto/cases/micro-op/vector-load-store/vsts-pk-b16/golden.py +++ /dev/null @@ -1,65 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) 2026 Huawei Technologies Co., Ltd. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. - -# case: micro-op/vector-load-store/vsts-pk-b16 -# family: vector-load-store -# target_ops: pto.vsts -# scenarios: core-i16, full-mask, aligned, dist-pk-b16 -# coding=utf-8 - -import argparse -from pathlib import Path - -import numpy as np - - -OUTPUT_BUFFER_BYTES = 4096 -TOTAL_ELEMS_I16 = OUTPUT_BUFFER_BYTES // 2 -# This case kernel only iterates 0..1024 on i16 lanes, so only 1024 packed bytes -# are semantically writable by vsts(pk_b16) in this testcase. -ACTIVE_ELEMS = 1024 -LANES = 128 -SEED = 19 - - -def generate(output_dir: Path, seed: int) -> None: - rng = np.random.default_rng(seed) - v1 = rng.integers(-(2**15), 2**15, size=(TOTAL_ELEMS_I16,), dtype=np.int16) - v2 = rng.integers(0, 256, size=(OUTPUT_BUFFER_BYTES,), dtype=np.uint8) - golden_v2 = v2.copy() - - # PK_B16: write low 8 bits of each active b16 element as a compact byte stream. - # Destination address is unchanged for non-post-update form; within each 256B - # lane chunk only the first 128B are overwritten. 
- v1_u16 = v1.view(np.uint16) - packed_bytes_per_chunk = LANES - for offset in range(0, ACTIVE_ELEMS, LANES): - src = v1_u16[offset : offset + LANES] - packed = (src & 0x00FF).astype(np.uint8) - dst_byte_base = offset * 2 - golden_v2[dst_byte_base : dst_byte_base + packed_bytes_per_chunk] = packed - - output_dir.mkdir(parents=True, exist_ok=True) - v1.tofile(output_dir / "v1.bin") - v2.tofile(output_dir / "v2.bin") - golden_v2.tofile(output_dir / "golden_v2.bin") - - -def main() -> None: - parser = argparse.ArgumentParser( - description="Generate numpy-based inputs/golden for VPTO micro-op vsts PK_B16 validation." - ) - parser.add_argument("--output-dir", type=Path, default=Path(".")) - parser.add_argument("--seed", type=int, default=SEED) - args = parser.parse_args() - generate(args.output_dir, args.seed) - - -if __name__ == "__main__": - main() diff --git a/test/vpto/cases/micro-op/vector-load-store/vsts-pk-b16/kernel.pto b/test/vpto/cases/micro-op/vector-load-store/vsts-pk-b16/kernel.pto deleted file mode 100644 index abe05b2d3..000000000 --- a/test/vpto/cases/micro-op/vector-load-store/vsts-pk-b16/kernel.pto +++ /dev/null @@ -1,50 +0,0 @@ -// ----------------------------------------------------------------------------- -// case: micro-op/vector-load-store/vsts-pk-b16 -// family: vector-load-store -// target_ops: pto.vsts -// scenarios: core-i16, full-mask, aligned, dist-pk-b16 -// ----------------------------------------------------------------------------- -// Validate one representative `PK_B16` store distribution on `b16`. 
- -module attributes {pto.target_arch = "a5", pto.kernel_kind = #pto.kernel_kind} { - func.func @vsts_pk_b16_kernel(%arg0: !pto.ptr, %arg1: !pto.ptr) attributes {pto.kernel} { - %c0 = arith.constant 0 : index - %c1 = arith.constant 1 : index - %c128 = arith.constant 128 : index - %c1024 = arith.constant 1024 : index - %c0_i64 = arith.constant 0 : i64 - %c1_i64 = arith.constant 1 : i64 - %c32_i64 = arith.constant 32 : i64 - %c128_i64 = arith.constant 128 : i64 - %c4096_i64 = arith.constant 4096 : i64 - %c1024_i32 = arith.constant 1024 : i32 - - %ub_in = pto.castptr %c0_i64 : i64 -> !pto.ptr - %ub_out = pto.castptr %c4096_i64 : i64 -> !pto.ptr - - %false = arith.constant false - pto.mte_gm_ub %arg0, %ub_in, %c0_i64, %c128_i64 - nburst(%c32_i64, %c128_i64, %c128_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 - - pto.set_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] - pto.wait_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] - - pto.vecscope { - %_:1 = scf.for %offset = %c0 to %c1024 step %c128 iter_args(%remaining = %c1024_i32) -> (i32) { - %mask, %next_remaining = pto.plt_b16 %remaining : i32 -> !pto.mask, i32 - %vec = pto.vlds %ub_in[%offset] : !pto.ptr -> !pto.vreg<128xi16> - pto.vsts %vec, %ub_out[%offset], %mask {dist = "PK_B16"} : !pto.vreg<128xi16>, !pto.ptr, !pto.mask - scf.yield %next_remaining : i32 - } - } - - pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.mte_ub_gm %ub_out, %arg1, %c128_i64 - nburst(%c32_i64, %c128_i64, %c128_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64 - pto.barrier #pto.pipe - return - } -} diff --git a/test/vpto/cases/micro-op/vector-load-store/vsts-pk-b16/launch.cpp b/test/vpto/cases/micro-op/vector-load-store/vsts-pk-b16/launch.cpp deleted file mode 100644 index 9a902908c..000000000 --- a/test/vpto/cases/micro-op/vector-load-store/vsts-pk-b16/launch.cpp +++ /dev/null @@ -1,71 +0,0 @@ -// Copyright (c) 2026 Huawei Technologies Co., Ltd. 
-// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). -// Please refer to the License for details. You may not use this file except in compliance with the License. -// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -// See LICENSE in the root of the software repository for the full text of the License. - -// ----------------------------------------------------------------------------- -// case: micro-op/vector-load-store/vsts -// family: vector-load-store -// target_ops: pto.vsts -// scenarios: core-f32, contiguous, full-mask, aligned, dist-norm -// NOTE: bulk-generated coverage skeleton. Parser/verifier/lowering failure is -// still a valid test conclusion in the current coverage-first phase. -// ----------------------------------------------------------------------------- -// --------------------------------------------------------------------------- -// PTOAS compatibility layer -// -// The upstream pto-isa headers reference some FP8/FP4 types and the -// __VEC_SCOPE__ marker that are not available on every AICore arch/toolchain -// combination (e.g. __NPU_ARCH__==2201). -// -// For our PTOAS-generated kernels we don't rely on these types today, but the -// headers still mention them in templates/static_asserts. Provide minimal -// fallbacks to keep compilation working on dav-c220. 
-// --------------------------------------------------------------------------- -#ifndef __VEC_SCOPE__ -#define __VEC_SCOPE__ -#endif - -#if defined(__CCE_AICORE__) && defined(__NPU_ARCH__) && (__NPU_ARCH__ == 2201) -typedef struct { unsigned char v; } hifloat8_t; -typedef struct { unsigned char v; } float8_e4m3_t; -typedef struct { unsigned char v; } float8_e5m2_t; -typedef struct { unsigned char v; } float8_e8m0_t; -typedef struct { unsigned char v; } float4_e1m2x2_t; -typedef struct { unsigned char v; } float4_e2m1x2_t; -#endif -#include - -// AICore printf support is gated behind `--cce-enable-print` on some -// toolchains. When enabled, include the CCE print header so `cce::printf` -// resolves in device compilation. -#if defined(__CCE_AICORE__) && defined(PTOAS_ENABLE_CCE_PRINT) -#include -#endif - -// Some PTO-ISA types are only available in the __CCE_AICORE__ compilation -// path, but `bisheng -xcce` still performs a host-side parse pass. -// Provide minimal fallbacks only when the corresponding header wasn't -// pulled in by the selected arch implementation. -#if !defined(__CCE_AICORE__) && !defined(TMRGSORT_HPP) -struct MrgSortExecutedNumList { - uint16_t mrgSortList0; - uint16_t mrgSortList1; - uint16_t mrgSortList2; - uint16_t mrgSortList3; -}; -#endif -#ifndef __CPU_SIM -#include "acl/acl.h" -#endif - -extern "C" __global__ [aicore] void vsts_pk_b16_kernel(__gm__ float *v1, - __gm__ float *v2); - -void LaunchVabs_kernel_2d(float *v1, float *v2, void *stream) { - vsts_pk_b16_kernel<<<1, nullptr, stream>>>((__gm__ float *)v1, - (__gm__ float *)v2); -} diff --git a/test/vpto/cases/micro-op/vector-load-store/vsts-pk-b16/main.cpp b/test/vpto/cases/micro-op/vector-load-store/vsts-pk-b16/main.cpp deleted file mode 100644 index 6bc7026e2..000000000 --- a/test/vpto/cases/micro-op/vector-load-store/vsts-pk-b16/main.cpp +++ /dev/null @@ -1,130 +0,0 @@ -// Copyright (c) 2026 Huawei Technologies Co., Ltd. 
-// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). -// Please refer to the License for details. You may not use this file except in compliance with the License. -// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -// See LICENSE in the root of the software repository for the full text of the License. - -// ----------------------------------------------------------------------------- -// case: micro-op/vector-load-store/vsts -// family: vector-load-store -// target_ops: pto.vsts -// scenarios: core-f32, contiguous, full-mask, aligned, dist-norm -// NOTE: bulk-generated coverage skeleton. Parser/verifier/lowering failure is -// still a valid test conclusion in the current coverage-first phase. -// ----------------------------------------------------------------------------- -/** -Copyright (c) 2025 Huawei Technologies Co., Ltd. -This program is free software, you can redistribute it and/or modify it under the terms and conditions of -CANN Open Software License Agreement Version 2.0 (the "License"). -Please refer to the License for details. You may not use this file except in compliance with the License. -THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -See LICENSE in the root of the software repository for the full text of the License. 
-*/ - -#include "test_common.h" -#include "acl/acl.h" -#include -#include -#include - -using namespace PtoTestCommon; - -#ifndef TMRGSORT_HPP -struct MrgSortExecutedNumList { - uint16_t mrgSortList0; - uint16_t mrgSortList1; - uint16_t mrgSortList2; - uint16_t mrgSortList3; -}; -#endif - -#define ACL_CHECK(expr) \ - do { \ - const aclError _ret = (expr); \ - if (_ret != ACL_SUCCESS) { \ - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", #expr, (int)_ret, __FILE__, __LINE__); \ - const char *_recent = aclGetRecentErrMsg(); \ - if (_recent != nullptr && _recent[0] != '\0') { \ - std::fprintf(stderr, "[ERROR] RecentErrMsg: %s\n", _recent); \ - } \ - rc = 1; \ - goto cleanup; \ - } \ - } while (0) - -void LaunchVabs_kernel_2d(float *v1, float *v2, void *stream); - -int main() { - size_t elemCount_v1 = 1024; - size_t fileSize_v1 = elemCount_v1 * sizeof(float); - size_t elemCount_v2 = 1024; - size_t fileSize_v2 = elemCount_v2 * sizeof(float); - float *v1Host = nullptr; - float *v1Device = nullptr; - float *v2Host = nullptr; - float *v2Device = nullptr; - - int rc = 0; - bool aclInited = false; - bool deviceSet = false; - int deviceId = 0; - aclrtStream stream = nullptr; - - ACL_CHECK(aclInit(nullptr)); - aclInited = true; - if (const char *envDevice = std::getenv("ACL_DEVICE_ID")) { - deviceId = std::atoi(envDevice); - } - ACL_CHECK(aclrtSetDevice(deviceId)); - deviceSet = true; - ACL_CHECK(aclrtCreateStream(&stream)); - - ACL_CHECK(aclrtMallocHost((void **)(&v1Host), fileSize_v1)); - ACL_CHECK(aclrtMallocHost((void **)(&v2Host), fileSize_v2)); - ACL_CHECK(aclrtMalloc((void **)&v1Device, fileSize_v1, ACL_MEM_MALLOC_HUGE_FIRST)); - ACL_CHECK(aclrtMalloc((void **)&v2Device, fileSize_v2, ACL_MEM_MALLOC_HUGE_FIRST)); - - ReadFile("./v1.bin", fileSize_v1, v1Host, fileSize_v1); - ReadFile("./v2.bin", fileSize_v2, v2Host, fileSize_v2); - ACL_CHECK(aclrtMemcpy(v1Device, fileSize_v1, v1Host, fileSize_v1, ACL_MEMCPY_HOST_TO_DEVICE)); - ACL_CHECK(aclrtMemcpy(v2Device, 
fileSize_v2, v2Host, fileSize_v2, ACL_MEMCPY_HOST_TO_DEVICE)); - LaunchVabs_kernel_2d(v1Device, v2Device, stream); - - ACL_CHECK(aclrtSynchronizeStream(stream)); - ACL_CHECK(aclrtMemcpy(v2Host, fileSize_v2, v2Device, fileSize_v2, ACL_MEMCPY_DEVICE_TO_HOST)); - - WriteFile("./v2.bin", v2Host, fileSize_v2); - -cleanup: - aclrtFree(v1Device); - aclrtFree(v2Device); - aclrtFreeHost(v1Host); - aclrtFreeHost(v2Host); - if (stream != nullptr) { - const aclError _ret = aclrtDestroyStream(stream); - if (_ret != ACL_SUCCESS) { - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", - "aclrtDestroyStream(stream)", (int)_ret, __FILE__, __LINE__); - } - stream = nullptr; - } - if (deviceSet) { - const aclError _ret = aclrtResetDevice(deviceId); - if (_ret != ACL_SUCCESS) { - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", - "aclrtResetDevice(deviceId)", (int)_ret, __FILE__, __LINE__); - } - } - if (aclInited) { - const aclError _ret = aclFinalize(); - if (_ret != ACL_SUCCESS) { - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", - "aclFinalize()", (int)_ret, __FILE__, __LINE__); - } - } - - return rc; -} diff --git a/test/vpto/cases/micro-op/vector-load-store/vsts-pk-b64-f32/compare.py b/test/vpto/cases/micro-op/vector-load-store/vsts-pk-b64-f32/compare.py deleted file mode 100644 index 4c19eb038..000000000 --- a/test/vpto/cases/micro-op/vector-load-store/vsts-pk-b64-f32/compare.py +++ /dev/null @@ -1,40 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) 2026 Huawei Technologies Co., Ltd. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. 
-# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. - - -import os -import sys - -import numpy as np - - -def compare_bin(golden_path, output_path, dtype, eps): - if not os.path.exists(golden_path) or not os.path.exists(output_path): - return False - golden = np.fromfile(golden_path, dtype=dtype) - output = np.fromfile(output_path, dtype=dtype) - return golden.shape == output.shape and np.allclose( - golden, output, atol=eps, rtol=eps, equal_nan=True - ) - - -def main(): - strict = os.getenv("COMPARE_STRICT", "1") != "0" - ok = compare_bin("golden_v2.bin", "v2.bin", np.float32, 0.0) - if not ok: - if strict: - print("[ERROR] compare failed") - sys.exit(2) - print("[WARN] compare failed (non-gating)") - return - print("[INFO] compare passed") - - -if __name__ == "__main__": - main() diff --git a/test/vpto/cases/micro-op/vector-load-store/vsts-pk-b64-f32/golden.py b/test/vpto/cases/micro-op/vector-load-store/vsts-pk-b64-f32/golden.py deleted file mode 100644 index c5635db7c..000000000 --- a/test/vpto/cases/micro-op/vector-load-store/vsts-pk-b64-f32/golden.py +++ /dev/null @@ -1,54 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) 2026 Huawei Technologies Co., Ltd. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. 
-# See LICENSE in the root of the software repository for the full text of the License. - -# case: micro-op/vector-load-store/vsts-pk-b64-f32 -# family: vector-load-store -# target_ops: pto.vsts -# scenarios: core-f32, full-mask, aligned, dist-pk-b64, width-agnostic-dist - -import argparse -from pathlib import Path - -import numpy as np - - -ELEMENTS = 1024 -LANES = 64 -SEED = 19 - - -def generate(output_dir: Path, seed: int) -> None: - rng = np.random.default_rng(seed) - v1 = rng.uniform(-8.0, 8.0, size=(ELEMENTS,)).astype(np.float32) - v2 = rng.uniform(-8.0, 8.0, size=(ELEMENTS,)).astype(np.float32) - golden_v2 = np.array(v2, copy=True) - - for offset in range(0, ELEMENTS, LANES): - chunk = v1[offset : offset + LANES] - packed = chunk[0::2] - golden_v2[offset : offset + packed.size] = packed - - output_dir.mkdir(parents=True, exist_ok=True) - v1.tofile(output_dir / "v1.bin") - v2.tofile(output_dir / "v2.bin") - golden_v2.tofile(output_dir / "golden_v2.bin") - - -def main() -> None: - parser = argparse.ArgumentParser( - description="Generate numpy-based inputs/golden for VPTO micro-op vsts PK_B64 on f32 validation." 
- ) - parser.add_argument("--output-dir", type=Path, default=Path(".")) - parser.add_argument("--seed", type=int, default=SEED) - args = parser.parse_args() - generate(args.output_dir, args.seed) - - -if __name__ == "__main__": - main() diff --git a/test/vpto/cases/micro-op/vector-load-store/vsts-pk-b64-f32/kernel.pto b/test/vpto/cases/micro-op/vector-load-store/vsts-pk-b64-f32/kernel.pto deleted file mode 100644 index 315724f96..000000000 --- a/test/vpto/cases/micro-op/vector-load-store/vsts-pk-b64-f32/kernel.pto +++ /dev/null @@ -1,50 +0,0 @@ -// ----------------------------------------------------------------------------- -// case: micro-op/vector-load-store/vsts-pk-b64-f32 -// family: vector-load-store -// target_ops: pto.vsts -// scenarios: core-f32, full-mask, aligned, dist-pk-b64, width-agnostic-dist -// ----------------------------------------------------------------------------- -module attributes {pto.target_arch = "a5", pto.kernel_kind = #pto.kernel_kind} { - func.func @vsts_pk_b64_f32_kernel(%arg0: !pto.ptr, %arg1: !pto.ptr) attributes {pto.kernel} { - %c0 = arith.constant 0 : index - %c64 = arith.constant 64 : index - %c1024 = arith.constant 1024 : index - %c0_i64 = arith.constant 0 : i64 - %c1_i64 = arith.constant 1 : i64 - %c32_i64 = arith.constant 32 : i64 - %c128_i64 = arith.constant 128 : i64 - %c4096_i64 = arith.constant 4096 : i64 - %c1024_i32 = arith.constant 1024 : i32 - - %ub_in = pto.castptr %c0_i64 : i64 -> !pto.ptr - %ub_out = pto.castptr %c4096_i64 : i64 -> !pto.ptr - - %false = arith.constant false - pto.mte_gm_ub %arg0, %ub_in, %c0_i64, %c128_i64 - nburst(%c32_i64, %c128_i64, %c128_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 - pto.mte_gm_ub %arg1, %ub_out, %c0_i64, %c128_i64 - nburst(%c32_i64, %c128_i64, %c128_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 - - pto.set_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] - pto.wait_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] - - pto.vecscope { - %_:1 = scf.for %offset = %c0 to %c1024 
step %c64 iter_args(%remaining = %c1024_i32) -> (i32) { - %mask, %next_remaining = pto.plt_b32 %remaining : i32 -> !pto.mask, i32 - %vec = pto.vlds %ub_in[%offset] : !pto.ptr -> !pto.vreg<64xf32> - pto.vsts %vec, %ub_out[%offset], %mask {dist = "PK_B64"} : !pto.vreg<64xf32>, !pto.ptr, !pto.mask - scf.yield %next_remaining : i32 - } - } - - pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.mte_ub_gm %ub_out, %arg1, %c128_i64 - nburst(%c32_i64, %c128_i64, %c128_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64 - pto.barrier #pto.pipe - return - } -} diff --git a/test/vpto/cases/micro-op/vector-load-store/vsts-pk-b64-f32/launch.cpp b/test/vpto/cases/micro-op/vector-load-store/vsts-pk-b64-f32/launch.cpp deleted file mode 100644 index 039f11e40..000000000 --- a/test/vpto/cases/micro-op/vector-load-store/vsts-pk-b64-f32/launch.cpp +++ /dev/null @@ -1,44 +0,0 @@ -// Copyright (c) 2026 Huawei Technologies Co., Ltd. -// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). -// Please refer to the License for details. You may not use this file except in compliance with the License. -// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -// See LICENSE in the root of the software repository for the full text of the License. 
- -#ifndef __VEC_SCOPE__ -#define __VEC_SCOPE__ -#endif - -#if defined(__CCE_AICORE__) && defined(__NPU_ARCH__) && (__NPU_ARCH__ == 2201) -typedef struct { unsigned char v; } hifloat8_t; -typedef struct { unsigned char v; } float8_e4m3_t; -typedef struct { unsigned char v; } float8_e5m2_t; -typedef struct { unsigned char v; } float8_e8m0_t; -typedef struct { unsigned char v; } float4_e1m2x2_t; -typedef struct { unsigned char v; } float4_e2m1x2_t; -#endif -#include - -#if defined(__CCE_AICORE__) && defined(PTOAS_ENABLE_CCE_PRINT) -#include -#endif - -#if !defined(__CCE_AICORE__) && !defined(TMRGSORT_HPP) -struct MrgSortExecutedNumList { - uint16_t mrgSortList0; - uint16_t mrgSortList1; - uint16_t mrgSortList2; - uint16_t mrgSortList3; -}; -#endif -#ifndef __CPU_SIM -#include "acl/acl.h" -#endif - -extern "C" __global__ [aicore] void vsts_pk_b64_f32_kernel(__gm__ float *v1, - __gm__ float *v2); - -void LaunchVsts_pk_b64_f32_kernel(float *v1, float *v2, void *stream) { - vsts_pk_b64_f32_kernel<<<1, nullptr, stream>>>((__gm__ float *)v1, (__gm__ float *)v2); -} diff --git a/test/vpto/cases/micro-op/vector-load-store/vsts-pk-b64-f32/main.cpp b/test/vpto/cases/micro-op/vector-load-store/vsts-pk-b64-f32/main.cpp deleted file mode 100644 index 707a88e05..000000000 --- a/test/vpto/cases/micro-op/vector-load-store/vsts-pk-b64-f32/main.cpp +++ /dev/null @@ -1,122 +0,0 @@ -// Copyright (c) 2026 Huawei Technologies Co., Ltd. -// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). -// Please refer to the License for details. You may not use this file except in compliance with the License. -// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. 
-// See LICENSE in the root of the software repository for the full text of the License. - -/** -Copyright (c) 2025 Huawei Technologies Co., Ltd. -This program is free software, you can redistribute it and/or modify it under the terms and conditions of -CANN Open Software License Agreement Version 2.0 (the "License"). -Please refer to the License for details. You may not use this file except in compliance with the License. -THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -See LICENSE in the root of the software repository for the full text of the License. -*/ - -#include "test_common.h" -#include "acl/acl.h" -#include -#include -#include - -using namespace PtoTestCommon; - -#ifndef TMRGSORT_HPP -struct MrgSortExecutedNumList { - uint16_t mrgSortList0; - uint16_t mrgSortList1; - uint16_t mrgSortList2; - uint16_t mrgSortList3; -}; -#endif - -#define ACL_CHECK(expr) \ - do { \ - const aclError _ret = (expr); \ - if (_ret != ACL_SUCCESS) { \ - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", #expr, (int)_ret, __FILE__, __LINE__); \ - const char *_recent = aclGetRecentErrMsg(); \ - if (_recent != nullptr && _recent[0] != '\0') { \ - std::fprintf(stderr, "[ERROR] RecentErrMsg: %s\n", _recent); \ - } \ - rc = 1; \ - goto cleanup; \ - } \ - } while (0) - -void LaunchVsts_pk_b64_f32_kernel(float *v1, float *v2, void *stream); - -int main() { - size_t elemCount_v1 = 1024; - size_t fileSize_v1 = elemCount_v1 * sizeof(float); - size_t elemCount_v2 = 1024; - size_t fileSize_v2 = elemCount_v2 * sizeof(float); - float *v1Host = nullptr; - float *v1Device = nullptr; - float *v2Host = nullptr; - float *v2Device = nullptr; - - int rc = 0; - bool aclInited = false; - bool deviceSet = false; - int deviceId = 0; - aclrtStream stream = nullptr; - - ACL_CHECK(aclInit(nullptr)); - aclInited = true; - if (const char 
*envDevice = std::getenv("ACL_DEVICE_ID")) { - deviceId = std::atoi(envDevice); - } - ACL_CHECK(aclrtSetDevice(deviceId)); - deviceSet = true; - ACL_CHECK(aclrtCreateStream(&stream)); - - ACL_CHECK(aclrtMallocHost((void **)(&v1Host), fileSize_v1)); - ACL_CHECK(aclrtMallocHost((void **)(&v2Host), fileSize_v2)); - ACL_CHECK(aclrtMalloc((void **)&v1Device, fileSize_v1, ACL_MEM_MALLOC_HUGE_FIRST)); - ACL_CHECK(aclrtMalloc((void **)&v2Device, fileSize_v2, ACL_MEM_MALLOC_HUGE_FIRST)); - - ReadFile("./v1.bin", fileSize_v1, v1Host, fileSize_v1); - ReadFile("./v2.bin", fileSize_v2, v2Host, fileSize_v2); - ACL_CHECK(aclrtMemcpy(v1Device, fileSize_v1, v1Host, fileSize_v1, ACL_MEMCPY_HOST_TO_DEVICE)); - ACL_CHECK(aclrtMemcpy(v2Device, fileSize_v2, v2Host, fileSize_v2, ACL_MEMCPY_HOST_TO_DEVICE)); - LaunchVsts_pk_b64_f32_kernel(v1Device, v2Device, stream); - - ACL_CHECK(aclrtSynchronizeStream(stream)); - ACL_CHECK(aclrtMemcpy(v2Host, fileSize_v2, v2Device, fileSize_v2, ACL_MEMCPY_DEVICE_TO_HOST)); - - WriteFile("./v2.bin", v2Host, fileSize_v2); - -cleanup: - aclrtFree(v1Device); - aclrtFree(v2Device); - aclrtFreeHost(v1Host); - aclrtFreeHost(v2Host); - if (stream != nullptr) { - const aclError _ret = aclrtDestroyStream(stream); - if (_ret != ACL_SUCCESS) { - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", - "aclrtDestroyStream(stream)", (int)_ret, __FILE__, __LINE__); - } - stream = nullptr; - } - if (deviceSet) { - const aclError _ret = aclrtResetDevice(deviceId); - if (_ret != ACL_SUCCESS) { - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", - "aclrtResetDevice(deviceId)", (int)_ret, __FILE__, __LINE__); - } - } - if (aclInited) { - const aclError _ret = aclFinalize(); - if (_ret != ACL_SUCCESS) { - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", - "aclFinalize()", (int)_ret, __FILE__, __LINE__); - } - } - - return rc; -} diff --git a/test/vpto/cases/micro-op/vector-load-store/vsts-post-update/compare.py 
b/test/vpto/cases/micro-op/vector-load-store/vsts-post-update/compare.py deleted file mode 100644 index f0e4cc2b9..000000000 --- a/test/vpto/cases/micro-op/vector-load-store/vsts-post-update/compare.py +++ /dev/null @@ -1,47 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) 2026 Huawei Technologies Co., Ltd. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. - -import os -import sys - -import numpy as np - - -def main() -> None: - golden_path = "golden_output.bin" - output_path = "output.bin" - strict = os.getenv("COMPARE_STRICT", "1") != "0" - - if not os.path.exists(golden_path) or not os.path.exists(output_path): - print("[ERROR] missing golden_output.bin or output.bin") - sys.exit(2 if strict else 0) - - golden = np.fromfile(golden_path, dtype=np.float32) - output = np.fromfile(output_path, dtype=np.float32) - ok = golden.shape == output.shape and np.allclose( - golden, output, atol=0.0001, rtol=0.0001, equal_nan=True - ) - if not ok: - if golden.shape != output.shape: - print(f"[ERROR] shape mismatch: {golden.shape} vs {output.shape}") - elif golden.size: - diff = np.abs(golden.astype(np.float64) - output.astype(np.float64)) - idx = int(np.argmax(diff)) - print( - f"[ERROR] mismatch at idx={idx}: golden={golden[idx]} " - f"output={output[idx]} diff={diff[idx]}" - ) - if strict: - sys.exit(2) - return - print("[INFO] compare passed") - - -if __name__ == "__main__": - main() diff --git 
a/test/vpto/cases/micro-op/vector-load-store/vsts-post-update/golden.py b/test/vpto/cases/micro-op/vector-load-store/vsts-post-update/golden.py deleted file mode 100644 index 1a5839389..000000000 --- a/test/vpto/cases/micro-op/vector-load-store/vsts-post-update/golden.py +++ /dev/null @@ -1,42 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) 2026 Huawei Technologies Co., Ltd. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. - -import argparse -from pathlib import Path - -import numpy as np - - -ELEMENTS = 1024 -SEED = 19 - - -def generate(output_dir: Path, seed: int) -> None: - rng = np.random.default_rng(seed) - data = rng.uniform(-8.0, 8.0, size=(ELEMENTS,)).astype(np.float32) - output = np.zeros((ELEMENTS,), dtype=np.float32) - - output_dir.mkdir(parents=True, exist_ok=True) - data.tofile(output_dir / "input.bin") - output.tofile(output_dir / "output.bin") - data.tofile(output_dir / "golden_output.bin") - - -def main() -> None: - parser = argparse.ArgumentParser( - description="Generate inputs/golden for VPTO vsts post-update validation." 
- ) - parser.add_argument("--output-dir", type=Path, default=Path(".")) - parser.add_argument("--seed", type=int, default=SEED) - args = parser.parse_args() - generate(args.output_dir, args.seed) - - -if __name__ == "__main__": - main() diff --git a/test/vpto/cases/micro-op/vector-load-store/vsts-post-update/kernel.pto b/test/vpto/cases/micro-op/vector-load-store/vsts-post-update/kernel.pto deleted file mode 100644 index 771f9cbcf..000000000 --- a/test/vpto/cases/micro-op/vector-load-store/vsts-post-update/kernel.pto +++ /dev/null @@ -1,54 +0,0 @@ -// Copyright (c) 2026 Huawei Technologies Co., Ltd. -// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). -// Please refer to the License for details. You may not use this file except in compliance with the License. -// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -// See LICENSE in the root of the software repository for the full text of the License. 
- -// case: micro-op/vector-load-store/vsts-post-update -// family: vector-load-store -// target_ops: pto.vsts -// scenarios: core-f32, contiguous, full-mask, post-update-result, dist-norm - -module attributes {pto.target_arch = "a5", pto.kernel_kind = #pto.kernel_kind} { - func.func @vsts_post_update_kernel(%input: !pto.ptr, %output: !pto.ptr) attributes {pto.kernel} { - %c0 = arith.constant 0 : index - %c64 = arith.constant 64 : index - %c1024 = arith.constant 1024 : index - %c0_i64 = arith.constant 0 : i64 - %c32_i64 = arith.constant 32 : i64 - %c128_i64 = arith.constant 128 : i64 - %c4096_i64 = arith.constant 4096 : i64 - %c1024_i32 = arith.constant 1024 : i32 - - %ub_in = pto.castptr %c0_i64 : i64 -> !pto.ptr - %ub_out = pto.castptr %c4096_i64 : i64 -> !pto.ptr - - pto.mte_gm_ub %input, %ub_in, %c0_i64, %c128_i64 - nburst(%c32_i64, %c128_i64, %c128_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 - - pto.set_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] - pto.wait_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] - - pto.vecscope { - %_:2 = scf.for %offset = %c0 to %c1024 step %c64 - iter_args(%remaining = %c1024_i32, %dst = %ub_out) - -> (i32, !pto.ptr) { - %mask, %next_remaining = pto.plt_b32 %remaining : i32 -> !pto.mask, i32 - %vec = pto.vlds %ub_in[%offset] : !pto.ptr -> !pto.vreg<64xf32> - %updated = pto.vsts %vec, %dst[%c64], %mask {dist = "NORM_B32"} : !pto.vreg<64xf32>, !pto.ptr, !pto.mask -> !pto.ptr - scf.yield %next_remaining, %updated : i32, !pto.ptr - } - } - - pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.mte_ub_gm %ub_out, %output, %c128_i64 - nburst(%c32_i64, %c128_i64, %c128_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64 - pto.barrier #pto.pipe - return - } -} diff --git a/test/vpto/cases/micro-op/vector-load-store/vsts-post-update/launch.cpp b/test/vpto/cases/micro-op/vector-load-store/vsts-post-update/launch.cpp deleted file mode 100644 index f0a79436c..000000000 --- 
a/test/vpto/cases/micro-op/vector-load-store/vsts-post-update/launch.cpp +++ /dev/null @@ -1,43 +0,0 @@ -// Copyright (c) 2026 Huawei Technologies Co., Ltd. -// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). -// Please refer to the License for details. You may not use this file except in compliance with the License. -// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -// See LICENSE in the root of the software repository for the full text of the License. - -#ifndef __VEC_SCOPE__ -#define __VEC_SCOPE__ -#endif - -#if defined(__CCE_AICORE__) && defined(__NPU_ARCH__) && (__NPU_ARCH__ == 2201) -typedef struct { unsigned char v; } hifloat8_t; -typedef struct { unsigned char v; } float8_e4m3_t; -typedef struct { unsigned char v; } float8_e5m2_t; -typedef struct { unsigned char v; } float8_e8m0_t; -typedef struct { unsigned char v; } float4_e1m2x2_t; -typedef struct { unsigned char v; } float4_e2m1x2_t; -#endif - -#include - -#if !defined(__CCE_AICORE__) && !defined(TMRGSORT_HPP) -struct MrgSortExecutedNumList { - uint16_t mrgSortList0; - uint16_t mrgSortList1; - uint16_t mrgSortList2; - uint16_t mrgSortList3; -}; -#endif - -#ifndef __CPU_SIM -#include "acl/acl.h" -#endif - -extern "C" __global__ [aicore] void -vsts_post_update_kernel(__gm__ float *input, __gm__ float *output); - -void LaunchVstsPostUpdate(float *input, float *output, void *stream) { - vsts_post_update_kernel<<<1, nullptr, stream>>>((__gm__ float *)input, - (__gm__ float *)output); -} diff --git a/test/vpto/cases/micro-op/vector-load-store/vsts-post-update/main.cpp b/test/vpto/cases/micro-op/vector-load-store/vsts-post-update/main.cpp deleted file mode 100644 index c12451840..000000000 --- 
a/test/vpto/cases/micro-op/vector-load-store/vsts-post-update/main.cpp +++ /dev/null @@ -1,103 +0,0 @@ -// Copyright (c) 2026 Huawei Technologies Co., Ltd. -// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). -// Please refer to the License for details. You may not use this file except in compliance with the License. -// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -// See LICENSE in the root of the software repository for the full text of the License. - -#include "acl/acl.h" -#include "test_common.h" - -#include -#include - -using namespace PtoTestCommon; - -void LaunchVstsPostUpdate(float *input, float *output, void *stream); - -namespace { -constexpr size_t kElementCount = 1024; -constexpr size_t kBufferSize = kElementCount * sizeof(float); -} - -#define ACL_CHECK(expr) \ - do { \ - const aclError ret = (expr); \ - if (ret != ACL_SUCCESS) { \ - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", #expr, \ - static_cast(ret), __FILE__, __LINE__); \ - const char *recent = aclGetRecentErrMsg(); \ - if (recent != nullptr && recent[0] != '\0') \ - std::fprintf(stderr, "[ERROR] RecentErrMsg: %s\n", recent); \ - rc = 1; \ - goto cleanup; \ - } \ - } while (0) - -int main() { - float *inputHost = nullptr; - float *outputHost = nullptr; - float *inputDevice = nullptr; - float *outputDevice = nullptr; - aclrtStream stream = nullptr; - int rc = 0; - bool aclInited = false; - bool deviceSet = false; - int deviceId = 0; - size_t inputSize = kBufferSize; - size_t outputSize = kBufferSize; - - ACL_CHECK(aclInit(nullptr)); - aclInited = true; - if (const char *envDevice = std::getenv("ACL_DEVICE_ID")) - deviceId = std::atoi(envDevice); - ACL_CHECK(aclrtSetDevice(deviceId)); - deviceSet = 
true; - ACL_CHECK(aclrtCreateStream(&stream)); - - ACL_CHECK(aclrtMallocHost(reinterpret_cast(&inputHost), kBufferSize)); - ACL_CHECK(aclrtMallocHost(reinterpret_cast(&outputHost), kBufferSize)); - ACL_CHECK(aclrtMalloc(reinterpret_cast(&inputDevice), kBufferSize, - ACL_MEM_MALLOC_HUGE_FIRST)); - ACL_CHECK(aclrtMalloc(reinterpret_cast(&outputDevice), kBufferSize, - ACL_MEM_MALLOC_HUGE_FIRST)); - - ReadFile("./input.bin", inputSize, inputHost, kBufferSize); - ReadFile("./output.bin", outputSize, outputHost, kBufferSize); - ACL_CHECK(aclrtMemcpy(inputDevice, kBufferSize, inputHost, kBufferSize, - ACL_MEMCPY_HOST_TO_DEVICE)); - ACL_CHECK(aclrtMemcpy(outputDevice, kBufferSize, outputHost, kBufferSize, - ACL_MEMCPY_HOST_TO_DEVICE)); - - LaunchVstsPostUpdate(inputDevice, outputDevice, stream); - ACL_CHECK(aclrtSynchronizeStream(stream)); - ACL_CHECK(aclrtMemcpy(outputHost, kBufferSize, outputDevice, kBufferSize, - ACL_MEMCPY_DEVICE_TO_HOST)); - WriteFile("./output.bin", outputHost, kBufferSize); - -cleanup: - aclrtFree(inputDevice); - aclrtFree(outputDevice); - aclrtFreeHost(inputHost); - aclrtFreeHost(outputHost); - if (stream != nullptr) { - const aclError ret = aclrtDestroyStream(stream); - if (ret != ACL_SUCCESS) - std::fprintf(stderr, "[ERROR] aclrtDestroyStream failed: %d\n", - static_cast(ret)); - } - if (deviceSet) { - const aclError ret = aclrtResetDevice(deviceId); - if (ret != ACL_SUCCESS) - std::fprintf(stderr, "[ERROR] aclrtResetDevice failed: %d\n", - static_cast(ret)); - } - if (aclInited) { - const aclError ret = aclFinalize(); - if (ret != ACL_SUCCESS) - std::fprintf(stderr, "[ERROR] aclFinalize failed: %d\n", - static_cast(ret)); - } - return rc; -} diff --git a/test/vpto/cases/micro-op/vector-load-store/vsts-tail/compare.py b/test/vpto/cases/micro-op/vector-load-store/vsts-tail/compare.py deleted file mode 100755 index 1821ec6aa..000000000 --- a/test/vpto/cases/micro-op/vector-load-store/vsts-tail/compare.py +++ /dev/null @@ -1,257 +0,0 @@ 
-#!/usr/bin/env python3 -# Copyright (c) 2026 Huawei Technologies Co., Ltd. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. - -# case: micro-op/vector-load-store/vsts-tail -# family: vector-load-store -# target_ops: pto.vsts -# scenarios: core-f32, contiguous, tail-mask, aligned, dist-norm -# NOTE: bulk-generated coverage skeleton. -# coding=utf-8 - -import os -import sys -import numpy as np - - -REPEAT_BYTES = 256 - - -def _ceil_div(x, y): - return (x + y - 1) // y - - -def _packed_pred_storage_bytes(logical_elems, src_elem_bytes): - logical_elems = int(logical_elems) - src_elem_bytes = int(src_elem_bytes) - if logical_elems <= 0: - raise ValueError(f"logical_elems must be > 0, got {logical_elems}") - if src_elem_bytes not in (1, 2, 4): - raise ValueError(f"unsupported packed predicate source size: {src_elem_bytes}") - - repeat_elems = REPEAT_BYTES // src_elem_bytes - if src_elem_bytes == 4: - repeat_times = _ceil_div(logical_elems, repeat_elems) + 1 - loop_count = repeat_times // 2 - return loop_count * 16 - - repeat_times = _ceil_div(logical_elems, repeat_elems) - return repeat_times * (repeat_elems // 8) - - -def compare_bin(golden_path, output_path, dtype, eps): - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - dtype_np = np.dtype(dtype) - golden = np.fromfile(golden_path, 
dtype=dtype_np) - output = np.fromfile(output_path, dtype=dtype_np) - if golden.shape != output.shape: - print(f"[ERROR] Shape mismatch: {golden_path} {golden.shape} vs {output_path} {output.shape}") - return False - if not np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True): - if golden.size: - if np.issubdtype(dtype_np, np.floating): - g = golden.astype(np.float64, copy=False) - o = output.astype(np.float64, copy=False) - elif np.issubdtype(dtype_np, np.integer) or np.issubdtype(dtype_np, np.unsignedinteger): - g = golden.astype(np.int64, copy=False) - o = output.astype(np.int64, copy=False) - else: - g = golden.astype(np.float64, copy=False) - o = output.astype(np.float64, copy=False) - abs_diff = np.abs(g - o) - idx = int(np.argmax(abs_diff)) - diff = float(abs_diff[idx]) - print( - f"[ERROR] Mismatch: {golden_path} vs {output_path}, max diff={diff} at idx={idx} " - f"(golden={g[idx]}, out={o[idx]}, dtype={dtype_np})" - ) - else: - print(f"[ERROR] Mismatch: {golden_path} vs {output_path}, empty buffers, dtype={dtype_np}") - return False - return True - - -def compare_bin_prefix(golden_path, output_path, dtype, eps, count): - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - try: - count = int(count) - except Exception: - print(f"[ERROR] Invalid prefix count: {count}") - return False - if count <= 0: - print(f"[ERROR] Invalid prefix count: {count}") - return False - - dtype_np = np.dtype(dtype) - golden = np.fromfile(golden_path, dtype=dtype_np, count=count) - output = np.fromfile(output_path, dtype=dtype_np, count=count) - - if golden.size != count or output.size != count: - print( - f"[ERROR] Prefix read too small: need={count} elems, " - f"golden={golden.size}, out={output.size}" - ) - return False - - if not np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True): - if golden.size: - 
if np.issubdtype(dtype_np, np.floating): - g = golden.astype(np.float64, copy=False) - o = output.astype(np.float64, copy=False) - elif np.issubdtype(dtype_np, np.integer) or np.issubdtype(dtype_np, np.unsignedinteger): - g = golden.astype(np.int64, copy=False) - o = output.astype(np.int64, copy=False) - else: - g = golden.astype(np.float64, copy=False) - o = output.astype(np.float64, copy=False) - abs_diff = np.abs(g - o) - idx = int(np.argmax(abs_diff)) - diff = float(abs_diff[idx]) - print( - f"[ERROR] Mismatch (prefix): {golden_path} vs {output_path}, max diff={diff} at idx={idx} " - f"(golden={g[idx]}, out={o[idx]}, dtype={dtype_np}, count={count})" - ) - else: - print(f"[ERROR] Mismatch (prefix): {golden_path} vs {output_path}, empty buffers, dtype={dtype_np}") - return False - return True - - -def compare_bin_window(golden_path, output_path, dtype, eps, offset, count): - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - try: - offset = int(offset) - count = int(count) - except Exception: - print(f"[ERROR] Invalid compare window: offset={offset} count={count}") - return False - if offset < 0 or count <= 0: - print(f"[ERROR] Invalid compare window: offset={offset} count={count}") - return False - - dtype_np = np.dtype(dtype) - golden = np.fromfile(golden_path, dtype=dtype_np) - output = np.fromfile(output_path, dtype=dtype_np) - end = offset + count - if golden.size < end or output.size < end: - print( - f"[ERROR] Compare window out of range: offset={offset} count={count}, " - f"golden={golden.size}, out={output.size}" - ) - return False - - golden_sel = golden[offset:end] - output_sel = output[offset:end] - if not np.allclose(golden_sel, output_sel, atol=eps, rtol=eps, equal_nan=True): - if golden_sel.size: - g = golden_sel.astype(np.float64, copy=False) - o = output_sel.astype(np.float64, copy=False) - 
abs_diff = np.abs(g - o) - idx = int(np.argmax(abs_diff)) - diff = float(abs_diff[idx]) - print( - f"[ERROR] Mismatch (window): {golden_path} vs {output_path}, max diff={diff} " - f"at idx={offset + idx} (golden={g[idx]}, out={o[idx]}, dtype={dtype_np}, " - f"offset={offset}, count={count})" - ) - else: - print(f"[ERROR] Mismatch (window): {golden_path} vs {output_path}, empty window, dtype={dtype_np}") - return False - return True - - -def compare_packed_pred_mask(golden_path, output_path, logical_elems, src_elem_bytes): - """ - Compare outputs of pto.tcmp / pto.tcmps. - - PTO-ISA stores packed predicate results as a linear PK byte stream via - `psts`, with the exact written prefix length determined by the typed - TCMP/TCMPS repeat schedule. Compare only that semantic prefix. - """ - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - try: - logical_elems = int(logical_elems) - src_elem_bytes = int(src_elem_bytes) - except Exception: - print( - "[ERROR] Invalid packed mask compare arguments: " - f"logical_elems={logical_elems} src_elem_bytes={src_elem_bytes}" - ) - return False - if logical_elems <= 0 or src_elem_bytes <= 0: - print( - "[ERROR] Invalid packed mask compare arguments: " - f"logical_elems={logical_elems} src_elem_bytes={src_elem_bytes}" - ) - return False - - golden = np.fromfile(golden_path, dtype=np.uint8) - output = np.fromfile(output_path, dtype=np.uint8) - try: - prefix_bytes = _packed_pred_storage_bytes(logical_elems, src_elem_bytes) - except ValueError as exc: - print(f"[ERROR] {exc}") - return False - - if golden.size < prefix_bytes or output.size < prefix_bytes: - print( - f"[ERROR] Packed mask buffer too small: need={prefix_bytes} bytes, " - f"golden={golden.size}, out={output.size}" - ) - return False - - golden_sel = golden[:prefix_bytes] - output_sel = output[:prefix_bytes] - - if not 
np.array_equal(golden_sel, output_sel): - diff = np.nonzero(golden_sel != output_sel)[0] - idx = int(diff[0]) if diff.size else 0 - print( - f"[ERROR] Mismatch (packed mask): {golden_path} vs {output_path}, first diff at idx={idx} " - f"(golden={int(golden_sel[idx])}, out={int(output_sel[idx])}, " - f"logical_elems={logical_elems}, src_elem_bytes={src_elem_bytes}, prefix_bytes={prefix_bytes})" - ) - return False - return True - - -def main(): - strict = os.getenv("COMPARE_STRICT", "1") != "0" - ok = True - ok = compare_bin_window("golden_v2.bin", "v2.bin", np.float32, 0.0001, 0, 13) and ok - if not ok: - if strict: - print("[ERROR] compare failed") - sys.exit(2) - print("[WARN] compare failed (non-gating)") - return - print("[INFO] compare passed") - - -if __name__ == "__main__": - main() diff --git a/test/vpto/cases/micro-op/vector-load-store/vsts-tail/golden.py b/test/vpto/cases/micro-op/vector-load-store/vsts-tail/golden.py deleted file mode 100755 index 73bd90f99..000000000 --- a/test/vpto/cases/micro-op/vector-load-store/vsts-tail/golden.py +++ /dev/null @@ -1,51 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) 2026 Huawei Technologies Co., Ltd. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. - -# case: micro-op/vector-load-store/vsts-tail -# family: vector-load-store -# target_ops: pto.vsts -# scenarios: core-f32, contiguous, tail-mask, aligned, dist-norm -# NOTE: bulk-generated coverage skeleton. 
-# coding=utf-8 - -import argparse -from pathlib import Path - -import numpy as np - - -ROWS = 32 -COLS = 32 -ACTIVE = 13 -SEED = 19 -def generate(output_dir: Path, seed: int) -> None: - rng = np.random.default_rng(seed) - v1 = rng.uniform(-8.0, 8.0, size=(ROWS, COLS)).astype(np.float32) - v2 = np.zeros((ROWS, COLS), dtype=np.float32) - golden_v2 = np.zeros((ROWS, COLS), dtype=np.float32) - golden_v2.reshape(-1)[:ACTIVE] = v1.reshape(-1)[:ACTIVE] - - output_dir.mkdir(parents=True, exist_ok=True) - v1.reshape(-1).tofile(output_dir / "v1.bin") - v2.reshape(-1).tofile(output_dir / "v2.bin") - golden_v2.reshape(-1).tofile(output_dir / "golden_v2.bin") - - -def main() -> None: - parser = argparse.ArgumentParser( - description="Generate numpy-based inputs/golden for VPTO micro-op vsts-tail validation." - ) - parser.add_argument("--output-dir", type=Path, default=Path(".")) - parser.add_argument("--seed", type=int, default=SEED) - args = parser.parse_args() - generate(args.output_dir, args.seed) - - -if __name__ == "__main__": - main() diff --git a/test/vpto/cases/micro-op/vector-load-store/vsts-tail/kernel.pto b/test/vpto/cases/micro-op/vector-load-store/vsts-tail/kernel.pto deleted file mode 100644 index 901e975f4..000000000 --- a/test/vpto/cases/micro-op/vector-load-store/vsts-tail/kernel.pto +++ /dev/null @@ -1,45 +0,0 @@ -// ----------------------------------------------------------------------------- -// case: micro-op/vector-load-store/vsts-tail -// family: vector-load-store -// target_ops: pto.vsts -// scenarios: core-f32, contiguous, tail-mask, aligned, dist-norm -// ----------------------------------------------------------------------------- -module attributes {pto.target_arch = "a5", pto.kernel_kind = #pto.kernel_kind} { - func.func @vsts_tail_kernel(%arg0: !pto.ptr, %arg1: !pto.ptr) attributes {pto.kernel} { - %c0 = arith.constant 0 : index - %c1_i64 = arith.constant 1 : i64 - %c0_i64 = arith.constant 0 : i64 - %c32_i64 = arith.constant 32 : i64 - %c128_i64 = 
arith.constant 128 : i64 - %c4096_i64 = arith.constant 4096 : i64 - %c13_i32 = arith.constant 13 : i32 - %false = arith.constant false - - %ub_in = pto.castptr %c0_i64 : i64 -> !pto.ptr - %ub_out = pto.castptr %c4096_i64 : i64 -> !pto.ptr - pto.mte_gm_ub %arg0, %ub_in, %c0_i64, %c128_i64 - nburst(%c32_i64, %c128_i64, %c128_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 - - pto.set_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] - pto.wait_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] - - %c1 = arith.constant 1 : index - pto.vecscope { - scf.for %iv = %c0 to %c1 step %c1 { - %mask, %remaining = pto.plt_b32 %c13_i32 : i32 -> !pto.mask, i32 - %vec = pto.vlds %ub_in[%c0] : !pto.ptr -> !pto.vreg<64xf32> - pto.vsts %vec, %ub_out[%c0], %mask - : !pto.vreg<64xf32>, !pto.ptr, !pto.mask - } - } - - pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.mte_ub_gm %ub_out, %arg1, %c128_i64 - nburst(%c32_i64, %c128_i64, %c128_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64 - pto.barrier #pto.pipe - return - } -} diff --git a/test/vpto/cases/micro-op/vector-load-store/vsts-tail/launch.cpp b/test/vpto/cases/micro-op/vector-load-store/vsts-tail/launch.cpp deleted file mode 100644 index 2a94e832d..000000000 --- a/test/vpto/cases/micro-op/vector-load-store/vsts-tail/launch.cpp +++ /dev/null @@ -1,70 +0,0 @@ -// Copyright (c) 2026 Huawei Technologies Co., Ltd. -// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). -// Please refer to the License for details. You may not use this file except in compliance with the License. -// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. 
-// See LICENSE in the root of the software repository for the full text of the License. - -// ----------------------------------------------------------------------------- -// case: micro-op/vector-load-store/vsts-tail -// family: vector-load-store -// target_ops: pto.vsts -// scenarios: core-f32, contiguous, tail-mask, aligned, dist-norm -// NOTE: bulk-generated coverage skeleton. Parser/verifier/lowering failure is -// still a valid test conclusion in the current coverage-first phase. -// ----------------------------------------------------------------------------- -// --------------------------------------------------------------------------- -// PTOAS compatibility layer -// -// The upstream pto-isa headers reference some FP8/FP4 types and the -// __VEC_SCOPE__ marker that are not available on every AICore arch/toolchain -// combination (e.g. __NPU_ARCH__==2201). -// -// For our PTOAS-generated kernels we don't rely on these types today, but the -// headers still mention them in templates/static_asserts. Provide minimal -// fallbacks to keep compilation working on dav-c220. -// --------------------------------------------------------------------------- -#ifndef __VEC_SCOPE__ -#define __VEC_SCOPE__ -#endif - -#if defined(__CCE_AICORE__) && defined(__NPU_ARCH__) && (__NPU_ARCH__ == 2201) -typedef struct { unsigned char v; } hifloat8_t; -typedef struct { unsigned char v; } float8_e4m3_t; -typedef struct { unsigned char v; } float8_e5m2_t; -typedef struct { unsigned char v; } float8_e8m0_t; -typedef struct { unsigned char v; } float4_e1m2x2_t; -typedef struct { unsigned char v; } float4_e2m1x2_t; -#endif -#include - -// AICore printf support is gated behind `--cce-enable-print` on some -// toolchains. When enabled, include the CCE print header so `cce::printf` -// resolves in device compilation. 
-#if defined(__CCE_AICORE__) && defined(PTOAS_ENABLE_CCE_PRINT) -#include -#endif - -// Some PTO-ISA types are only available in the __CCE_AICORE__ compilation -// path, but `bisheng -xcce` still performs a host-side parse pass. -// Provide minimal fallbacks only when the corresponding header wasn't -// pulled in by the selected arch implementation. -#if !defined(__CCE_AICORE__) && !defined(TMRGSORT_HPP) -struct MrgSortExecutedNumList { - uint16_t mrgSortList0; - uint16_t mrgSortList1; - uint16_t mrgSortList2; - uint16_t mrgSortList3; -}; -#endif -#ifndef __CPU_SIM -#include "acl/acl.h" -#endif - -extern "C" __global__ [aicore] void vsts_tail_kernel(__gm__ float *v1, - __gm__ float *v2); - -void LaunchVabs_kernel_2d(float *v1, float *v2, void *stream) { - vsts_tail_kernel<<<1, nullptr, stream>>>((__gm__ float *)v1, (__gm__ float *)v2); -} diff --git a/test/vpto/cases/micro-op/vector-load-store/vsts-tail/main.cpp b/test/vpto/cases/micro-op/vector-load-store/vsts-tail/main.cpp deleted file mode 100644 index f8da4c77a..000000000 --- a/test/vpto/cases/micro-op/vector-load-store/vsts-tail/main.cpp +++ /dev/null @@ -1,130 +0,0 @@ -// Copyright (c) 2026 Huawei Technologies Co., Ltd. -// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). -// Please refer to the License for details. You may not use this file except in compliance with the License. -// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -// See LICENSE in the root of the software repository for the full text of the License. 
- -// ----------------------------------------------------------------------------- -// case: micro-op/vector-load-store/vsts-tail -// family: vector-load-store -// target_ops: pto.vsts -// scenarios: core-f32, contiguous, tail-mask, aligned, dist-norm -// NOTE: bulk-generated coverage skeleton. Parser/verifier/lowering failure is -// still a valid test conclusion in the current coverage-first phase. -// ----------------------------------------------------------------------------- -/** -Copyright (c) 2025 Huawei Technologies Co., Ltd. -This program is free software, you can redistribute it and/or modify it under the terms and conditions of -CANN Open Software License Agreement Version 2.0 (the "License"). -Please refer to the License for details. You may not use this file except in compliance with the License. -THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -See LICENSE in the root of the software repository for the full text of the License. 
-*/ - -#include "test_common.h" -#include "acl/acl.h" -#include -#include -#include - -using namespace PtoTestCommon; - -#ifndef TMRGSORT_HPP -struct MrgSortExecutedNumList { - uint16_t mrgSortList0; - uint16_t mrgSortList1; - uint16_t mrgSortList2; - uint16_t mrgSortList3; -}; -#endif - -#define ACL_CHECK(expr) \ - do { \ - const aclError _ret = (expr); \ - if (_ret != ACL_SUCCESS) { \ - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", #expr, (int)_ret, __FILE__, __LINE__); \ - const char *_recent = aclGetRecentErrMsg(); \ - if (_recent != nullptr && _recent[0] != '\0') { \ - std::fprintf(stderr, "[ERROR] RecentErrMsg: %s\n", _recent); \ - } \ - rc = 1; \ - goto cleanup; \ - } \ - } while (0) - -void LaunchVabs_kernel_2d(float *v1, float *v2, void *stream); - -int main() { - size_t elemCount_v1 = 1024; - size_t fileSize_v1 = elemCount_v1 * sizeof(float); - size_t elemCount_v2 = 1024; - size_t fileSize_v2 = elemCount_v2 * sizeof(float); - float *v1Host = nullptr; - float *v1Device = nullptr; - float *v2Host = nullptr; - float *v2Device = nullptr; - - int rc = 0; - bool aclInited = false; - bool deviceSet = false; - int deviceId = 0; - aclrtStream stream = nullptr; - - ACL_CHECK(aclInit(nullptr)); - aclInited = true; - if (const char *envDevice = std::getenv("ACL_DEVICE_ID")) { - deviceId = std::atoi(envDevice); - } - ACL_CHECK(aclrtSetDevice(deviceId)); - deviceSet = true; - ACL_CHECK(aclrtCreateStream(&stream)); - - ACL_CHECK(aclrtMallocHost((void **)(&v1Host), fileSize_v1)); - ACL_CHECK(aclrtMallocHost((void **)(&v2Host), fileSize_v2)); - ACL_CHECK(aclrtMalloc((void **)&v1Device, fileSize_v1, ACL_MEM_MALLOC_HUGE_FIRST)); - ACL_CHECK(aclrtMalloc((void **)&v2Device, fileSize_v2, ACL_MEM_MALLOC_HUGE_FIRST)); - - ReadFile("./v1.bin", fileSize_v1, v1Host, fileSize_v1); - ReadFile("./v2.bin", fileSize_v2, v2Host, fileSize_v2); - ACL_CHECK(aclrtMemcpy(v1Device, fileSize_v1, v1Host, fileSize_v1, ACL_MEMCPY_HOST_TO_DEVICE)); - ACL_CHECK(aclrtMemcpy(v2Device, 
fileSize_v2, v2Host, fileSize_v2, ACL_MEMCPY_HOST_TO_DEVICE)); - LaunchVabs_kernel_2d(v1Device, v2Device, stream); - - ACL_CHECK(aclrtSynchronizeStream(stream)); - ACL_CHECK(aclrtMemcpy(v2Host, fileSize_v2, v2Device, fileSize_v2, ACL_MEMCPY_DEVICE_TO_HOST)); - - WriteFile("./v2.bin", v2Host, fileSize_v2); - -cleanup: - aclrtFree(v1Device); - aclrtFree(v2Device); - aclrtFreeHost(v1Host); - aclrtFreeHost(v2Host); - if (stream != nullptr) { - const aclError _ret = aclrtDestroyStream(stream); - if (_ret != ACL_SUCCESS) { - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", - "aclrtDestroyStream(stream)", (int)_ret, __FILE__, __LINE__); - } - stream = nullptr; - } - if (deviceSet) { - const aclError _ret = aclrtResetDevice(deviceId); - if (_ret != ACL_SUCCESS) { - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", - "aclrtResetDevice(deviceId)", (int)_ret, __FILE__, __LINE__); - } - } - if (aclInited) { - const aclError _ret = aclFinalize(); - if (_ret != ACL_SUCCESS) { - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", - "aclFinalize()", (int)_ret, __FILE__, __LINE__); - } - } - - return rc; -} diff --git a/test/vpto/cases/micro-op/vector-load-store/vsts/compare.py b/test/vpto/cases/micro-op/vector-load-store/vsts/compare.py deleted file mode 100755 index dc064cb22..000000000 --- a/test/vpto/cases/micro-op/vector-load-store/vsts/compare.py +++ /dev/null @@ -1,209 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) 2026 Huawei Technologies Co., Ltd. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. 
-# See LICENSE in the root of the software repository for the full text of the License. - -# case: micro-op/vector-load-store/vsts -# family: vector-load-store -# target_ops: pto.vsts -# scenarios: core-f32, contiguous, full-mask, aligned, dist-norm -# NOTE: bulk-generated coverage skeleton. -# coding=utf-8 - -import os -import sys -import numpy as np - - -REPEAT_BYTES = 256 - - -def _ceil_div(x, y): - return (x + y - 1) // y - - -def _packed_pred_storage_bytes(logical_elems, src_elem_bytes): - logical_elems = int(logical_elems) - src_elem_bytes = int(src_elem_bytes) - if logical_elems <= 0: - raise ValueError(f"logical_elems must be > 0, got {logical_elems}") - if src_elem_bytes not in (1, 2, 4): - raise ValueError(f"unsupported packed predicate source size: {src_elem_bytes}") - - repeat_elems = REPEAT_BYTES // src_elem_bytes - if src_elem_bytes == 4: - repeat_times = _ceil_div(logical_elems, repeat_elems) + 1 - loop_count = repeat_times // 2 - return loop_count * 16 - - repeat_times = _ceil_div(logical_elems, repeat_elems) - return repeat_times * (repeat_elems // 8) - - -def compare_bin(golden_path, output_path, dtype, eps): - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - dtype_np = np.dtype(dtype) - golden = np.fromfile(golden_path, dtype=dtype_np) - output = np.fromfile(output_path, dtype=dtype_np) - if golden.shape != output.shape: - print(f"[ERROR] Shape mismatch: {golden_path} {golden.shape} vs {output_path} {output.shape}") - return False - if not np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True): - if golden.size: - if np.issubdtype(dtype_np, np.floating): - g = golden.astype(np.float64, copy=False) - o = output.astype(np.float64, copy=False) - elif np.issubdtype(dtype_np, np.integer) or np.issubdtype(dtype_np, np.unsignedinteger): - g = golden.astype(np.int64, copy=False) - o = 
output.astype(np.int64, copy=False) - else: - g = golden.astype(np.float64, copy=False) - o = output.astype(np.float64, copy=False) - abs_diff = np.abs(g - o) - idx = int(np.argmax(abs_diff)) - diff = float(abs_diff[idx]) - print( - f"[ERROR] Mismatch: {golden_path} vs {output_path}, max diff={diff} at idx={idx} " - f"(golden={g[idx]}, out={o[idx]}, dtype={dtype_np})" - ) - else: - print(f"[ERROR] Mismatch: {golden_path} vs {output_path}, empty buffers, dtype={dtype_np}") - return False - return True - - -def compare_bin_prefix(golden_path, output_path, dtype, eps, count): - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - try: - count = int(count) - except Exception: - print(f"[ERROR] Invalid prefix count: {count}") - return False - if count <= 0: - print(f"[ERROR] Invalid prefix count: {count}") - return False - - dtype_np = np.dtype(dtype) - golden = np.fromfile(golden_path, dtype=dtype_np, count=count) - output = np.fromfile(output_path, dtype=dtype_np, count=count) - - if golden.size != count or output.size != count: - print( - f"[ERROR] Prefix read too small: need={count} elems, " - f"golden={golden.size}, out={output.size}" - ) - return False - - if not np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True): - if golden.size: - if np.issubdtype(dtype_np, np.floating): - g = golden.astype(np.float64, copy=False) - o = output.astype(np.float64, copy=False) - elif np.issubdtype(dtype_np, np.integer) or np.issubdtype(dtype_np, np.unsignedinteger): - g = golden.astype(np.int64, copy=False) - o = output.astype(np.int64, copy=False) - else: - g = golden.astype(np.float64, copy=False) - o = output.astype(np.float64, copy=False) - abs_diff = np.abs(g - o) - idx = int(np.argmax(abs_diff)) - diff = float(abs_diff[idx]) - print( - f"[ERROR] Mismatch (prefix): {golden_path} vs {output_path}, max diff={diff} 
at idx={idx} " - f"(golden={g[idx]}, out={o[idx]}, dtype={dtype_np}, count={count})" - ) - else: - print(f"[ERROR] Mismatch (prefix): {golden_path} vs {output_path}, empty buffers, dtype={dtype_np}") - return False - return True - - -def compare_packed_pred_mask(golden_path, output_path, logical_elems, src_elem_bytes): - """ - Compare outputs of pto.tcmp / pto.tcmps. - - PTO-ISA stores packed predicate results as a linear PK byte stream via - `psts`, with the exact written prefix length determined by the typed - TCMP/TCMPS repeat schedule. Compare only that semantic prefix. - """ - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - try: - logical_elems = int(logical_elems) - src_elem_bytes = int(src_elem_bytes) - except Exception: - print( - "[ERROR] Invalid packed mask compare arguments: " - f"logical_elems={logical_elems} src_elem_bytes={src_elem_bytes}" - ) - return False - if logical_elems <= 0 or src_elem_bytes <= 0: - print( - "[ERROR] Invalid packed mask compare arguments: " - f"logical_elems={logical_elems} src_elem_bytes={src_elem_bytes}" - ) - return False - - golden = np.fromfile(golden_path, dtype=np.uint8) - output = np.fromfile(output_path, dtype=np.uint8) - try: - prefix_bytes = _packed_pred_storage_bytes(logical_elems, src_elem_bytes) - except ValueError as exc: - print(f"[ERROR] {exc}") - return False - - if golden.size < prefix_bytes or output.size < prefix_bytes: - print( - f"[ERROR] Packed mask buffer too small: need={prefix_bytes} bytes, " - f"golden={golden.size}, out={output.size}" - ) - return False - - golden_sel = golden[:prefix_bytes] - output_sel = output[:prefix_bytes] - - if not np.array_equal(golden_sel, output_sel): - diff = np.nonzero(golden_sel != output_sel)[0] - idx = int(diff[0]) if diff.size else 0 - print( - f"[ERROR] Mismatch (packed mask): {golden_path} vs {output_path}, 
first diff at idx={idx} " - f"(golden={int(golden_sel[idx])}, out={int(output_sel[idx])}, " - f"logical_elems={logical_elems}, src_elem_bytes={src_elem_bytes}, prefix_bytes={prefix_bytes})" - ) - return False - return True - - -def main(): - strict = os.getenv("COMPARE_STRICT", "1") != "0" - ok = True - ok = compare_bin("golden_v2.bin", "v2.bin", np.float32, 0.0001) and ok - if not ok: - if strict: - print("[ERROR] compare failed") - sys.exit(2) - print("[WARN] compare failed (non-gating)") - return - print("[INFO] compare passed") - - -if __name__ == "__main__": - main() diff --git a/test/vpto/cases/micro-op/vector-load-store/vsts/golden.py b/test/vpto/cases/micro-op/vector-load-store/vsts/golden.py deleted file mode 100755 index 9eb6e0453..000000000 --- a/test/vpto/cases/micro-op/vector-load-store/vsts/golden.py +++ /dev/null @@ -1,51 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) 2026 Huawei Technologies Co., Ltd. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. - -# case: micro-op/vector-load-store/vsts -# family: vector-load-store -# target_ops: pto.vsts -# scenarios: core-f32, contiguous, full-mask, aligned, dist-norm -# NOTE: bulk-generated coverage skeleton. 
-# coding=utf-8 - -import argparse -from pathlib import Path - -import numpy as np - - -ROWS = 32 -COLS = 32 -SEED = 19 - - -def generate(output_dir: Path, seed: int) -> None: - rng = np.random.default_rng(seed) - v1 = rng.uniform(-8.0, 8.0, size=(ROWS, COLS)).astype(np.float32) - v2 = np.zeros((ROWS, COLS), dtype=np.float32) - golden_v2 = v1.astype(np.float32, copy=False) - - output_dir.mkdir(parents=True, exist_ok=True) - v1.reshape(-1).tofile(output_dir / "v1.bin") - v2.reshape(-1).tofile(output_dir / "v2.bin") - golden_v2.reshape(-1).tofile(output_dir / "golden_v2.bin") - - -def main() -> None: - parser = argparse.ArgumentParser( - description="Generate numpy-based inputs/golden for VPTO micro-op vsts validation." - ) - parser.add_argument("--output-dir", type=Path, default=Path(".")) - parser.add_argument("--seed", type=int, default=SEED) - args = parser.parse_args() - generate(args.output_dir, args.seed) - - -if __name__ == "__main__": - main() diff --git a/test/vpto/cases/micro-op/vector-load-store/vsts/kernel.pto b/test/vpto/cases/micro-op/vector-load-store/vsts/kernel.pto deleted file mode 100644 index 8133b4e5f..000000000 --- a/test/vpto/cases/micro-op/vector-load-store/vsts/kernel.pto +++ /dev/null @@ -1,69 +0,0 @@ -// ----------------------------------------------------------------------------- -// case: micro-op/vector-load-store/vsts -// family: vector-load-store -// target_ops: pto.vsts -// scenarios: core-f32, contiguous, full-mask, aligned, dist-norm -// NOTE: bulk-generated coverage skeleton. Parser/verifier/lowering failure is -// still a valid test conclusion in the current coverage-first phase. 
-// ----------------------------------------------------------------------------- -// ============================================================================= -// abs_kernel_2d: Element-wise absolute value on a 32x32 f32 tile -// ============================================================================= -// This kernel computes abs(input) for a 32x32 float32 matrix (1024 elements). -// -// Memory Layout: -// - Input: arg0 -> GM (Global Memory) -// - Output: arg1 -> GM (Global Memory) -// - UB (Unified Buffer) at offset 0: input tile (4096 bytes = 32*32*4) -// - UB at offset 4096: output tile (4096 bytes = 32*32*4) -// -// Pipeline: -// 1. DMA: GM -> UB (MTE2 pipe) - copy input tile to UB -// 2. Sync: wait for MTE2 -> V pipe handoff -// 3. Compute: vabs on 64-element vectors (V pipe) - 16 iterations for 1024 elements -// 4. Sync: wait for V -> MTE3 pipe handoff -// 5. DMA: UB -> GM (MTE3 pipe) - copy result tile back to GM -// ============================================================================= - -module attributes {pto.target_arch = "a5", pto.kernel_kind = #pto.kernel_kind} { - func.func @vsts_kernel_2d(%arg0: !pto.ptr, %arg1: !pto.ptr) attributes {pto.kernel} { - %c0 = arith.constant 0 : index - %c1 = arith.constant 1 : index - %c64 = arith.constant 64 : index - %c1024 = arith.constant 1024 : index - %c0_i64 = arith.constant 0 : i64 - %c1_i64 = arith.constant 1 : i64 - %c32_i64 = arith.constant 32 : i64 - %c128_i64 = arith.constant 128 : i64 - %c4096_i64 = arith.constant 4096 : i64 - %c1024_i32 = arith.constant 1024 : i32 - - %ub_in = pto.castptr %c0_i64 : i64 -> !pto.ptr - %ub_out = pto.castptr %c4096_i64 : i64 -> !pto.ptr - - %false = arith.constant false - pto.mte_gm_ub %arg0, %ub_in, %c0_i64, %c128_i64 - nburst(%c32_i64, %c128_i64, %c128_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 - - pto.set_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] - pto.wait_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] - - pto.vecscope { - %_:1 = scf.for %offset = %c0 
to %c1024 step %c64 iter_args(%remaining = %c1024_i32) -> (i32) { - %mask, %next_remaining = pto.plt_b32 %remaining : i32 -> !pto.mask, i32 - %vec = pto.vlds %ub_in[%offset] : !pto.ptr -> !pto.vreg<64xf32> - pto.vsts %vec, %ub_out[%offset], %mask {dist = "NORM_B32"} : !pto.vreg<64xf32>, !pto.ptr, !pto.mask - scf.yield %next_remaining : i32 - } - } - - pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.mte_ub_gm %ub_out, %arg1, %c128_i64 - nburst(%c32_i64, %c128_i64, %c128_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64 - pto.barrier #pto.pipe - return - } -} diff --git a/test/vpto/cases/micro-op/vector-load-store/vsts/launch.cpp b/test/vpto/cases/micro-op/vector-load-store/vsts/launch.cpp deleted file mode 100644 index 851e10299..000000000 --- a/test/vpto/cases/micro-op/vector-load-store/vsts/launch.cpp +++ /dev/null @@ -1,70 +0,0 @@ -// Copyright (c) 2026 Huawei Technologies Co., Ltd. -// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). -// Please refer to the License for details. You may not use this file except in compliance with the License. -// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -// See LICENSE in the root of the software repository for the full text of the License. - -// ----------------------------------------------------------------------------- -// case: micro-op/vector-load-store/vsts -// family: vector-load-store -// target_ops: pto.vsts -// scenarios: core-f32, contiguous, full-mask, aligned, dist-norm -// NOTE: bulk-generated coverage skeleton. Parser/verifier/lowering failure is -// still a valid test conclusion in the current coverage-first phase. 
-// ----------------------------------------------------------------------------- -// --------------------------------------------------------------------------- -// PTOAS compatibility layer -// -// The upstream pto-isa headers reference some FP8/FP4 types and the -// __VEC_SCOPE__ marker that are not available on every AICore arch/toolchain -// combination (e.g. __NPU_ARCH__==2201). -// -// For our PTOAS-generated kernels we don't rely on these types today, but the -// headers still mention them in templates/static_asserts. Provide minimal -// fallbacks to keep compilation working on dav-c220. -// --------------------------------------------------------------------------- -#ifndef __VEC_SCOPE__ -#define __VEC_SCOPE__ -#endif - -#if defined(__CCE_AICORE__) && defined(__NPU_ARCH__) && (__NPU_ARCH__ == 2201) -typedef struct { unsigned char v; } hifloat8_t; -typedef struct { unsigned char v; } float8_e4m3_t; -typedef struct { unsigned char v; } float8_e5m2_t; -typedef struct { unsigned char v; } float8_e8m0_t; -typedef struct { unsigned char v; } float4_e1m2x2_t; -typedef struct { unsigned char v; } float4_e2m1x2_t; -#endif -#include - -// AICore printf support is gated behind `--cce-enable-print` on some -// toolchains. When enabled, include the CCE print header so `cce::printf` -// resolves in device compilation. -#if defined(__CCE_AICORE__) && defined(PTOAS_ENABLE_CCE_PRINT) -#include -#endif - -// Some PTO-ISA types are only available in the __CCE_AICORE__ compilation -// path, but `bisheng -xcce` still performs a host-side parse pass. -// Provide minimal fallbacks only when the corresponding header wasn't -// pulled in by the selected arch implementation. 
-#if !defined(__CCE_AICORE__) && !defined(TMRGSORT_HPP) -struct MrgSortExecutedNumList { - uint16_t mrgSortList0; - uint16_t mrgSortList1; - uint16_t mrgSortList2; - uint16_t mrgSortList3; -}; -#endif -#ifndef __CPU_SIM -#include "acl/acl.h" -#endif - -extern "C" __global__ [aicore] void vsts_kernel_2d(__gm__ float *v1, - __gm__ float *v2); - -void LaunchVabs_kernel_2d(float *v1, float *v2, void *stream) { - vsts_kernel_2d<<<1, nullptr, stream>>>((__gm__ float *)v1, (__gm__ float *)v2); -} diff --git a/test/vpto/cases/micro-op/vector-load-store/vsts/main.cpp b/test/vpto/cases/micro-op/vector-load-store/vsts/main.cpp deleted file mode 100644 index 6bc7026e2..000000000 --- a/test/vpto/cases/micro-op/vector-load-store/vsts/main.cpp +++ /dev/null @@ -1,130 +0,0 @@ -// Copyright (c) 2026 Huawei Technologies Co., Ltd. -// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). -// Please refer to the License for details. You may not use this file except in compliance with the License. -// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -// See LICENSE in the root of the software repository for the full text of the License. - -// ----------------------------------------------------------------------------- -// case: micro-op/vector-load-store/vsts -// family: vector-load-store -// target_ops: pto.vsts -// scenarios: core-f32, contiguous, full-mask, aligned, dist-norm -// NOTE: bulk-generated coverage skeleton. Parser/verifier/lowering failure is -// still a valid test conclusion in the current coverage-first phase. -// ----------------------------------------------------------------------------- -/** -Copyright (c) 2025 Huawei Technologies Co., Ltd. 
-This program is free software, you can redistribute it and/or modify it under the terms and conditions of -CANN Open Software License Agreement Version 2.0 (the "License"). -Please refer to the License for details. You may not use this file except in compliance with the License. -THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -See LICENSE in the root of the software repository for the full text of the License. -*/ - -#include "test_common.h" -#include "acl/acl.h" -#include -#include -#include - -using namespace PtoTestCommon; - -#ifndef TMRGSORT_HPP -struct MrgSortExecutedNumList { - uint16_t mrgSortList0; - uint16_t mrgSortList1; - uint16_t mrgSortList2; - uint16_t mrgSortList3; -}; -#endif - -#define ACL_CHECK(expr) \ - do { \ - const aclError _ret = (expr); \ - if (_ret != ACL_SUCCESS) { \ - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", #expr, (int)_ret, __FILE__, __LINE__); \ - const char *_recent = aclGetRecentErrMsg(); \ - if (_recent != nullptr && _recent[0] != '\0') { \ - std::fprintf(stderr, "[ERROR] RecentErrMsg: %s\n", _recent); \ - } \ - rc = 1; \ - goto cleanup; \ - } \ - } while (0) - -void LaunchVabs_kernel_2d(float *v1, float *v2, void *stream); - -int main() { - size_t elemCount_v1 = 1024; - size_t fileSize_v1 = elemCount_v1 * sizeof(float); - size_t elemCount_v2 = 1024; - size_t fileSize_v2 = elemCount_v2 * sizeof(float); - float *v1Host = nullptr; - float *v1Device = nullptr; - float *v2Host = nullptr; - float *v2Device = nullptr; - - int rc = 0; - bool aclInited = false; - bool deviceSet = false; - int deviceId = 0; - aclrtStream stream = nullptr; - - ACL_CHECK(aclInit(nullptr)); - aclInited = true; - if (const char *envDevice = std::getenv("ACL_DEVICE_ID")) { - deviceId = std::atoi(envDevice); - } - ACL_CHECK(aclrtSetDevice(deviceId)); - deviceSet = true; - 
ACL_CHECK(aclrtCreateStream(&stream)); - - ACL_CHECK(aclrtMallocHost((void **)(&v1Host), fileSize_v1)); - ACL_CHECK(aclrtMallocHost((void **)(&v2Host), fileSize_v2)); - ACL_CHECK(aclrtMalloc((void **)&v1Device, fileSize_v1, ACL_MEM_MALLOC_HUGE_FIRST)); - ACL_CHECK(aclrtMalloc((void **)&v2Device, fileSize_v2, ACL_MEM_MALLOC_HUGE_FIRST)); - - ReadFile("./v1.bin", fileSize_v1, v1Host, fileSize_v1); - ReadFile("./v2.bin", fileSize_v2, v2Host, fileSize_v2); - ACL_CHECK(aclrtMemcpy(v1Device, fileSize_v1, v1Host, fileSize_v1, ACL_MEMCPY_HOST_TO_DEVICE)); - ACL_CHECK(aclrtMemcpy(v2Device, fileSize_v2, v2Host, fileSize_v2, ACL_MEMCPY_HOST_TO_DEVICE)); - LaunchVabs_kernel_2d(v1Device, v2Device, stream); - - ACL_CHECK(aclrtSynchronizeStream(stream)); - ACL_CHECK(aclrtMemcpy(v2Host, fileSize_v2, v2Device, fileSize_v2, ACL_MEMCPY_DEVICE_TO_HOST)); - - WriteFile("./v2.bin", v2Host, fileSize_v2); - -cleanup: - aclrtFree(v1Device); - aclrtFree(v2Device); - aclrtFreeHost(v1Host); - aclrtFreeHost(v2Host); - if (stream != nullptr) { - const aclError _ret = aclrtDestroyStream(stream); - if (_ret != ACL_SUCCESS) { - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", - "aclrtDestroyStream(stream)", (int)_ret, __FILE__, __LINE__); - } - stream = nullptr; - } - if (deviceSet) { - const aclError _ret = aclrtResetDevice(deviceId); - if (_ret != ACL_SUCCESS) { - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", - "aclrtResetDevice(deviceId)", (int)_ret, __FILE__, __LINE__); - } - } - if (aclInited) { - const aclError _ret = aclFinalize(); - if (_ret != ACL_SUCCESS) { - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", - "aclFinalize()", (int)_ret, __FILE__, __LINE__); - } - } - - return rc; -} diff --git a/test/vpto/cases/micro-op/vector-load-store/vstsx2-layout-check/compare.py b/test/vpto/cases/micro-op/vector-load-store/vstsx2-layout-check/compare.py deleted file mode 100755 index b2a31f90e..000000000 --- 
a/test/vpto/cases/micro-op/vector-load-store/vstsx2-layout-check/compare.py +++ /dev/null @@ -1,210 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) 2026 Huawei Technologies Co., Ltd. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. - -# case: micro-op/vector-load-store/vstsx2-layout-check -# family: vector-load-store -# target_ops: pto.vstsx2 -# scenarios: core-f32, full-mask, paired-roundtrip, dintlv, lane-order -# NOTE: bulk-generated coverage skeleton. -# coding=utf-8 - -import os -import sys -import numpy as np - - -REPEAT_BYTES = 256 -PREFIX_ELEMS = 128 - - -def _ceil_div(x, y): - return (x + y - 1) // y - - -def _packed_pred_storage_bytes(logical_elems, src_elem_bytes): - logical_elems = int(logical_elems) - src_elem_bytes = int(src_elem_bytes) - if logical_elems <= 0: - raise ValueError(f"logical_elems must be > 0, got {logical_elems}") - if src_elem_bytes not in (1, 2, 4): - raise ValueError(f"unsupported packed predicate source size: {src_elem_bytes}") - - repeat_elems = REPEAT_BYTES // src_elem_bytes - if src_elem_bytes == 4: - repeat_times = _ceil_div(logical_elems, repeat_elems) + 1 - loop_count = repeat_times // 2 - return loop_count * 16 - - repeat_times = _ceil_div(logical_elems, repeat_elems) - return repeat_times * (repeat_elems // 8) - - -def compare_bin(golden_path, output_path, dtype, eps): - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not 
os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - dtype_np = np.dtype(dtype) - golden = np.fromfile(golden_path, dtype=dtype_np) - output = np.fromfile(output_path, dtype=dtype_np) - if golden.shape != output.shape: - print(f"[ERROR] Shape mismatch: {golden_path} {golden.shape} vs {output_path} {output.shape}") - return False - if not np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True): - if golden.size: - if np.issubdtype(dtype_np, np.floating): - g = golden.astype(np.float64, copy=False) - o = output.astype(np.float64, copy=False) - elif np.issubdtype(dtype_np, np.integer) or np.issubdtype(dtype_np, np.unsignedinteger): - g = golden.astype(np.int64, copy=False) - o = output.astype(np.int64, copy=False) - else: - g = golden.astype(np.float64, copy=False) - o = output.astype(np.float64, copy=False) - abs_diff = np.abs(g - o) - idx = int(np.argmax(abs_diff)) - diff = float(abs_diff[idx]) - print( - f"[ERROR] Mismatch: {golden_path} vs {output_path}, max diff={diff} at idx={idx} " - f"(golden={g[idx]}, out={o[idx]}, dtype={dtype_np})" - ) - else: - print(f"[ERROR] Mismatch: {golden_path} vs {output_path}, empty buffers, dtype={dtype_np}") - return False - return True - - -def compare_bin_prefix(golden_path, output_path, dtype, eps, count): - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - try: - count = int(count) - except Exception: - print(f"[ERROR] Invalid prefix count: {count}") - return False - if count <= 0: - print(f"[ERROR] Invalid prefix count: {count}") - return False - - dtype_np = np.dtype(dtype) - golden = np.fromfile(golden_path, dtype=dtype_np, count=count) - output = np.fromfile(output_path, dtype=dtype_np, count=count) - - if golden.size != count or output.size != count: - print( - f"[ERROR] Prefix read too small: need={count} elems, " - 
f"golden={golden.size}, out={output.size}" - ) - return False - - if not np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True): - if golden.size: - if np.issubdtype(dtype_np, np.floating): - g = golden.astype(np.float64, copy=False) - o = output.astype(np.float64, copy=False) - elif np.issubdtype(dtype_np, np.integer) or np.issubdtype(dtype_np, np.unsignedinteger): - g = golden.astype(np.int64, copy=False) - o = output.astype(np.int64, copy=False) - else: - g = golden.astype(np.float64, copy=False) - o = output.astype(np.float64, copy=False) - abs_diff = np.abs(g - o) - idx = int(np.argmax(abs_diff)) - diff = float(abs_diff[idx]) - print( - f"[ERROR] Mismatch (prefix): {golden_path} vs {output_path}, max diff={diff} at idx={idx} " - f"(golden={g[idx]}, out={o[idx]}, dtype={dtype_np}, count={count})" - ) - else: - print(f"[ERROR] Mismatch (prefix): {golden_path} vs {output_path}, empty buffers, dtype={dtype_np}") - return False - return True - - -def compare_packed_pred_mask(golden_path, output_path, logical_elems, src_elem_bytes): - """ - Compare outputs of pto.tcmp / pto.tcmps. - - PTO-ISA stores packed predicate results as a linear PK byte stream via - `psts`, with the exact written prefix length determined by the typed - TCMP/TCMPS repeat schedule. Compare only that semantic prefix. 
- """ - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - try: - logical_elems = int(logical_elems) - src_elem_bytes = int(src_elem_bytes) - except Exception: - print( - "[ERROR] Invalid packed mask compare arguments: " - f"logical_elems={logical_elems} src_elem_bytes={src_elem_bytes}" - ) - return False - if logical_elems <= 0 or src_elem_bytes <= 0: - print( - "[ERROR] Invalid packed mask compare arguments: " - f"logical_elems={logical_elems} src_elem_bytes={src_elem_bytes}" - ) - return False - - golden = np.fromfile(golden_path, dtype=np.uint8) - output = np.fromfile(output_path, dtype=np.uint8) - try: - prefix_bytes = _packed_pred_storage_bytes(logical_elems, src_elem_bytes) - except ValueError as exc: - print(f"[ERROR] {exc}") - return False - - if golden.size < prefix_bytes or output.size < prefix_bytes: - print( - f"[ERROR] Packed mask buffer too small: need={prefix_bytes} bytes, " - f"golden={golden.size}, out={output.size}" - ) - return False - - golden_sel = golden[:prefix_bytes] - output_sel = output[:prefix_bytes] - - if not np.array_equal(golden_sel, output_sel): - diff = np.nonzero(golden_sel != output_sel)[0] - idx = int(diff[0]) if diff.size else 0 - print( - f"[ERROR] Mismatch (packed mask): {golden_path} vs {output_path}, first diff at idx={idx} " - f"(golden={int(golden_sel[idx])}, out={int(output_sel[idx])}, " - f"logical_elems={logical_elems}, src_elem_bytes={src_elem_bytes}, prefix_bytes={prefix_bytes})" - ) - return False - return True - - -def main(): - strict = os.getenv("COMPARE_STRICT", "1") != "0" - ok = True - ok = compare_bin_prefix("golden_v2.bin", "v2.bin", np.float32, 0.0001, PREFIX_ELEMS) and ok - if not ok: - if strict: - print("[ERROR] compare failed") - sys.exit(2) - print("[WARN] compare failed (non-gating)") - return - print("[INFO] compare passed") - - -if __name__ == 
"__main__": - main() diff --git a/test/vpto/cases/micro-op/vector-load-store/vstsx2-layout-check/golden.py b/test/vpto/cases/micro-op/vector-load-store/vstsx2-layout-check/golden.py deleted file mode 100755 index 24665fcf7..000000000 --- a/test/vpto/cases/micro-op/vector-load-store/vstsx2-layout-check/golden.py +++ /dev/null @@ -1,56 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) 2026 Huawei Technologies Co., Ltd. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. - -# case: micro-op/vector-load-store/vstsx2-layout-check -# family: vector-load-store -# target_ops: pto.vstsx2 -# scenarios: core-f32, full-mask, paired-roundtrip, dintlv, lane-order -# NOTE: bulk-generated coverage skeleton. 
-# coding=utf-8 - -import argparse -from pathlib import Path - -import numpy as np - - -ROWS = 32 -COLS = 32 -ACTIVE = 128 -SEED = 19 - - -def generate(output_dir: Path, seed: int) -> None: - rng = np.random.default_rng(seed) - v1 = rng.uniform(-8.0, 8.0, size=(ROWS, COLS)).astype(np.float32) - v2 = np.zeros((ROWS, COLS), dtype=np.float32) - golden_v2 = np.zeros((ROWS, COLS), dtype=np.float32) - flat_in = v1.reshape(-1) - flat_golden = golden_v2.reshape(-1) - flat_golden[:ACTIVE:2] = flat_in[:64] - flat_golden[1:ACTIVE:2] = flat_in[64:128] - - output_dir.mkdir(parents=True, exist_ok=True) - v1.reshape(-1).tofile(output_dir / "v1.bin") - v2.reshape(-1).tofile(output_dir / "v2.bin") - golden_v2.reshape(-1).tofile(output_dir / "golden_v2.bin") - - -def main() -> None: - parser = argparse.ArgumentParser( - description="Generate numpy-based inputs/golden for VPTO micro-op vstsx2 layout validation." - ) - parser.add_argument("--output-dir", type=Path, default=Path(".")) - parser.add_argument("--seed", type=int, default=SEED) - args = parser.parse_args() - generate(args.output_dir, args.seed) - - -if __name__ == "__main__": - main() diff --git a/test/vpto/cases/micro-op/vector-load-store/vstsx2-layout-check/kernel.pto b/test/vpto/cases/micro-op/vector-load-store/vstsx2-layout-check/kernel.pto deleted file mode 100644 index 728a52839..000000000 --- a/test/vpto/cases/micro-op/vector-load-store/vstsx2-layout-check/kernel.pto +++ /dev/null @@ -1,49 +0,0 @@ -// ----------------------------------------------------------------------------- -// case: micro-op/vector-load-store/vstsx2-layout-check -// family: vector-load-store -// target_ops: pto.vstsx2 -// scenarios: core-f32, full-mask, paired-roundtrip, dintlv, lane-order -// ----------------------------------------------------------------------------- -module attributes {pto.target_arch = "a5", pto.kernel_kind = #pto.kernel_kind} { - func.func @vstsx2_layout_check_kernel(%arg0: !pto.ptr, - %arg1: !pto.ptr) attributes 
{pto.kernel} { - %c0 = arith.constant 0 : index - %c64 = arith.constant 64 : index - %c1_i64 = arith.constant 1 : i64 - %c0_i64 = arith.constant 0 : i64 - %c32_i64 = arith.constant 32 : i64 - %c128_i64 = arith.constant 128 : i64 - %c4096_i64 = arith.constant 4096 : i64 - %c64_i32 = arith.constant 64 : i32 - %false = arith.constant false - - %ub_in = pto.castptr %c0_i64 : i64 -> !pto.ptr - %ub_out = pto.castptr %c4096_i64 : i64 -> !pto.ptr - pto.mte_gm_ub %arg0, %ub_in, %c0_i64, %c128_i64 - nburst(%c32_i64, %c128_i64, %c128_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 - - pto.set_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] - pto.wait_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] - - %c1 = arith.constant 1 : index - pto.vecscope { - scf.for %iv = %c0 to %c1 step %c1 { - %mask, %remaining = pto.plt_b32 %c64_i32 : i32 -> !pto.mask, i32 - %x = pto.vlds %ub_in[%c0] : !pto.ptr -> !pto.vreg<64xf32> - %y = pto.vlds %ub_in[%c64] : !pto.ptr -> !pto.vreg<64xf32> - pto.vstsx2 %x, %y, %ub_out[%c0], "INTLV_B32", %mask - : !pto.vreg<64xf32>, !pto.vreg<64xf32>, !pto.ptr, index, - !pto.mask - } - } - - pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.mte_ub_gm %ub_out, %arg1, %c128_i64 - nburst(%c32_i64, %c128_i64, %c128_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64 - pto.barrier #pto.pipe - return - } -} diff --git a/test/vpto/cases/micro-op/vector-load-store/vstsx2-layout-check/launch.cpp b/test/vpto/cases/micro-op/vector-load-store/vstsx2-layout-check/launch.cpp deleted file mode 100644 index bf28f8403..000000000 --- a/test/vpto/cases/micro-op/vector-load-store/vstsx2-layout-check/launch.cpp +++ /dev/null @@ -1,71 +0,0 @@ -// Copyright (c) 2026 Huawei Technologies Co., Ltd. -// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). -// Please refer to the License for details. 
You may not use this file except in compliance with the License. -// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -// See LICENSE in the root of the software repository for the full text of the License. - -// ----------------------------------------------------------------------------- -// case: micro-op/vector-load-store/vstsx2-layout-check -// family: vector-load-store -// target_ops: pto.vstsx2 -// scenarios: core-f32, full-mask, paired-roundtrip, dintlv, lane-order -// NOTE: bulk-generated coverage skeleton. Parser/verifier/lowering failure is -// still a valid test conclusion in the current coverage-first phase. -// ----------------------------------------------------------------------------- -// --------------------------------------------------------------------------- -// PTOAS compatibility layer -// -// The upstream pto-isa headers reference some FP8/FP4 types and the -// __VEC_SCOPE__ marker that are not available on every AICore arch/toolchain -// combination (e.g. __NPU_ARCH__==2201). -// -// For our PTOAS-generated kernels we don't rely on these types today, but the -// headers still mention them in templates/static_asserts. Provide minimal -// fallbacks to keep compilation working on dav-c220. 
-// --------------------------------------------------------------------------- -#ifndef __VEC_SCOPE__ -#define __VEC_SCOPE__ -#endif - -#if defined(__CCE_AICORE__) && defined(__NPU_ARCH__) && (__NPU_ARCH__ == 2201) -typedef struct { unsigned char v; } hifloat8_t; -typedef struct { unsigned char v; } float8_e4m3_t; -typedef struct { unsigned char v; } float8_e5m2_t; -typedef struct { unsigned char v; } float8_e8m0_t; -typedef struct { unsigned char v; } float4_e1m2x2_t; -typedef struct { unsigned char v; } float4_e2m1x2_t; -#endif -#include - -// AICore printf support is gated behind `--cce-enable-print` on some -// toolchains. When enabled, include the CCE print header so `cce::printf` -// resolves in device compilation. -#if defined(__CCE_AICORE__) && defined(PTOAS_ENABLE_CCE_PRINT) -#include -#endif - -// Some PTO-ISA types are only available in the __CCE_AICORE__ compilation -// path, but `bisheng -xcce` still performs a host-side parse pass. -// Provide minimal fallbacks only when the corresponding header wasn't -// pulled in by the selected arch implementation. -#if !defined(__CCE_AICORE__) && !defined(TMRGSORT_HPP) -struct MrgSortExecutedNumList { - uint16_t mrgSortList0; - uint16_t mrgSortList1; - uint16_t mrgSortList2; - uint16_t mrgSortList3; -}; -#endif -#ifndef __CPU_SIM -#include "acl/acl.h" -#endif - -extern "C" __global__ [aicore] void vstsx2_layout_check_kernel(__gm__ float *v1, - __gm__ float *v2); - -void LaunchVstsx2_layout_check_kernel(float *v1, float *v2, void *stream) { - vstsx2_layout_check_kernel<<<1, nullptr, stream>>>((__gm__ float *)v1, - (__gm__ float *)v2); -} diff --git a/test/vpto/cases/micro-op/vector-load-store/vstsx2-layout-check/main.cpp b/test/vpto/cases/micro-op/vector-load-store/vstsx2-layout-check/main.cpp deleted file mode 100644 index 1e380b6b2..000000000 --- a/test/vpto/cases/micro-op/vector-load-store/vstsx2-layout-check/main.cpp +++ /dev/null @@ -1,130 +0,0 @@ -// Copyright (c) 2026 Huawei Technologies Co., Ltd. 
-// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). -// Please refer to the License for details. You may not use this file except in compliance with the License. -// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -// See LICENSE in the root of the software repository for the full text of the License. - -// ----------------------------------------------------------------------------- -// case: micro-op/vector-load-store/vstsx2-layout-check -// family: vector-load-store -// target_ops: pto.vstsx2 -// scenarios: core-f32, full-mask, paired-roundtrip, dintlv, lane-order -// NOTE: bulk-generated coverage skeleton. Parser/verifier/lowering failure is -// still a valid test conclusion in the current coverage-first phase. -// ----------------------------------------------------------------------------- -/** -Copyright (c) 2025 Huawei Technologies Co., Ltd. -This program is free software, you can redistribute it and/or modify it under the terms and conditions of -CANN Open Software License Agreement Version 2.0 (the "License"). -Please refer to the License for details. You may not use this file except in compliance with the License. -THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -See LICENSE in the root of the software repository for the full text of the License. 
-*/ - -#include "test_common.h" -#include "acl/acl.h" -#include -#include -#include - -using namespace PtoTestCommon; - -#ifndef TMRGSORT_HPP -struct MrgSortExecutedNumList { - uint16_t mrgSortList0; - uint16_t mrgSortList1; - uint16_t mrgSortList2; - uint16_t mrgSortList3; -}; -#endif - -#define ACL_CHECK(expr) \ - do { \ - const aclError _ret = (expr); \ - if (_ret != ACL_SUCCESS) { \ - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", #expr, (int)_ret, __FILE__, __LINE__); \ - const char *_recent = aclGetRecentErrMsg(); \ - if (_recent != nullptr && _recent[0] != '\0') { \ - std::fprintf(stderr, "[ERROR] RecentErrMsg: %s\n", _recent); \ - } \ - rc = 1; \ - goto cleanup; \ - } \ - } while (0) - -void LaunchVstsx2_layout_check_kernel(float *v1, float *v2, void *stream); - -int main() { - size_t elemCount_v1 = 1024; - size_t fileSize_v1 = elemCount_v1 * sizeof(float); - size_t elemCount_v2 = 1024; - size_t fileSize_v2 = elemCount_v2 * sizeof(float); - float *v1Host = nullptr; - float *v1Device = nullptr; - float *v2Host = nullptr; - float *v2Device = nullptr; - - int rc = 0; - bool aclInited = false; - bool deviceSet = false; - int deviceId = 0; - aclrtStream stream = nullptr; - - ACL_CHECK(aclInit(nullptr)); - aclInited = true; - if (const char *envDevice = std::getenv("ACL_DEVICE_ID")) { - deviceId = std::atoi(envDevice); - } - ACL_CHECK(aclrtSetDevice(deviceId)); - deviceSet = true; - ACL_CHECK(aclrtCreateStream(&stream)); - - ACL_CHECK(aclrtMallocHost((void **)(&v1Host), fileSize_v1)); - ACL_CHECK(aclrtMallocHost((void **)(&v2Host), fileSize_v2)); - ACL_CHECK(aclrtMalloc((void **)&v1Device, fileSize_v1, ACL_MEM_MALLOC_HUGE_FIRST)); - ACL_CHECK(aclrtMalloc((void **)&v2Device, fileSize_v2, ACL_MEM_MALLOC_HUGE_FIRST)); - - ReadFile("./v1.bin", fileSize_v1, v1Host, fileSize_v1); - ReadFile("./v2.bin", fileSize_v2, v2Host, fileSize_v2); - ACL_CHECK(aclrtMemcpy(v1Device, fileSize_v1, v1Host, fileSize_v1, ACL_MEMCPY_HOST_TO_DEVICE)); - 
ACL_CHECK(aclrtMemcpy(v2Device, fileSize_v2, v2Host, fileSize_v2, ACL_MEMCPY_HOST_TO_DEVICE)); - LaunchVstsx2_layout_check_kernel(v1Device, v2Device, stream); - - ACL_CHECK(aclrtSynchronizeStream(stream)); - ACL_CHECK(aclrtMemcpy(v2Host, fileSize_v2, v2Device, fileSize_v2, ACL_MEMCPY_DEVICE_TO_HOST)); - - WriteFile("./v2.bin", v2Host, fileSize_v2); - -cleanup: - aclrtFree(v1Device); - aclrtFree(v2Device); - aclrtFreeHost(v1Host); - aclrtFreeHost(v2Host); - if (stream != nullptr) { - const aclError _ret = aclrtDestroyStream(stream); - if (_ret != ACL_SUCCESS) { - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", - "aclrtDestroyStream(stream)", (int)_ret, __FILE__, __LINE__); - } - stream = nullptr; - } - if (deviceSet) { - const aclError _ret = aclrtResetDevice(deviceId); - if (_ret != ACL_SUCCESS) { - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", - "aclrtResetDevice(deviceId)", (int)_ret, __FILE__, __LINE__); - } - } - if (aclInited) { - const aclError _ret = aclFinalize(); - if (_ret != ACL_SUCCESS) { - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", - "aclFinalize()", (int)_ret, __FILE__, __LINE__); - } - } - - return rc; -} diff --git a/test/vpto/cases/micro-op/vector-load-store/vstur-init-align-outside-loop/compare.py b/test/vpto/cases/micro-op/vector-load-store/vstur-init-align-outside-loop/compare.py deleted file mode 100644 index fde3a5229..000000000 --- a/test/vpto/cases/micro-op/vector-load-store/vstur-init-align-outside-loop/compare.py +++ /dev/null @@ -1,112 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) 2026 Huawei Technologies Co., Ltd. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. 
-# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. - -# case: micro-op/vector-load-store/vstur-init-align-outside-loop -# family: vector-load-store -# target_ops: pto.vstur -# scenarios: core-f32, full-mask, unaligned, state-update, init-align-outside-loop -# coding=utf-8 - -import os -import sys -import numpy as np - - -def compare_bin(golden_path, output_path, dtype, eps): - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - dtype_np = np.dtype(dtype) - golden = np.fromfile(golden_path, dtype=dtype_np) - output = np.fromfile(output_path, dtype=dtype_np) - if golden.shape != output.shape: - print(f"[ERROR] Shape mismatch: {golden_path} {golden.shape} vs {output_path} {output.shape}") - return False - if not np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True): - if golden.size: - g = golden.astype(np.float64, copy=False) - o = output.astype(np.float64, copy=False) - abs_diff = np.abs(g - o) - idx = int(np.argmax(abs_diff)) - diff = float(abs_diff[idx]) - print( - f"[ERROR] Mismatch: {golden_path} vs {output_path}, max diff={diff} at idx={idx} " - f"(golden={g[idx]}, out={o[idx]}, dtype={dtype_np})" - ) - else: - print(f"[ERROR] Mismatch: {golden_path} vs {output_path}, empty buffers, dtype={dtype_np}") - return False - return True - - -def compare_bin_window(golden_path, output_path, dtype, eps, offset, count): - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - try: - offset = int(offset) - count = 
int(count) - except Exception: - print(f"[ERROR] Invalid compare window: offset={offset} count={count}") - return False - if offset < 0 or count <= 0: - print(f"[ERROR] Invalid compare window: offset={offset} count={count}") - return False - - dtype_np = np.dtype(dtype) - golden = np.fromfile(golden_path, dtype=dtype_np) - output = np.fromfile(output_path, dtype=dtype_np) - end = offset + count - if golden.size < end or output.size < end: - print( - f"[ERROR] Compare window out of range: offset={offset} count={count}, " - f"golden={golden.size}, out={output.size}" - ) - return False - - golden_sel = golden[offset:end] - output_sel = output[offset:end] - if not np.allclose(golden_sel, output_sel, atol=eps, rtol=eps, equal_nan=True): - if golden_sel.size: - g = golden_sel.astype(np.float64, copy=False) - o = output_sel.astype(np.float64, copy=False) - abs_diff = np.abs(g - o) - idx = int(np.argmax(abs_diff)) - diff = float(abs_diff[idx]) - print( - f"[ERROR] Mismatch (window): {golden_path} vs {output_path}, max diff={diff} " - f"at idx={offset + idx} (golden={g[idx]}, out={o[idx]}, dtype={dtype_np}, " - f"offset={offset}, count={count})" - ) - else: - print(f"[ERROR] Mismatch (window): {golden_path} vs {output_path}, empty window, dtype={dtype_np}") - return False - return True - - -def main(): - strict = os.getenv("COMPARE_STRICT", "1") != "0" - ok = compare_bin_window("golden_v2.bin", "v2.bin", np.float32, 0.0001, 1, 8) - if not ok: - if strict: - print("[ERROR] compare failed") - sys.exit(2) - print("[WARN] compare failed (non-gating)") - return - print("[INFO] compare passed") - - -if __name__ == "__main__": - main() diff --git a/test/vpto/cases/micro-op/vector-load-store/vstur-init-align-outside-loop/golden.py b/test/vpto/cases/micro-op/vector-load-store/vstur-init-align-outside-loop/golden.py deleted file mode 100644 index d13ca8097..000000000 --- a/test/vpto/cases/micro-op/vector-load-store/vstur-init-align-outside-loop/golden.py +++ /dev/null @@ -1,52 +0,0 
@@ -#!/usr/bin/env python3 -# Copyright (c) 2026 Huawei Technologies Co., Ltd. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. - -# case: micro-op/vector-load-store/vstur-init-align-outside-loop -# family: vector-load-store -# target_ops: pto.vstur -# scenarios: core-f32, predicate-squeezed, unaligned, state-update, init-align-outside-loop -# coding=utf-8 - -import argparse -from pathlib import Path - -import numpy as np - - -ROWS = 32 -COLS = 32 -ACTIVE_LANES = 8 -SEED = 19 - - -def generate(output_dir: Path, seed: int) -> None: - rng = np.random.default_rng(seed) - v1 = rng.uniform(-8.0, 8.0, size=(ROWS, COLS)).astype(np.float32) - v2 = np.zeros((ROWS, COLS), dtype=np.float32) - golden_v2 = np.zeros((ROWS, COLS), dtype=np.float32) - golden_v2.reshape(-1)[1 : 1 + ACTIVE_LANES] = v1.reshape(-1)[:ACTIVE_LANES] - - output_dir.mkdir(parents=True, exist_ok=True) - v1.reshape(-1).tofile(output_dir / "v1.bin") - v2.reshape(-1).tofile(output_dir / "v2.bin") - golden_v2.reshape(-1).tofile(output_dir / "golden_v2.bin") - - -def main() -> None: - parser = argparse.ArgumentParser( - description="Generate numpy-based inputs/golden for VPTO micro-op vstur-init-align-outside-loop validation." 
- ) - parser.add_argument("--output-dir", type=Path, default=Path(".")) - parser.add_argument("--seed", type=int, default=SEED) - args = parser.parse_args() - generate(args.output_dir, args.seed) - - -if __name__ == "__main__": - main() diff --git a/test/vpto/cases/micro-op/vector-load-store/vstur-init-align-outside-loop/kernel.pto b/test/vpto/cases/micro-op/vector-load-store/vstur-init-align-outside-loop/kernel.pto deleted file mode 100644 index 0b07a956c..000000000 --- a/test/vpto/cases/micro-op/vector-load-store/vstur-init-align-outside-loop/kernel.pto +++ /dev/null @@ -1,53 +0,0 @@ -// ----------------------------------------------------------------------------- -// case: micro-op/vector-load-store/vstur-init-align-outside-loop -// family: vector-load-store -// target_ops: pto.vstur -// scenarios: core-f32, predicate-squeezed, unaligned, state-update, init-align-outside-loop -// ----------------------------------------------------------------------------- -module attributes {pto.target_arch = "a5", pto.kernel_kind = #pto.kernel_kind} { - func.func @vstur_init_align_outside_loop_kernel_2d(%arg0: !pto.ptr, %arg1: !pto.ptr) attributes {pto.kernel} { - %c1 = arith.constant 1 : index - %c0_i64 = arith.constant 0 : i64 - %c1_i64 = arith.constant 1 : i64 - %c8_i32 = arith.constant 8 : i32 - %c32_i64 = arith.constant 32 : i64 - %c128_i64 = arith.constant 128 : i64 - %c4096_i64 = arith.constant 4096 : i64 - %c0 = arith.constant 0 : index - - %ub_in = pto.castptr %c0_i64 : i64 -> !pto.ptr - %ub_out = pto.castptr %c4096_i64 : i64 -> !pto.ptr - %ub_out1 = pto.addptr %ub_out, %c1 : !pto.ptr -> !pto.ptr - - %false = arith.constant false - pto.mte_gm_ub %arg0, %ub_in, %c0_i64, %c128_i64 - nburst(%c32_i64, %c128_i64, %c128_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 - - pto.set_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] - pto.wait_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] - - pto.vecscope { - pto.sprclr "AR" - %align0 = pto.init_align : !pto.align - %align_final = 
scf.for %offset = %c0 to %c1 step %c1 - iter_args(%align_iter = %align0) -> (!pto.align) { - %vec = pto.vlds %ub_in[%c0] : !pto.ptr -> !pto.vreg<64xf32> - %mask, %unused = pto.plt_b32 %c8_i32 : i32 -> !pto.mask, i32 - %sqz = pto.vsqz %vec, %mask : !pto.vreg<64xf32>, !pto.mask -> !pto.vreg<64xf32> - %align1 = pto.vstur %align_iter, %sqz, %ub_out1, "POST_UPDATE" - : !pto.align, !pto.vreg<64xf32>, !pto.ptr -> !pto.align - scf.yield %align1 : !pto.align - } - pto.vstar %align_final, %ub_out1 : !pto.align, !pto.ptr - } - - pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.mte_ub_gm %ub_out, %arg1, %c128_i64 - nburst(%c32_i64, %c128_i64, %c128_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64 - pto.barrier #pto.pipe - return - } -} diff --git a/test/vpto/cases/micro-op/vector-load-store/vstur-init-align-outside-loop/launch.cpp b/test/vpto/cases/micro-op/vector-load-store/vstur-init-align-outside-loop/launch.cpp deleted file mode 100644 index a56f83c6e..000000000 --- a/test/vpto/cases/micro-op/vector-load-store/vstur-init-align-outside-loop/launch.cpp +++ /dev/null @@ -1,54 +0,0 @@ -// Copyright (c) 2026 Huawei Technologies Co., Ltd. -// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). -// Please refer to the License for details. You may not use this file except in compliance with the License. -// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -// See LICENSE in the root of the software repository for the full text of the License. 
- -// ----------------------------------------------------------------------------- -// case: micro-op/vector-load-store/vstur-init-align-outside-loop -// family: vector-load-store -// target_ops: pto.vstur -// scenarios: core-f32, full-mask, unaligned, state-update, init-align-outside-loop -// NOTE: bulk-generated coverage skeleton. Parser/verifier/lowering failure is -// still a valid test conclusion in the current coverage-first phase. -// ----------------------------------------------------------------------------- -#ifndef __VEC_SCOPE__ -#define __VEC_SCOPE__ -#endif - -#if defined(__CCE_AICORE__) && defined(__NPU_ARCH__) && (__NPU_ARCH__ == 2201) -typedef struct { unsigned char v; } hifloat8_t; -typedef struct { unsigned char v; } float8_e4m3_t; -typedef struct { unsigned char v; } float8_e5m2_t; -typedef struct { unsigned char v; } float8_e8m0_t; -typedef struct { unsigned char v; } float4_e1m2x2_t; -typedef struct { unsigned char v; } float4_e2m1x2_t; -#endif -#include - -#if defined(__CCE_AICORE__) && defined(PTOAS_ENABLE_CCE_PRINT) -#include -#endif - -#if !defined(__CCE_AICORE__) && !defined(TMRGSORT_HPP) -struct MrgSortExecutedNumList { - uint16_t mrgSortList0; - uint16_t mrgSortList1; - uint16_t mrgSortList2; - uint16_t mrgSortList3; -}; -#endif -#ifndef __CPU_SIM -#include "acl/acl.h" -#endif - -extern "C" __global__ [aicore] void -vstur_init_align_outside_loop_kernel_2d(__gm__ float *v1, __gm__ float *v2); - -void LaunchVstur_init_align_outside_loop_kernel_2d(float *v1, float *v2, - void *stream) { - vstur_init_align_outside_loop_kernel_2d<<<1, nullptr, stream>>>( - (__gm__ float *)v1, (__gm__ float *)v2); -} diff --git a/test/vpto/cases/micro-op/vector-load-store/vstur-init-align-outside-loop/main.cpp b/test/vpto/cases/micro-op/vector-load-store/vstur-init-align-outside-loop/main.cpp deleted file mode 100644 index 486ca9862..000000000 --- a/test/vpto/cases/micro-op/vector-load-store/vstur-init-align-outside-loop/main.cpp +++ /dev/null @@ -1,129 +0,0 
@@ -// Copyright (c) 2026 Huawei Technologies Co., Ltd. -// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). -// Please refer to the License for details. You may not use this file except in compliance with the License. -// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -// See LICENSE in the root of the software repository for the full text of the License. - -// ----------------------------------------------------------------------------- -// case: micro-op/vector-load-store/vstur-init-align-outside-loop -// family: vector-load-store -// target_ops: pto.vstur -// scenarios: core-f32, full-mask, unaligned, state-update, init-align-outside-loop -// NOTE: bulk-generated coverage skeleton. Parser/verifier/lowering failure is -// still a valid test conclusion in the current coverage-first phase. -// ----------------------------------------------------------------------------- -/** -Copyright (c) 2025 Huawei Technologies Co., Ltd. -This program is free software, you can redistribute it and/or modify it under the terms and conditions of -CANN Open Software License Agreement Version 2.0 (the "License"). -Please refer to the License for details. You may not use this file except in compliance with the License. -THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -See LICENSE in the root of the software repository for the full text of the License. 
-*/ - -#include "test_common.h" -#include "acl/acl.h" -#include -#include -#include - -using namespace PtoTestCommon; - -#ifndef TMRGSORT_HPP -struct MrgSortExecutedNumList { - uint16_t mrgSortList0; - uint16_t mrgSortList1; - uint16_t mrgSortList2; - uint16_t mrgSortList3; -}; -#endif - -#define ACL_CHECK(expr) \ - do { \ - const aclError _ret = (expr); \ - if (_ret != ACL_SUCCESS) { \ - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", #expr, (int)_ret, __FILE__, __LINE__); \ - const char *_recent = aclGetRecentErrMsg(); \ - if (_recent != nullptr && _recent[0] != '\0') { \ - std::fprintf(stderr, "[ERROR] RecentErrMsg: %s\n", _recent); \ - } \ - rc = 1; \ - goto cleanup; \ - } \ - } while (0) - -void LaunchVstur_init_align_outside_loop_kernel_2d(float *v1, float *v2, - void *stream); - -int main() { - size_t elemCount_v1 = 1024; - size_t fileSize_v1 = elemCount_v1 * sizeof(float); - size_t elemCount_v2 = 1024; - size_t fileSize_v2 = elemCount_v2 * sizeof(float); - float *v1Host = nullptr; - float *v1Device = nullptr; - float *v2Host = nullptr; - float *v2Device = nullptr; - - int rc = 0; - bool aclInited = false; - bool deviceSet = false; - int deviceId = 0; - aclrtStream stream = nullptr; - - ACL_CHECK(aclInit(nullptr)); - aclInited = true; - if (const char *envDevice = std::getenv("ACL_DEVICE_ID")) { - deviceId = std::atoi(envDevice); - } - ACL_CHECK(aclrtSetDevice(deviceId)); - deviceSet = true; - ACL_CHECK(aclrtCreateStream(&stream)); - - ACL_CHECK(aclrtMallocHost((void **)(&v1Host), fileSize_v1)); - ACL_CHECK(aclrtMallocHost((void **)(&v2Host), fileSize_v2)); - ACL_CHECK(aclrtMalloc((void **)&v1Device, fileSize_v1, ACL_MEM_MALLOC_HUGE_FIRST)); - ACL_CHECK(aclrtMalloc((void **)&v2Device, fileSize_v2, ACL_MEM_MALLOC_HUGE_FIRST)); - - ReadFile("./v1.bin", fileSize_v1, v1Host, fileSize_v1); - ReadFile("./v2.bin", fileSize_v2, v2Host, fileSize_v2); - ACL_CHECK(aclrtMemcpy(v1Device, fileSize_v1, v1Host, fileSize_v1, ACL_MEMCPY_HOST_TO_DEVICE)); - 
ACL_CHECK(aclrtMemcpy(v2Device, fileSize_v2, v2Host, fileSize_v2, ACL_MEMCPY_HOST_TO_DEVICE)); - LaunchVstur_init_align_outside_loop_kernel_2d(v1Device, v2Device, stream); - - ACL_CHECK(aclrtSynchronizeStream(stream)); - ACL_CHECK(aclrtMemcpy(v2Host, fileSize_v2, v2Device, fileSize_v2, ACL_MEMCPY_DEVICE_TO_HOST)); - WriteFile("./v2.bin", v2Host, fileSize_v2); - -cleanup: - aclrtFree(v1Device); - aclrtFree(v2Device); - aclrtFreeHost(v1Host); - aclrtFreeHost(v2Host); - if (stream != nullptr) { - const aclError _ret = aclrtDestroyStream(stream); - if (_ret != ACL_SUCCESS) { - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", - "aclrtDestroyStream(stream)", (int)_ret, __FILE__, __LINE__); - } - stream = nullptr; - } - if (deviceSet) { - const aclError _ret = aclrtResetDevice(deviceId); - if (_ret != ACL_SUCCESS) { - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", - "aclrtResetDevice(deviceId)", (int)_ret, __FILE__, __LINE__); - } - } - if (aclInited) { - const aclError _ret = aclFinalize(); - if (_ret != ACL_SUCCESS) { - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", - "aclFinalize()", (int)_ret, __FILE__, __LINE__); - } - } - return rc; -} diff --git a/test/vpto/cases/micro-op/vector-load-store/vstur/compare.py b/test/vpto/cases/micro-op/vector-load-store/vstur/compare.py deleted file mode 100755 index 80b4dab8a..000000000 --- a/test/vpto/cases/micro-op/vector-load-store/vstur/compare.py +++ /dev/null @@ -1,257 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) 2026 Huawei Technologies Co., Ltd. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. 
-# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. - -# case: micro-op/vector-load-store/vstur -# family: vector-load-store -# target_ops: pto.vstur -# scenarios: core-f32, full-mask, unaligned, state-update -# NOTE: bulk-generated coverage skeleton. -# coding=utf-8 - -import os -import sys -import numpy as np - - -REPEAT_BYTES = 256 - - -def _ceil_div(x, y): - return (x + y - 1) // y - - -def _packed_pred_storage_bytes(logical_elems, src_elem_bytes): - logical_elems = int(logical_elems) - src_elem_bytes = int(src_elem_bytes) - if logical_elems <= 0: - raise ValueError(f"logical_elems must be > 0, got {logical_elems}") - if src_elem_bytes not in (1, 2, 4): - raise ValueError(f"unsupported packed predicate source size: {src_elem_bytes}") - - repeat_elems = REPEAT_BYTES // src_elem_bytes - if src_elem_bytes == 4: - repeat_times = _ceil_div(logical_elems, repeat_elems) + 1 - loop_count = repeat_times // 2 - return loop_count * 16 - - repeat_times = _ceil_div(logical_elems, repeat_elems) - return repeat_times * (repeat_elems // 8) - - -def compare_bin(golden_path, output_path, dtype, eps): - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - dtype_np = np.dtype(dtype) - golden = np.fromfile(golden_path, dtype=dtype_np) - output = np.fromfile(output_path, dtype=dtype_np) - if golden.shape != output.shape: - print(f"[ERROR] Shape mismatch: {golden_path} {golden.shape} vs {output_path} {output.shape}") - return False - if not np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True): - if golden.size: - if np.issubdtype(dtype_np, np.floating): - g = 
golden.astype(np.float64, copy=False) - o = output.astype(np.float64, copy=False) - elif np.issubdtype(dtype_np, np.integer) or np.issubdtype(dtype_np, np.unsignedinteger): - g = golden.astype(np.int64, copy=False) - o = output.astype(np.int64, copy=False) - else: - g = golden.astype(np.float64, copy=False) - o = output.astype(np.float64, copy=False) - abs_diff = np.abs(g - o) - idx = int(np.argmax(abs_diff)) - diff = float(abs_diff[idx]) - print( - f"[ERROR] Mismatch: {golden_path} vs {output_path}, max diff={diff} at idx={idx} " - f"(golden={g[idx]}, out={o[idx]}, dtype={dtype_np})" - ) - else: - print(f"[ERROR] Mismatch: {golden_path} vs {output_path}, empty buffers, dtype={dtype_np}") - return False - return True - - -def compare_bin_prefix(golden_path, output_path, dtype, eps, count): - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - try: - count = int(count) - except Exception: - print(f"[ERROR] Invalid prefix count: {count}") - return False - if count <= 0: - print(f"[ERROR] Invalid prefix count: {count}") - return False - - dtype_np = np.dtype(dtype) - golden = np.fromfile(golden_path, dtype=dtype_np, count=count) - output = np.fromfile(output_path, dtype=dtype_np, count=count) - - if golden.size != count or output.size != count: - print( - f"[ERROR] Prefix read too small: need={count} elems, " - f"golden={golden.size}, out={output.size}" - ) - return False - - if not np.allclose(golden, output, atol=eps, rtol=eps, equal_nan=True): - if golden.size: - if np.issubdtype(dtype_np, np.floating): - g = golden.astype(np.float64, copy=False) - o = output.astype(np.float64, copy=False) - elif np.issubdtype(dtype_np, np.integer) or np.issubdtype(dtype_np, np.unsignedinteger): - g = golden.astype(np.int64, copy=False) - o = output.astype(np.int64, copy=False) - else: - g = golden.astype(np.float64, copy=False) - 
o = output.astype(np.float64, copy=False) - abs_diff = np.abs(g - o) - idx = int(np.argmax(abs_diff)) - diff = float(abs_diff[idx]) - print( - f"[ERROR] Mismatch (prefix): {golden_path} vs {output_path}, max diff={diff} at idx={idx} " - f"(golden={g[idx]}, out={o[idx]}, dtype={dtype_np}, count={count})" - ) - else: - print(f"[ERROR] Mismatch (prefix): {golden_path} vs {output_path}, empty buffers, dtype={dtype_np}") - return False - return True - - -def compare_bin_window(golden_path, output_path, dtype, eps, offset, count): - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - try: - offset = int(offset) - count = int(count) - except Exception: - print(f"[ERROR] Invalid compare window: offset={offset} count={count}") - return False - if offset < 0 or count <= 0: - print(f"[ERROR] Invalid compare window: offset={offset} count={count}") - return False - - dtype_np = np.dtype(dtype) - golden = np.fromfile(golden_path, dtype=dtype_np) - output = np.fromfile(output_path, dtype=dtype_np) - end = offset + count - if golden.size < end or output.size < end: - print( - f"[ERROR] Compare window out of range: offset={offset} count={count}, " - f"golden={golden.size}, out={output.size}" - ) - return False - - golden_sel = golden[offset:end] - output_sel = output[offset:end] - if not np.allclose(golden_sel, output_sel, atol=eps, rtol=eps, equal_nan=True): - if golden_sel.size: - g = golden_sel.astype(np.float64, copy=False) - o = output_sel.astype(np.float64, copy=False) - abs_diff = np.abs(g - o) - idx = int(np.argmax(abs_diff)) - diff = float(abs_diff[idx]) - print( - f"[ERROR] Mismatch (window): {golden_path} vs {output_path}, max diff={diff} " - f"at idx={offset + idx} (golden={g[idx]}, out={o[idx]}, dtype={dtype_np}, " - f"offset={offset}, count={count})" - ) - else: - print(f"[ERROR] Mismatch (window): {golden_path} vs 
{output_path}, empty window, dtype={dtype_np}") - return False - return True - - -def compare_packed_pred_mask(golden_path, output_path, logical_elems, src_elem_bytes): - """ - Compare outputs of pto.tcmp / pto.tcmps. - - PTO-ISA stores packed predicate results as a linear PK byte stream via - `psts`, with the exact written prefix length determined by the typed - TCMP/TCMPS repeat schedule. Compare only that semantic prefix. - """ - if not os.path.exists(output_path): - print(f"[ERROR] Output missing: {output_path}") - return False - if not os.path.exists(golden_path): - print(f"[ERROR] Golden missing: {golden_path}") - return False - try: - logical_elems = int(logical_elems) - src_elem_bytes = int(src_elem_bytes) - except Exception: - print( - "[ERROR] Invalid packed mask compare arguments: " - f"logical_elems={logical_elems} src_elem_bytes={src_elem_bytes}" - ) - return False - if logical_elems <= 0 or src_elem_bytes <= 0: - print( - "[ERROR] Invalid packed mask compare arguments: " - f"logical_elems={logical_elems} src_elem_bytes={src_elem_bytes}" - ) - return False - - golden = np.fromfile(golden_path, dtype=np.uint8) - output = np.fromfile(output_path, dtype=np.uint8) - try: - prefix_bytes = _packed_pred_storage_bytes(logical_elems, src_elem_bytes) - except ValueError as exc: - print(f"[ERROR] {exc}") - return False - - if golden.size < prefix_bytes or output.size < prefix_bytes: - print( - f"[ERROR] Packed mask buffer too small: need={prefix_bytes} bytes, " - f"golden={golden.size}, out={output.size}" - ) - return False - - golden_sel = golden[:prefix_bytes] - output_sel = output[:prefix_bytes] - - if not np.array_equal(golden_sel, output_sel): - diff = np.nonzero(golden_sel != output_sel)[0] - idx = int(diff[0]) if diff.size else 0 - print( - f"[ERROR] Mismatch (packed mask): {golden_path} vs {output_path}, first diff at idx={idx} " - f"(golden={int(golden_sel[idx])}, out={int(output_sel[idx])}, " - f"logical_elems={logical_elems}, 
src_elem_bytes={src_elem_bytes}, prefix_bytes={prefix_bytes})" - ) - return False - return True - - -def main(): - strict = os.getenv("COMPARE_STRICT", "1") != "0" - ok = True - ok = compare_bin_window("golden_v2.bin", "v2.bin", np.float32, 0.0001, 1, 8) and ok - if not ok: - if strict: - print("[ERROR] compare failed") - sys.exit(2) - print("[WARN] compare failed (non-gating)") - return - print("[INFO] compare passed") - - -if __name__ == "__main__": - main() diff --git a/test/vpto/cases/micro-op/vector-load-store/vstur/golden.py b/test/vpto/cases/micro-op/vector-load-store/vstur/golden.py deleted file mode 100755 index 96b3c4030..000000000 --- a/test/vpto/cases/micro-op/vector-load-store/vstur/golden.py +++ /dev/null @@ -1,52 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) 2026 Huawei Technologies Co., Ltd. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. 
- -# case: micro-op/vector-load-store/vstur -# family: vector-load-store -# target_ops: pto.vstur -# scenarios: core-f32, predicate-squeezed, unaligned, state-update -# coding=utf-8 - -import argparse -from pathlib import Path - -import numpy as np - - -ROWS = 32 -COLS = 32 -ACTIVE_LANES = 8 -SEED = 19 - - -def generate(output_dir: Path, seed: int) -> None: - rng = np.random.default_rng(seed) - v1 = rng.uniform(-8.0, 8.0, size=(ROWS, COLS)).astype(np.float32) - v2 = np.zeros((ROWS, COLS), dtype=np.float32) - golden_v2 = np.zeros((ROWS, COLS), dtype=np.float32) - golden_v2.reshape(-1)[1 : 1 + ACTIVE_LANES] = v1.reshape(-1)[:ACTIVE_LANES] - - output_dir.mkdir(parents=True, exist_ok=True) - v1.reshape(-1).tofile(output_dir / "v1.bin") - v2.reshape(-1).tofile(output_dir / "v2.bin") - golden_v2.reshape(-1).tofile(output_dir / "golden_v2.bin") - - -def main() -> None: - parser = argparse.ArgumentParser( - description="Generate numpy-based inputs/golden for VPTO micro-op vstur validation." - ) - parser.add_argument("--output-dir", type=Path, default=Path(".")) - parser.add_argument("--seed", type=int, default=SEED) - args = parser.parse_args() - generate(args.output_dir, args.seed) - - -if __name__ == "__main__": - main() diff --git a/test/vpto/cases/micro-op/vector-load-store/vstur/kernel.pto b/test/vpto/cases/micro-op/vector-load-store/vstur/kernel.pto deleted file mode 100644 index 52ffc13c3..000000000 --- a/test/vpto/cases/micro-op/vector-load-store/vstur/kernel.pto +++ /dev/null @@ -1,61 +0,0 @@ -// ----------------------------------------------------------------------------- -// case: micro-op/vector-load-store/vstur -// family: vector-load-store -// target_ops: pto.vstur -// scenarios: core-f32, predicate-squeezed, unaligned, state-update -// ----------------------------------------------------------------------------- -// Validate the standalone `vstur` surface with its required SQZN producer. -// The case keeps the sequence minimal: -// 1. 
load one vector from `%ub_in` -// 2. generate a small predicate and squeeze the vector to prime `SPR SQZN` -// 3. prime one store-state carrier from `%ub_out` -// 4. issue one `pto.vstur ... "POST_UPDATE"` -// 5. flush the residual state with `pto.vstar` -// This preserves the testcase goal around unaligned store state update without -// fabricating extra semantics beyond the installed A5 wrapper contract. - -module attributes {pto.target_arch = "a5", pto.kernel_kind = #pto.kernel_kind} { - func.func @vstur_kernel_2d(%arg0: !pto.ptr, %arg1: !pto.ptr) attributes {pto.kernel} { - %c1 = arith.constant 1 : index - %c0_i64 = arith.constant 0 : i64 - %c1_i64 = arith.constant 1 : i64 - %c8_i32 = arith.constant 8 : i32 - %c32_i64 = arith.constant 32 : i64 - %c128_i64 = arith.constant 128 : i64 - %c4096_i64 = arith.constant 4096 : i64 - %c0 = arith.constant 0 : index - - %ub_in = pto.castptr %c0_i64 : i64 -> !pto.ptr - %ub_out = pto.castptr %c4096_i64 : i64 -> !pto.ptr - %ub_out1 = pto.addptr %ub_out, %c1 : !pto.ptr -> !pto.ptr - - %false = arith.constant false - pto.mte_gm_ub %arg0, %ub_in, %c0_i64, %c128_i64 - nburst(%c32_i64, %c128_i64, %c128_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64, i64 - - pto.set_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] - pto.wait_flag["PIPE_MTE2", "PIPE_V", "EVENT_ID0"] - - pto.vecscope { - pto.sprclr "AR" - scf.for %offset = %c0 to %c1 step %c1 { - %vec = pto.vlds %ub_in[%c0] : !pto.ptr -> !pto.vreg<64xf32> - %mask, %unused = pto.plt_b32 %c8_i32 : i32 -> !pto.mask, i32 - %sqz = pto.vsqz %vec, %mask : !pto.vreg<64xf32>, !pto.mask -> !pto.vreg<64xf32> - %align0 = pto.init_align : !pto.align - %align1 = pto.vstur %align0, %sqz, %ub_out1, "POST_UPDATE" - : !pto.align, !pto.vreg<64xf32>, !pto.ptr -> !pto.align - pto.vstar %align1, %ub_out1 : !pto.align, !pto.ptr - } - } - - pto.set_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.wait_flag["PIPE_V", "PIPE_MTE3", "EVENT_ID0"] - pto.mte_ub_gm %ub_out, %arg1, %c128_i64 - nburst(%c32_i64, %c128_i64, 
%c128_i64) - : !pto.ptr, !pto.ptr, i64, i64, i64, i64 - pto.barrier #pto.pipe - return - } -} diff --git a/test/vpto/cases/micro-op/vector-load-store/vstur/launch.cpp b/test/vpto/cases/micro-op/vector-load-store/vstur/launch.cpp deleted file mode 100644 index b0c69d79c..000000000 --- a/test/vpto/cases/micro-op/vector-load-store/vstur/launch.cpp +++ /dev/null @@ -1,70 +0,0 @@ -// Copyright (c) 2026 Huawei Technologies Co., Ltd. -// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). -// Please refer to the License for details. You may not use this file except in compliance with the License. -// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -// See LICENSE in the root of the software repository for the full text of the License. - -// ----------------------------------------------------------------------------- -// case: micro-op/vector-load-store/vstur -// family: vector-load-store -// target_ops: pto.vstur -// scenarios: core-f32, full-mask, unaligned, state-update -// NOTE: bulk-generated coverage skeleton. Parser/verifier/lowering failure is -// still a valid test conclusion in the current coverage-first phase. -// ----------------------------------------------------------------------------- -// --------------------------------------------------------------------------- -// PTOAS compatibility layer -// -// The upstream pto-isa headers reference some FP8/FP4 types and the -// __VEC_SCOPE__ marker that are not available on every AICore arch/toolchain -// combination (e.g. __NPU_ARCH__==2201). -// -// For our PTOAS-generated kernels we don't rely on these types today, but the -// headers still mention them in templates/static_asserts. 
Provide minimal -// fallbacks to keep compilation working on dav-c220. -// --------------------------------------------------------------------------- -#ifndef __VEC_SCOPE__ -#define __VEC_SCOPE__ -#endif - -#if defined(__CCE_AICORE__) && defined(__NPU_ARCH__) && (__NPU_ARCH__ == 2201) -typedef struct { unsigned char v; } hifloat8_t; -typedef struct { unsigned char v; } float8_e4m3_t; -typedef struct { unsigned char v; } float8_e5m2_t; -typedef struct { unsigned char v; } float8_e8m0_t; -typedef struct { unsigned char v; } float4_e1m2x2_t; -typedef struct { unsigned char v; } float4_e2m1x2_t; -#endif -#include - -// AICore printf support is gated behind `--cce-enable-print` on some -// toolchains. When enabled, include the CCE print header so `cce::printf` -// resolves in device compilation. -#if defined(__CCE_AICORE__) && defined(PTOAS_ENABLE_CCE_PRINT) -#include -#endif - -// Some PTO-ISA types are only available in the __CCE_AICORE__ compilation -// path, but `bisheng -xcce` still performs a host-side parse pass. -// Provide minimal fallbacks only when the corresponding header wasn't -// pulled in by the selected arch implementation. -#if !defined(__CCE_AICORE__) && !defined(TMRGSORT_HPP) -struct MrgSortExecutedNumList { - uint16_t mrgSortList0; - uint16_t mrgSortList1; - uint16_t mrgSortList2; - uint16_t mrgSortList3; -}; -#endif -#ifndef __CPU_SIM -#include "acl/acl.h" -#endif - -extern "C" __global__ [aicore] void vstur_kernel_2d(__gm__ float *v1, - __gm__ float *v2); - -void LaunchVstur_kernel_2d(float *v1, float *v2, void *stream) { - vstur_kernel_2d<<<1, nullptr, stream>>>((__gm__ float *)v1, (__gm__ float *)v2); -} diff --git a/test/vpto/cases/micro-op/vector-load-store/vstur/main.cpp b/test/vpto/cases/micro-op/vector-load-store/vstur/main.cpp deleted file mode 100644 index 273fb30c4..000000000 --- a/test/vpto/cases/micro-op/vector-load-store/vstur/main.cpp +++ /dev/null @@ -1,130 +0,0 @@ -// Copyright (c) 2026 Huawei Technologies Co., Ltd. 
-// This program is free software, you can redistribute it and/or modify it under the terms and conditions of -// CANN Open Software License Agreement Version 2.0 (the "License"). -// Please refer to the License for details. You may not use this file except in compliance with the License. -// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -// See LICENSE in the root of the software repository for the full text of the License. - -// ----------------------------------------------------------------------------- -// case: micro-op/vector-load-store/vstur -// family: vector-load-store -// target_ops: pto.vstur -// scenarios: core-f32, full-mask, unaligned, state-update -// NOTE: bulk-generated coverage skeleton. Parser/verifier/lowering failure is -// still a valid test conclusion in the current coverage-first phase. -// ----------------------------------------------------------------------------- -/** -Copyright (c) 2025 Huawei Technologies Co., Ltd. -This program is free software, you can redistribute it and/or modify it under the terms and conditions of -CANN Open Software License Agreement Version 2.0 (the "License"). -Please refer to the License for details. You may not use this file except in compliance with the License. -THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -See LICENSE in the root of the software repository for the full text of the License. 
-*/ - -#include "test_common.h" -#include "acl/acl.h" -#include -#include -#include - -using namespace PtoTestCommon; - -#ifndef TMRGSORT_HPP -struct MrgSortExecutedNumList { - uint16_t mrgSortList0; - uint16_t mrgSortList1; - uint16_t mrgSortList2; - uint16_t mrgSortList3; -}; -#endif - -#define ACL_CHECK(expr) \ - do { \ - const aclError _ret = (expr); \ - if (_ret != ACL_SUCCESS) { \ - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", #expr, (int)_ret, __FILE__, __LINE__); \ - const char *_recent = aclGetRecentErrMsg(); \ - if (_recent != nullptr && _recent[0] != '\0') { \ - std::fprintf(stderr, "[ERROR] RecentErrMsg: %s\n", _recent); \ - } \ - rc = 1; \ - goto cleanup; \ - } \ - } while (0) - -void LaunchVstur_kernel_2d(float *v1, float *v2, void *stream); - -int main() { - size_t elemCount_v1 = 1024; - size_t fileSize_v1 = elemCount_v1 * sizeof(float); - size_t elemCount_v2 = 1024; - size_t fileSize_v2 = elemCount_v2 * sizeof(float); - float *v1Host = nullptr; - float *v1Device = nullptr; - float *v2Host = nullptr; - float *v2Device = nullptr; - - int rc = 0; - bool aclInited = false; - bool deviceSet = false; - int deviceId = 0; - aclrtStream stream = nullptr; - - ACL_CHECK(aclInit(nullptr)); - aclInited = true; - if (const char *envDevice = std::getenv("ACL_DEVICE_ID")) { - deviceId = std::atoi(envDevice); - } - ACL_CHECK(aclrtSetDevice(deviceId)); - deviceSet = true; - ACL_CHECK(aclrtCreateStream(&stream)); - - ACL_CHECK(aclrtMallocHost((void **)(&v1Host), fileSize_v1)); - ACL_CHECK(aclrtMallocHost((void **)(&v2Host), fileSize_v2)); - ACL_CHECK(aclrtMalloc((void **)&v1Device, fileSize_v1, ACL_MEM_MALLOC_HUGE_FIRST)); - ACL_CHECK(aclrtMalloc((void **)&v2Device, fileSize_v2, ACL_MEM_MALLOC_HUGE_FIRST)); - - ReadFile("./v1.bin", fileSize_v1, v1Host, fileSize_v1); - ReadFile("./v2.bin", fileSize_v2, v2Host, fileSize_v2); - ACL_CHECK(aclrtMemcpy(v1Device, fileSize_v1, v1Host, fileSize_v1, ACL_MEMCPY_HOST_TO_DEVICE)); - ACL_CHECK(aclrtMemcpy(v2Device, 
fileSize_v2, v2Host, fileSize_v2, ACL_MEMCPY_HOST_TO_DEVICE)); - LaunchVstur_kernel_2d(v1Device, v2Device, stream); - - ACL_CHECK(aclrtSynchronizeStream(stream)); - ACL_CHECK(aclrtMemcpy(v2Host, fileSize_v2, v2Device, fileSize_v2, ACL_MEMCPY_DEVICE_TO_HOST)); - - WriteFile("./v2.bin", v2Host, fileSize_v2); - -cleanup: - aclrtFree(v1Device); - aclrtFree(v2Device); - aclrtFreeHost(v1Host); - aclrtFreeHost(v2Host); - if (stream != nullptr) { - const aclError _ret = aclrtDestroyStream(stream); - if (_ret != ACL_SUCCESS) { - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", - "aclrtDestroyStream(stream)", (int)_ret, __FILE__, __LINE__); - } - stream = nullptr; - } - if (deviceSet) { - const aclError _ret = aclrtResetDevice(deviceId); - if (_ret != ACL_SUCCESS) { - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", - "aclrtResetDevice(deviceId)", (int)_ret, __FILE__, __LINE__); - } - } - if (aclInited) { - const aclError _ret = aclFinalize(); - if (_ret != ACL_SUCCESS) { - std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", - "aclFinalize()", (int)_ret, __FILE__, __LINE__); - } - } - - return rc; -} diff --git a/test/vpto/scripts/run_host_vpto_validation_parallel.sh b/test/vpto/scripts/run_host_vpto_validation_parallel.sh index a3b9a6bdb..98be7d669 100755 --- a/test/vpto/scripts/run_host_vpto_validation_parallel.sh +++ b/test/vpto/scripts/run_host_vpto_validation_parallel.sh @@ -168,6 +168,9 @@ log "CASE_NAME=${CASE_NAME:-}" | tee -a "${RUNNER_LOG}" log "CASE_PREFIX=${CASE_PREFIX:-}" | tee -a "${RUNNER_LOG}" log "JOBS=${JOBS}" | tee -a "${RUNNER_LOG}" log "TOTAL_CASES=${#CASES[@]}" | tee -a "${RUNNER_LOG}" +if [[ -n "${SIM_LIB_DIR:-}" ]]; then + log "SIM_LIB_DIR=${SIM_LIB_DIR}" | tee -a "${RUNNER_LOG}" +fi next_index=0 while [[ "${next_index}" -lt "${#CASES[@]}" || "${#PID_TO_CASE[@]}" -gt 0 ]]; do